### MAIN_GENRE and Scraping
We retrieve the genre information from the Rotten Tomatoes website since it is well-known and if frequently updated. Before scraping, we inspected the website using /robots.txt. We then proceeded scraping the information while complying with the instructions posted on the page.

The code used to scrape can be found in web_crawler.py

In [15]:
# pip install selenium
pip install unidecode

--- Logging error ---
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1673, in print
    extend(render(renderable, render_options))
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1305, in render
    for render_output in iter_render:
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/segment.py", line 249, in split_lines
    for segment in segments:
  File "/Applications/anaconda3/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1283, in render
    renderable = rich_cast(renderable)
  File 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import RocCurveDisplay, recall_score, precision_score, roc_curve, roc_auc_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import matplotlib as mpl

In [3]:
# import for scrapping
from unidecode import unidecode
import web_crawler as wc

In [4]:
df=pd.read_csv('grouped_df.csv')

In [5]:
na_genre = df[df['genre'].isna()]

na_genre_cols = ['type', 'show_title','week']
na_genre = na_genre[na_genre_cols]
print("shape of na_genre initially: ", na_genre.shape)

def mapping_category(string):
  ''' 
  Method to find the category which will later be used in url parsing 
  '''
  string = unidecode(string)

  if(string=="Films"):
    string = "m"
  else:
    string = "tv"
  # print(string)
  return string

def mapping_title(name):
  '''
  Method to find the title name which will later be used in url parsing
  '''
  name = unidecode(name)
  name = name.lower()
  #removing all punctuation
  # https://www.geeksforgeeks.org/python-remove-punctuation-from-string/
  punctuation = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  for punctuation in punctuation:
    name = name.replace(punctuation, '')
  list = name.split()
  return "_".join(list)

def alt_mapping_category(string):
  """
  Method to find the category which will later be used in url parsing just in case 
  given category was incorrectly listed 
  """

  #in case the type was incorrectly listed 
  string = unidecode(string)
  if(string=="m"):
    string = "tv"
  else:
    string = "m"
  # print(string)
  return string

def mapping_year(week):
  """
  Method to find the release year which will later be used in url parsing in the
  case that shows have duplicate names 
  """
  return week[:4]

na_genre['type'] = na_genre['type'].map(mapping_category)

na_genre['show_title'] = na_genre['show_title'].map(mapping_title)

na_genre['type2'] = na_genre['type'].map(alt_mapping_category)

na_genre['year'] = na_genre['week'].map(mapping_year)

na_genre["url"] = "https://www.rottentomatoes.com/" + na_genre["type"]+"/"+na_genre['show_title']+"_"+na_genre['year'] #first try

na_genre["url2"] = "https://www.rottentomatoes.com/" + na_genre["type"]+"/"+na_genre['show_title'] #second try 

na_genre["url3"] = "https://www.rottentomatoes.com/" + na_genre["type2"]+"/"+na_genre['show_title']+"_"+na_genre['year'] #third try

na_genre["url4"] = "https://www.rottentomatoes.com/" + na_genre["type2"]+"/"+na_genre['show_title'] #fourth try 

na_genre = na_genre.drop_duplicates(subset=['show_title'])

pd.set_option('display.max_columns', None)

# na_genre.to_csv("output_filename.csv", index=False, encoding='utf8')

# na_genre.head(50)

shape of na_genre initially:  (1865, 3)


In [6]:
na_genre.reset_index(drop=True)
# import web_crawler as wc

Unnamed: 0,type,show_title,week,type2,year,url,url2,url3,url4
0,m,83,2022-04-03,tv,2022,https://www.rottentomatoes.com/m/83_2022,https://www.rottentomatoes.com/m/83,https://www.rottentomatoes.com/tv/83_2022,https://www.rottentomatoes.com/tv/83
1,m,10_days_of_a_bad_man,2023-08-27,tv,2023,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/tv/10_days_of_a...,https://www.rottentomatoes.com/tv/10_days_of_a...
2,m,10_days_of_a_good_man,2023-03-12,tv,2023,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/tv/10_days_of_a...,https://www.rottentomatoes.com/tv/10_days_of_a...
3,m,1000_miles_from_christmas,2022-01-02,tv,2022,https://www.rottentomatoes.com/m/1000_miles_fr...,https://www.rottentomatoes.com/m/1000_miles_fr...,https://www.rottentomatoes.com/tv/1000_miles_f...,https://www.rottentomatoes.com/tv/1000_miles_f...
4,m,12_strong,2022-07-24,tv,2022,https://www.rottentomatoes.com/m/12_strong_2022,https://www.rottentomatoes.com/m/12_strong,https://www.rottentomatoes.com/tv/12_strong_2022,https://www.rottentomatoes.com/tv/12_strong
...,...,...,...,...,...,...,...,...,...
1860,m,maboroshi,2024-01-21,tv,2024,https://www.rottentomatoes.com/m/maboroshi_2024,https://www.rottentomatoes.com/m/maboroshi,https://www.rottentomatoes.com/tv/maboroshi_2024,https://www.rottentomatoes.com/tv/maboroshi
1861,m,que_viva_mexico,2023-06-04,tv,2023,https://www.rottentomatoes.com/m/que_viva_mexi...,https://www.rottentomatoes.com/m/que_viva_mexico,https://www.rottentomatoes.com/tv/que_viva_mex...,https://www.rottentomatoes.com/tv/que_viva_mexico
1862,m,ijogbon,2023-10-22,tv,2023,https://www.rottentomatoes.com/m/ijogbon_2023,https://www.rottentomatoes.com/m/ijogbon,https://www.rottentomatoes.com/tv/ijogbon_2023,https://www.rottentomatoes.com/tv/ijogbon
1863,m,shb_wl_`zw,2022-02-06,tv,2022,https://www.rottentomatoes.com/m/shb_wl_`zw_2022,https://www.rottentomatoes.com/m/shb_wl_`zw,https://www.rottentomatoes.com/tv/shb_wl_`zw_2022,https://www.rottentomatoes.com/tv/shb_wl_`zw


In [7]:
url_df = na_genre[['url','url2','url3','url4']]
url_df.head()

Unnamed: 0,url,url2,url3,url4
0,https://www.rottentomatoes.com/m/83_2022,https://www.rottentomatoes.com/m/83,https://www.rottentomatoes.com/tv/83_2022,https://www.rottentomatoes.com/tv/83
1,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/tv/10_days_of_a...,https://www.rottentomatoes.com/tv/10_days_of_a...
2,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/m/10_days_of_a_...,https://www.rottentomatoes.com/tv/10_days_of_a...,https://www.rottentomatoes.com/tv/10_days_of_a...
3,https://www.rottentomatoes.com/m/1000_miles_fr...,https://www.rottentomatoes.com/m/1000_miles_fr...,https://www.rottentomatoes.com/tv/1000_miles_f...,https://www.rottentomatoes.com/tv/1000_miles_f...
4,https://www.rottentomatoes.com/m/12_strong_2022,https://www.rottentomatoes.com/m/12_strong,https://www.rottentomatoes.com/tv/12_strong_2022,https://www.rottentomatoes.com/tv/12_strong


In [8]:
scrapped_name, scrapped_cat = wc.automate_data_collection(url_df.to_numpy(), 2)

['need to check!!', 'need to check!!']


In [9]:
scrapped_name

['NA', 'NA']

In [10]:
#Creates a dataframe with genre, show_title, and audience scores for titles with Na values for genre
movie_data = []
i=0
for entry in scrapped_name:
    if entry!= 'NA':
        title, details = entry.split('\n', 1)
        info, tomato_meter, _,_, audience_score, _, _ = details.split('\n')
        comma1=info.find(',')
        comma2=info.rfind(',')
        genre=info[comma1+2:comma2]
        movie_data.append([title, genre, tomato_meter, audience_score])

# Create DataFrame
df = pd.DataFrame(movie_data, columns=['show_title', 'genre', 'tomatometer', 'audience_score'])
# df.to_csv('genreinfo.csv', index=False)


In [11]:
df

Unnamed: 0,show_title,genre,tomatometer,audience_score
