### MAIN_GENRE and Scraping
We retrieve the genre information from the Rotten Tomatoes website, since it is well known and frequently updated. Before scraping, we checked the site's /robots.txt. We then proceeded to scrape the information while complying with the instructions posted there.

The scraping code can be found in `web_crawler.py`.

In [None]:
# Install the third-party packages used by the scraping step.
# unidecode is imported directly below; selenium is presumably required by web_crawler — TODO confirm.
pip install selenium
pip install unidecode

: 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import RocCurveDisplay, recall_score, precision_score, roc_curve, roc_auc_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import matplotlib as mpl

: 

In [None]:
# imports for scraping
from unidecode import unidecode
import web_crawler as wc

: 

In [None]:
# Load the combined dataset; later cells read its 'MAIN_GENRE', 'category',
# 'show_title' and 'week' columns.
comb_df = pd.read_csv('comb_df.csv')

: 

In [None]:
# Rows of comb_df that have no MAIN_GENRE value.
na_genre = comb_df[comb_df['MAIN_GENRE'].isna()]

# One row per distinct show_title among those rows.
g = na_genre.drop_duplicates(subset='show_title')

# Only the last expression of a cell is displayed, so the intermediate
# type()/len() expressions (whose values were discarded) are omitted.
g

: 

In [None]:
# Titles whose MAIN_GENRE is missing, reduced to one row per distinct show_title.
na_genre = comb_df[comb_df['MAIN_GENRE'].isna()]
na_genre = na_genre.drop_duplicates(subset='show_title')

# Keep only the columns needed to build the lookup URLs. The explicit copy
# makes na_genre an independent frame, so assigning new columns to it later
# does not trigger pandas' SettingWithCopyWarning.
na_genre_cols = ['category', 'show_title', 'week']
na_genre = na_genre[na_genre_cols].copy()
print("shape of na_genre initially: ", na_genre.shape)

def mapping_category(string):
    """Translate a dataset category label into a URL path segment.

    The label is ASCII-folded with ``unidecode`` first. Returns ``"m"`` when
    the folded label is exactly ``"Films (English)"`` and ``"tv"`` for every
    other value.
    """
    # NOTE(review): any film label other than "Films (English)" also maps to
    # "tv" here — confirm this is intended (alt_mapping_category provides the
    # opposite segment as a fallback).
    return "m" if unidecode(string) == "Films (English)" else "tv"

def mapping_title(name):
    """Convert a show title into the slug used when building lookup URLs.

    Steps: ASCII-fold the title with ``unidecode``, lower-case it, delete the
    punctuation characters listed below, then join the remaining
    whitespace-separated words with underscores.

    Args:
        name: The show title as a string.

    Returns:
        The slug, e.g. ``"Don't Look Up"`` becomes ``"dont_look_up"``.
    """
    # Raw string so the backslash is a literal character to strip; written as
    # a normal literal, "\," is an invalid escape sequence and emits a
    # SyntaxWarning on recent Python versions.
    punctuation_chars = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    name = unidecode(name).lower()
    # Delete every listed character in a single pass.
    name = name.translate(str.maketrans('', '', punctuation_chars))
    return "_".join(name.split())

def alt_mapping_category(string):
    """Return the opposite URL path segment to the one given.

    Used as a fallback in case the title's category was listed incorrectly:
    ``"m"`` becomes ``"tv"``, and any other value becomes ``"m"``. The input
    is ASCII-folded with ``unidecode`` before the comparison.
    """
    return "tv" if unidecode(string) == "m" else "m"

def mapping_year(week):
    """Return the first four characters of ``week``.

    The result is appended to the title slug to disambiguate shows that share
    a name.
    """
    # NOTE(review): assumes `week` is a string starting with a 4-digit year
    # (e.g. "YYYY-MM-DD"); this is the year of the `week` value, which may
    # differ from the title's release year — confirm.
    year_prefix = week[0:4]
    return year_prefix

# Turn the raw columns into URL components. 'category2' is derived from the
# already-mapped 'category', so it holds the opposite path segment.
na_genre['category'] = na_genre['category'].map(mapping_category)
na_genre['show_title'] = na_genre['show_title'].map(mapping_title)
na_genre['category2'] = na_genre['category'].map(alt_mapping_category)
na_genre['year'] = na_genre['week'].map(mapping_year)

# Shared pieces of the four candidate URLs.
rt_root = "https://www.rottentomatoes.com/"
primary_prefix = rt_root + na_genre["category"] + "/"
fallback_prefix = rt_root + na_genre["category2"] + "/"
dated_slug = na_genre['show_title'] + "_" + na_genre['year']

# Candidates in the order they are meant to be tried:
#   url  - listed category, slug with year
#   url2 - listed category, plain slug
#   url3 - opposite category, slug with year
#   url4 - opposite category, plain slug
na_genre["url"] = primary_prefix + dated_slug
na_genre["url2"] = primary_prefix + na_genre['show_title']
na_genre["url3"] = fallback_prefix + dated_slug
na_genre["url4"] = fallback_prefix + na_genre['show_title']

# Distinct titles can collapse to the same slug, so de-duplicate again.
na_genre = na_genre.drop_duplicates(subset=['show_title'])

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Persist the URL table.
na_genre.to_csv("output_filename.csv", index=False, encoding='utf8')

: 

In [None]:
# reset_index returns a new frame rather than modifying na_genre in place, so
# the result has to be assigned back for the fresh 0..n-1 index to persist.
na_genre = na_genre.reset_index(drop=True)
# Display the re-indexed frame.
na_genre

: 

In [None]:
# Candidate URL columns, kept in the order they are meant to be tried.
url_columns = ['url', 'url2', 'url3', 'url4']
url_df = na_genre[url_columns]
url_df.head()

: 

In [None]:
# Hand the candidate URLs to the crawler as a 2-D array (one row per title,
# one column per candidate URL).
# NOTE(review): 1768 is hard-coded; presumably the number of rows to process —
# confirm against web_crawler.automate_data_collection.
url_matrix = url_df.to_numpy()
scrapped_name, scrapped_cat = wc.automate_data_collection(url_matrix, 1768)

: 

In [None]:
# Builds a dataframe with show_title, genre, tomatometer and audience score
# for the titles whose genre was missing, from the scraped text.

def _parse_scraped_entry(entry):
    """Split one scraped text block into [title, genre, tomatometer, audience_score].

    The block is expected to be newline-separated: the title on the first
    line, followed by exactly seven more lines, of which the 1st (info line),
    2nd and 5th are used. A block with a different number of lines raises
    ValueError from the unpacking.

    The genre is the part of the info line between its first and last comma
    (assuming ", " separators). If the info line contains fewer than two
    commas, the genre is the empty string.
    """
    title, details = entry.split('\n', 1)
    info, tomato_meter, _, _, audience_score, _, _ = details.split('\n')

    first_comma = info.find(',')
    if first_comma == -1:
        # No comma at all: find/rfind would both return -1 and the slice
        # would silently return info[1:-1], which is not a genre.
        genre = ''
    else:
        # With a single comma the slice is empty (start is past the stop).
        genre = info[first_comma + 2:info.rfind(',')]
    return [title, genre, tomato_meter, audience_score]


# 'NA' marks titles for which nothing was scraped; skip those.
movie_data = [
    _parse_scraped_entry(entry) for entry in scrapped_name if entry != 'NA'
]

# Create DataFrame
df = pd.DataFrame(movie_data, columns=['show_title', 'genre', 'tomatometer', 'audience_score'])
# df.to_csv('genreinfo.csv', index=False)


: 