## Setting up/formatting the dataset

In this notebook, we perform the following tasks:

- Consolidate and merge different datasets.
- Combine scraped data.
- Generate the dataset for training our model.

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import RocCurveDisplay, recall_score, precision_score, roc_curve, roc_auc_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import matplotlib as mpl

In [21]:
main=pd.read_csv('datasets/all-weeks-global.csv')
movies=pd.read_csv('datasets/bestmovies.csv')
shows=pd.read_csv('datasets/bestshows.csv')
globai=pd.read_csv('datasets/whatwewatch.csv')

In [22]:
main

Unnamed: 0,week,category,weekly_rank,show_title,season_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,Unnamed: 9,episode_launch_details
0,2024-03-24,Films (English),1,Irish Wish,,40000000,1.5667,25500000.0,2,False,
1,2024-03-24,Films (English),2,Damsel,,35700000,1.8333,19500000.0,3,False,
2,2024-03-24,Films (English),3,Shooter,,11700000,2.0833,5600000.0,5,False,
3,2024-03-24,Films (English),4,Ford v. Ferrari,,11200000,2.5500,4400000.0,2,False,
4,2024-03-24,Films (English),5,The Casagrandes Movie,,5700000,1.4167,4000000.0,1,False,
...,...,...,...,...,...,...,...,...,...,...,...
5715,2021-07-04,TV (Non-English),6,Elite,Elite: Season 1,10530000,,,1,False,
5716,2021-07-04,TV (Non-English),7,Elite,Elite: Season 3,10200000,,,1,False,
5717,2021-07-04,TV (Non-English),8,Elite,Elite: Season 2,10140000,,,1,False,
5718,2021-07-04,TV (Non-English),9,Katla,Katla: Season 1,9190000,,,1,False,


In [23]:
main['is_staggered_launch']=main['Unnamed: 9']

In [24]:
def strip_season(title):
    return title.split(': Season')[0].strip()
globai['Title'] = globai['Title'].apply(strip_season)

In [25]:
main=main[['week', 'category','show_title', 'weekly_hours_viewed', 'runtime', 'weekly_views', 'cumulative_weeks_in_top_10', 'is_staggered_launch']]
globai=globai[['Title','Available Globally?','Release Date']].set_index('Title')
shows=shows[['TITLE','MAIN_GENRE','SCORE','NUMBER_OF_VOTES', 'MAIN_PRODUCTION', 'NUMBER_OF_SEASONS' ]].set_index('TITLE')
movies=movies[['TITLE','MAIN_GENRE','SCORE','NUMBER_OF_VOTES', 'MAIN_PRODUCTION' ]].set_index('TITLE')
showsmovies = pd.concat([shows, movies], axis=0)

In [26]:
new=main.join(globai, on='show_title', how='left')
comb_df=new.join(showsmovies, on='show_title', how='left')
comb_df.head()

Unnamed: 0,week,category,show_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,is_staggered_launch,Available Globally?,Release Date,MAIN_GENRE,SCORE,NUMBER_OF_VOTES,MAIN_PRODUCTION,NUMBER_OF_SEASONS
0,2024-03-24,Films (English),Irish Wish,40000000,1.5667,25500000.0,2,False,,,,,,,
1,2024-03-24,Films (English),Damsel,35700000,1.8333,19500000.0,3,False,,,,,,,
2,2024-03-24,Films (English),Shooter,11700000,2.0833,5600000.0,5,False,No,2016-11-16,war,7.5,35547.0,US,3.0
2,2024-03-24,Films (English),Shooter,11700000,2.0833,5600000.0,5,False,No,2016-11-16,thriller,7.2,329417.0,US,
2,2024-03-24,Films (English),Shooter,11700000,2.0833,5600000.0,5,False,No,2018-06-22,war,7.5,35547.0,US,3.0


In [27]:
#detatching the language from type
comb_df[['type', 'language']] = comb_df['category'].str.split(' \(', expand=True)
comb_df['language'] = comb_df['language'].str.replace(')', '')
comb_df.drop(columns=['category'], inplace=True)

In [28]:
comb_df

Unnamed: 0,week,show_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,is_staggered_launch,Available Globally?,Release Date,MAIN_GENRE,SCORE,NUMBER_OF_VOTES,MAIN_PRODUCTION,NUMBER_OF_SEASONS,type,language
0,2024-03-24,Irish Wish,40000000,1.5667,25500000.0,2,False,,,,,,,,Films,English
1,2024-03-24,Damsel,35700000,1.8333,19500000.0,3,False,,,,,,,,Films,English
2,2024-03-24,Shooter,11700000,2.0833,5600000.0,5,False,No,2016-11-16,war,7.5,35547.0,US,3.0,Films,English
2,2024-03-24,Shooter,11700000,2.0833,5600000.0,5,False,No,2016-11-16,thriller,7.2,329417.0,US,,Films,English
2,2024-03-24,Shooter,11700000,2.0833,5600000.0,5,False,No,2018-06-22,war,7.5,35547.0,US,3.0,Films,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5717,2021-07-04,Elite,10140000,,,1,False,Yes,2020-03-13,,,,,,TV,Non-English
5717,2021-07-04,Elite,10140000,,,1,False,Yes,2021-06-18,,,,,,TV,Non-English
5718,2021-07-04,Katla,9190000,,,1,False,Yes,2021-06-17,,,,,,TV,Non-English
5719,2021-07-04,Record of Ragnarok,9140000,,,1,False,No,2023-01-26,,,,,,TV,Non-English


In [29]:
comb_df['Release Date'] = pd.to_datetime(comb_df['Release Date'])

In [30]:
grouped_df = comb_df.groupby('show_title').agg({
    'type': lambda x: x.mode().iat[0] if not x.empty else None,
    'weekly_hours_viewed': 'mean',
    'cumulative_weeks_in_top_10': 'max',
    'runtime': lambda x: x.sum() if not x.isnull().all() else None,
    'NUMBER_OF_SEASONS': 'max',
    'language': lambda x: x.mode().iat[0] if not x.empty else None,
    'is_staggered_launch': lambda x: x.mode().iat[0] if not x.empty else None,
    'Available Globally?': lambda x: x.dropna().mode().iat[0] if not x.dropna().empty else None,
    'Release Date': 'min',
    'MAIN_GENRE': lambda x: list(set(x)) if not x.isnull().all() else None,
    'SCORE': 'mean',
    'NUMBER_OF_VOTES': 'mean',
    'MAIN_PRODUCTION': lambda x: x.dropna().mode().iat[0] if not x.dropna().empty else None,
    'week': 'max',
})
grouped_df.reset_index(inplace=True)

In [34]:
grouped_df.columns = [column.lower().replace(' ', '_') for column in grouped_df.columns]
grouped_df.rename(columns={'main_genre': 'genre', 'available_globally?': 'available_globally'}, inplace=True)
grouped_df

Unnamed: 0,show_title,type,weekly_hours_viewed,cumulative_weeks_in_top_10,runtime,number_of_seasons,language,is_staggered_launch,available_globally,release_date,genre,score,number_of_votes,main_production,week
0,'83,Films,6.825000e+06,2,,,Non-English,False,No,NaT,,,,,2022-04-03
1,10 Days of a Bad Man,Films,6.900000e+06,2,4.1334,,Non-English,False,,NaT,,,,,2023-08-27
2,10 Days of a Good Man,Films,7.415000e+06,2,,,Non-English,False,,NaT,,,,,2023-03-12
3,1000 Miles from Christmas,Films,6.795000e+06,2,,,Non-English,False,,NaT,,,,,2022-01-02
4,12 Strong,Films,8.210000e+06,2,,,English,False,No,NaT,,,,,2022-07-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860,maboroshi,Films,2.400000e+06,1,1.8500,,Non-English,False,,NaT,,,,,2024-01-21
1861,¡Que viva México!,Films,1.249250e+07,4,,,Non-English,False,Yes,2023-05-11,,,,,2023-06-04
1862,Ìjọ̀gbọ̀n,Films,4.950000e+06,2,3.8666,,Non-English,False,,NaT,,,,,2023-10-22
1863,أصحاب ...ولا أعزّ,Films,2.910000e+06,3,,,Non-English,False,No,2022-01-20,,,,,2022-02-06


SINCE THERE ARE ONLY 1800 DATA POINTS I SAW WE JUST SCRAPE FOR ALL OF THEM AND THEN WE CAN GET THE TOMATOMETER AND AUDIENCE SCORE FOR ALL

In [35]:
grouped_df.drop(columns=['score', 'number_of_votes'], inplace=True)
grouped_df.loc[:, 'genre'] = np.nan

In [37]:
grouped_df

Unnamed: 0,show_title,type,weekly_hours_viewed,cumulative_weeks_in_top_10,runtime,number_of_seasons,language,is_staggered_launch,available_globally,release_date,genre,main_production,week
0,'83,Films,6.825000e+06,2,,,Non-English,False,No,NaT,,,2022-04-03
1,10 Days of a Bad Man,Films,6.900000e+06,2,4.1334,,Non-English,False,,NaT,,,2023-08-27
2,10 Days of a Good Man,Films,7.415000e+06,2,,,Non-English,False,,NaT,,,2023-03-12
3,1000 Miles from Christmas,Films,6.795000e+06,2,,,Non-English,False,,NaT,,,2022-01-02
4,12 Strong,Films,8.210000e+06,2,,,English,False,No,NaT,,,2022-07-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860,maboroshi,Films,2.400000e+06,1,1.8500,,Non-English,False,,NaT,,,2024-01-21
1861,¡Que viva México!,Films,1.249250e+07,4,,,Non-English,False,Yes,2023-05-11,,,2023-06-04
1862,Ìjọ̀gbọ̀n,Films,4.950000e+06,2,3.8666,,Non-English,False,,NaT,,,2023-10-22
1863,أصحاب ...ولا أعزّ,Films,2.910000e+06,3,,,Non-English,False,No,2022-01-20,,,2022-02-06


In [36]:
#exporting grouped_df to get genres + scores via scraping
grouped_df.to_csv('scraping/grouped_df.csv', index=False)

From the scraping, we get genres formatted like "Drama/Biography" and "Drama/Mystery & thriller". We want all possible genres to be many hot encoded.

In [117]:
# make a test dataframe
m1 = ['83', 'Drama/Biography', '80%', '95%']
m2 = ['10 DAYS OF A BAD MAN', 'Drama','--','42%']
m3 = ['10 DAYS OF A GOOD MAN', 'Drama/Mystery & thriller', '--', '66%']
test_data = np.asarray([np.asarray(m1), np.asarray(m2),np.asarray(m3)])
test_df = pd.DataFrame(test_data, columns=['show_title', 'genre', 'tomatometer', 'audience_score'])
test_df

Unnamed: 0,show_title,genre,tomatometer,audience_score
0,83,Drama/Biography,80%,95%
1,10 DAYS OF A BAD MAN,Drama,--,42%
2,10 DAYS OF A GOOD MAN,Drama/Mystery & thriller,--,66%


In [118]:
# get the genres column
genres = test_df.loc[:, 'genre']
genres = genres.explode().unique()
genres

array(['Drama/Biography', 'Drama', 'Drama/Mystery & thriller'],
      dtype=object)

In [119]:
# get a list of all the possible genres
def clean_genres(genres):
  clean_genres = []
  for genre in genres:
    words = genre.split('/')
    for word in words:
        word = word.upper()
        if '&' in word:
            sub_words = word.split(' & ')
            for sub_word in sub_words:
                if sub_word.strip() not in clean_genres:
                    clean_genres.append(sub_word.strip())
        elif word.strip() not in clean_genres:
            clean_genres.append(word.strip())
  return clean_genres

genre_categories = clean_genres(genres)
genre_categories

['DRAMA', 'BIOGRAPHY', 'MYSTERY', 'THRILLER']

In [120]:
# manyhot encoding function
def manyhot(column=None, col=None):
    column = column.tolist()
    for x in range(len(column)):
        column[x] = column[x].upper()
    main = np.empty(0)

    for row in column:
        arr = np.empty(0)
        for val in col:
            if val in row:
                arr = np.append(arr, 1)
            else:
                arr = np.append(arr, 0)
        main = np.append(main, arr)
    return main.reshape(len(column), len(col))

In [123]:
# manyhot encode the genres column
manyhot_genres = manyhot(genres, genre_categories)
manyhot_genres

array([[1., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 1., 1.]])

In [124]:
# set the genres column to our many hot encoded column
test_df['genre'] = manyhot_genres.tolist()
test_df


Unnamed: 0,show_title,genre,tomatometer,audience_score
0,83,"[1.0, 1.0, 0.0, 0.0]",80%,95%
1,10 DAYS OF A BAD MAN,"[1.0, 0.0, 0.0, 0.0]",--,42%
2,10 DAYS OF A GOOD MAN,"[1.0, 0.0, 1.0, 1.0]",--,66%
