<a href="https://colab.research.google.com/github/bcschaefer/TV-Season-Predictor/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Reading CSVs and Engineering Columns**

In [None]:
import pandas as pd
import numpy as np

# Load the alternate-titles table (tab-separated). Only the four columns used
# downstream are read, every column is parsed as a pandas string, and the
# literal marker \N is treated as a missing value.
info = pd.read_csv(
    "/content/drive/MyDrive/Movie Database/title.akas.tsv",
    sep='\t',
    usecols=['titleId', 'title', 'language', 'types'],
    dtype='string',
    na_values='\\N',
)

In [None]:
# Load the title-basics table (tab-separated); \N marks missing values.
# Years use the nullable Int64 dtype so missing years survive as <NA>;
# no dtype is given for titleType, so pandas infers it.
basics_dtypes = {
    'tconst': 'string',
    'startYear': 'Int64',
    'endYear': 'Int64',
    'runtimeMinutes': 'string',
    'genres': 'string',
}
basics = pd.read_csv(
    "/content/drive/MyDrive/Movie Database/title.basics.tsv",
    sep='\t',
    na_values='\\N',
    usecols=['tconst', 'titleType', 'startYear', 'endYear', 'runtimeMinutes', 'genres'],
    dtype=basics_dtypes,
)


In [None]:
# Load the ratings table (tab-separated); \N marks missing values.
# numVotes uses the nullable Int64 dtype so missing counts stay <NA>.
ratings = pd.read_csv(
    "/content/drive/MyDrive/Movie Database/title.ratings.tsv",
    sep='\t',
    na_values='\\N',
    dtype={'tconst': 'string', 'averageRating': 'float', 'numVotes': 'Int64'},
)

In [None]:
print(info)

# Count how many alternate-title rows each title has. The column names are set
# explicitly: relying on value_counts() naming its result 'count' only works on
# pandas >= 2.0 (older versions would leave the counts in a column called
# 'titleId' and the merge below would silently join on the wrong values).
variants = (
    info['titleId']
    .value_counts()
    .rename_axis('titleId')
    .reset_index(name='variants')
)

# Keep one row per title id in every table before joining.
ratings = ratings.drop_duplicates(subset='tconst')
basics = basics.drop_duplicates(subset='tconst')
info = info.drop_duplicates(subset='titleId')

# Snapshot the de-duplicated tables in the working directory.
ratings.to_csv('ratings.csv')
basics.to_csv('basics.csv')
info.to_csv('info.csv')

# Inner-join titles -> basics -> ratings -> variant counts. The last merge keeps
# both id columns, which end up as 'titleId_x' and 'titleId_y'.
shows_raw = pd.merge(info, basics, left_on='titleId', right_on='tconst')
shows_raw = pd.merge(shows_raw, ratings, left_on="tconst", right_on='tconst')
shows_raw = pd.merge(shows_raw, variants, left_on="tconst", right_on='titleId', how='inner')
print(shows_raw)

shows_raw.to_csv('shows_raw.csv')
print(f"Shape: {shows_raw.shape}")

            titleId                      title language     types
0         tt0000001                 Carmencita     <NA>  original
8         tt0000002     Le clown et ses chiens     <NA>  original
16        tt0000003             Pauvre Pierrot     <NA>  original
25        tt0000004                Un bon bock     <NA>  original
33        tt0000005           Blacksmith Scene     <NA>  original
...             ...                        ...      ...       ...
48099261  tt9916848              Episode #3.17     <NA>  original
48099269  tt9916850              Episode #3.19     <NA>  original
48099277  tt9916852              Episode #3.20     <NA>  original
48099285  tt9916856                   The Wind     <NA>  original
48099287  tt9916880  Horrid Henry Knows It All     <NA>  original

[10710491 rows x 4 columns]
         titleId_x                       title language     types     tconst  \
0        tt0000001                  Carmencita     <NA>  original  tt0000001   
1        tt0000002 

In [None]:
# Reload the merged table from Drive and list its columns.
shows_raw = pd.read_csv(
    '/content/drive/MyDrive/Movie Database/shows_raw-2.csv',
    low_memory=False,
)
print(shows_raw.columns)

Index(['Unnamed: 0', 'titleId_x', 'title', 'language', 'types', 'tconst',
       'titleType', 'startYear', 'endYear', 'runtimeMinutes', 'genres',
       'averageRating', 'numVotes', 'titleId_y', 'variants'],
      dtype='object')


**Further Cleaning**

In [None]:
# Keep only TV series, with at most one row per tconst.
is_series = shows_raw['titleType'] == 'tvSeries'
shows = shows_raw[is_series].drop_duplicates(subset='tconst')

# Drop the saved index column and the columns not used as features.
unused_columns = ['Unnamed: 0', 'language', 'titleId_y', 'types', 'titleType']
shows = shows.drop(columns=unused_columns)
print(shows.columns)
shows.to_csv('shows_clean.csv')

Index(['titleId_x', 'title', 'tconst', 'startYear', 'endYear',
       'runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'variants'],
      dtype='object')


In [None]:
# Reload the cleaned TV-series table and inspect *that* frame: the original
# cell printed shows_raw here, so its output described the wrong table.
shows = pd.read_csv('/content/drive/MyDrive/Movie Database/shows_clean.csv', low_memory=False)
print(shows.columns)
print(shows.shape)

Index(['Unnamed: 0', 'titleId_x', 'title', 'language', 'types', 'tconst',
       'titleType', 'startYear', 'endYear', 'runtimeMinutes', 'genres',
       'averageRating', 'numVotes', 'titleId_y', 'variants'],
      dtype='object')
(95107, 15)


**One Hot Encoding Genres**

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer


# Rows without a genre string cannot be encoded, so they are removed first.
mask = shows['genres'].notnull()
shows = shows[mask]

# Each genre string is comma-separated; split it into a list of labels.
genres = shows['genres'].str.split(',')

# MultiLabelBinarizer turns the list-of-lists into a 0/1 indicator matrix
# with one column per distinct label (mlb.classes_).
mlb = MultiLabelBinarizer()
one_hot_encoding = mlb.fit_transform(genres)
one_hot_df = pd.DataFrame(
    one_hot_encoding,
    index=shows.index,
    columns=mlb.classes_,
)

# Swap the raw genre string for the indicator columns.
shows = pd.concat([shows.drop(columns=['genres']), one_hot_df], axis=1)
print(mlb.classes_)
print(shows.shape)

shows.to_csv('shows_ohe.csv')

['Action' 'Adult' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'Game-Show' 'History' 'Horror'
 'Music' 'Musical' 'Mystery' 'News' 'Reality-TV' 'Romance' 'Sci-Fi'
 'Short' 'Sport' 'Talk-Show' 'Thriller' 'War' 'Western']
(91609, 37)


**Finding Number Of Seasons**

In [None]:
import pandas as pd
import numpy as np

shows = pd.read_csv('/content/drive/MyDrive/Movie Database/shows_ohe-3.csv', low_memory=False)

# Rows that have a recorded end year.
mask = shows['endYear'].notnull()

# Explicit copy: a 'seasons' column is assigned into showsDrop later, and
# assigning into a bare slice of `shows` raises SettingWithCopyWarning.
showsDrop = shows[mask].copy()

def get_range(row):
    """Return ``row['endYear'] - row['startYear']``.

    ``row`` is anything indexable by those two keys (for example a DataFrame
    row passed in by ``DataFrame.apply(..., axis=1)``).
    """
    first_year = row['startYear']
    last_year = row['endYear']
    return last_year - first_year



# Forward-fill: a missing endYear is replaced with the last non-missing value
# above it. Series.ffill() replaces the deprecated fillna(method='ffill').
shows['endYear'] = shows['endYear'].ffill()

# 'seasons' is computed as endYear - startYear. Plain column arithmetic gives
# the same values as a row-wise apply without a Python call per row, and
# assign() returns a new frame so nothing is written into a slice.
showsDrop = showsDrop.assign(seasons=showsDrop['endYear'] - showsDrop['startYear'])
shows['seasons'] = shows['endYear'] - shows['startYear']

print(showsDrop.shape)
print(shows.shape)
shows.to_csv('showSeasons.csv')
showsDrop.to_csv('showsDrop.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  showsDrop['seasons'] = showsDrop.apply(get_range, axis=1)


(42351, 39)
(91609, 39)


In [None]:
import pandas as pd
import numpy as np
# Reload both season tables from Drive: `shows` (all rows) and `showsDrop`.
shows = pd.read_csv(
    '/content/drive/MyDrive/Movie Database/showSeasons.csv',
    low_memory=False,
)
showsDrop = pd.read_csv(
    '/content/drive/MyDrive/Movie Database/showsDrop.csv',
    low_memory=False,
)

In [None]:
# Feature columns shared by both datasets: four numeric columns followed by the
# one-hot genre indicators. Defined once so the two matrices cannot drift apart.
FEATURE_COLS = ['startYear', 'runtimeMinutes', 'averageRating',
                'numVotes', 'Action', 'Adult', 'Adventure',
                'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
                'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror',
                'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']

# Forward-fill missing values, then drop any row that still has one.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
shows = shows.ffill().dropna()
X = shows.loc[:, FEATURE_COLS].to_numpy()
y = shows.loc[:, 'seasons'].to_numpy()

print(X.shape)
print(y.shape)

# For showsDrop, rows with any missing value are dropped without filling.
showsDrop = showsDrop.dropna()
Xdrop = showsDrop.loc[:, FEATURE_COLS].to_numpy()
ydrop = showsDrop.loc[:, 'seasons'].to_numpy()

print(Xdrop.shape)
print(ydrop.shape)

(91608, 31)
(91608,)
(27448, 31)
(27448,)


**Base Case**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline MLP regressor, limited to a single training iteration.
# Hold out 40% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

# Standardise both splits using statistics learned from the training split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPRegressor(max_iter=1).fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# For a regressor, score() is the R^2 on the held-out split (not an accuracy).
print(mlp.score(X_test, y_test))

0.15124465891818561




**Shows Model**

In [None]:
# MLP regressor on the full `shows` matrix (X, y), up to 10000 iterations.
# Hold out 40% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

# Standardise both splits using statistics learned from the training split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPRegressor(max_iter=10000).fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# For a regressor, score() is the R^2 on the held-out split (not an accuracy).
print(mlp.score(X_test, y_test))

0.1663630948187852


**Drop Shows Model**

In [None]:
# MLP regressor on the `showsDrop` matrix (Xdrop, ydrop), up to 10000 iterations.
# Hold out 40% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(Xdrop, ydrop, test_size=0.4)

# Standardise both splits using statistics learned from the training split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPRegressor(max_iter=10000).fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# For a regressor, score() is the R^2 on the held-out split (not an accuracy).
print(mlp.score(X_test, y_test))

0.18056287427174988


**Using Classifier Instead of Regressor**

In [None]:
from sklearn.neural_network import MLPClassifier

showsDrop = pd.read_csv('/content/drive/MyDrive/Movie Database/showsDrop.csv', low_memory=False)

# Drop every row with a missing value, then build the feature matrix and target.
showsDrop = showsDrop.dropna()
Xdrop = showsDrop.loc[:, ['startYear', 'runtimeMinutes', 'averageRating',
                  'numVotes', 'Action', 'Adult', 'Adventure',
                  'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
                  'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror',
                  'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']].to_numpy()
ydrop = showsDrop.loc[:, 'seasons'].to_numpy()

#mlp
(X_train, X_test, y_train, y_test) = train_test_split(Xdrop, ydrop, test_size=0.4)

# Fit the scaler on the training split ONLY. The original cell called
# scaler.fit(X_test) right afterwards, which threw away the training statistics
# and scaled both splits with test-set statistics (data leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Each distinct 'seasons' value is treated as a class label.
mlp = MLPClassifier(max_iter=10000)
mlp.fit(X_train, y_train)

y_pred = mlp.predict(X_test)

print(mlp.score(X_test, y_test))     # print accuracy

0.3861566484517304


**Add Variants Feature**

In [None]:
showsDrop = pd.read_csv('/content/drive/MyDrive/Movie Database/showsDrop.csv', low_memory=False)

# Same setup as the plain classifier, with 'variants' added to the features.
showsDrop = showsDrop.dropna()
Xdrop = showsDrop.loc[:, ['startYear', 'runtimeMinutes', 'averageRating',
                  'variants', 'numVotes', 'Action', 'Adult', 'Adventure',
                  'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
                  'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror',
                  'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                  'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']].to_numpy()
ydrop = showsDrop.loc[:, 'seasons'].to_numpy()

#mlp
(X_train, X_test, y_train, y_test) = train_test_split(Xdrop, ydrop, test_size=0.4)

# Fit the scaler on the training split ONLY. The original cell called
# scaler.fit(X_test) right afterwards, which threw away the training statistics
# and scaled both splits with test-set statistics (data leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPClassifier(max_iter=10000)
mlp.fit(X_train, y_train)

y_pred = mlp.predict(X_test)

print(mlp.score(X_test, y_test))     # print accuracy

0.3895264116575592


**Feature Dropping**

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Keep the 10 features with the highest f_classif score against the target.
print(Xdrop.shape)
sel = SelectKBest(score_func=f_classif, k=10)
X = sel.fit(Xdrop, ydrop).transform(Xdrop)
print(X.shape)

(27448, 32)
(27448, 10)


In [None]:
# Train the classifier on the reduced feature matrix X.
(X_train, X_test, y_train, y_test) = train_test_split(X, ydrop, test_size=0.4)

# Fit the scaler on the training split ONLY. The original cell called
# scaler.fit(X_test) right afterwards, which threw away the training statistics
# and scaled both splits with test-set statistics (data leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPClassifier(max_iter=10000)
mlp.fit(X_train, y_train)

y_pred = mlp.predict(X_test)

print(mlp.score(X_test, y_test))     # print accuracy

0.39489981785063755


In [None]:
# Feature names in the column order used to build Xdrop, so the selector's
# boolean support mask can be mapped back to names.
feature_names = ['startYear', 'runtimeMinutes', 'averageRating',
                 'variants', 'numVotes', 'Action', 'Adult', 'Adventure',
                 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
                 'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror',
                 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']
cols = showsDrop.loc[:, feature_names]

# Names of the columns the fitted selector `sel` kept.
print(cols.columns[sel.get_support()])

Index(['startYear', 'variants', 'numVotes', 'Drama', 'Family', 'Game-Show',
       'Music', 'News', 'Romance', 'Talk-Show'],
      dtype='object')


In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

clf = MLPClassifier(max_iter=1000)                            # create classifier
sel = SequentialFeatureSelector(clf, n_features_to_select=10) # create selector
sel.fit(Xdrop, ydrop)                                         # search for the 10 features to keep
print(sel.get_support())                                      # print feature mask
# Transform the matrix the selector was fitted on. The original transformed X,
# which had already been reduced to 10 columns by SelectKBest and so does not
# have the feature count `sel` expects.
X = sel.transform(Xdrop)                                      # select columns
print(X.shape)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np

# Rebuild the feature matrix and target from the saved showsDrop table,
# dropping every row that has a missing value.
showsDrop = pd.read_csv('/content/drive/MyDrive/Movie Database/showsDrop.csv', low_memory=False)
showsDrop = showsDrop.dropna()

feature_names = ['startYear', 'runtimeMinutes', 'averageRating',
                 'variants', 'numVotes', 'Action', 'Adult', 'Adventure',
                 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
                 'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror',
                 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
                 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']
Xdrop = showsDrop.loc[:, feature_names].to_numpy()
ydrop = showsDrop.loc[:, 'seasons'].to_numpy()

# Keep the 10 features with the highest f_classif score.
print(Xdrop.shape)
sel = SelectKBest(score_func=f_classif, k=10)
X = sel.fit(Xdrop, ydrop).transform(Xdrop)
print(X.shape)

# Try max_iter = 100, 200, ..., 900 and remember the setting with the highest
# mean 5-fold cross-validation score (the first one wins on ties).
maxAccuracy = 0     # best mean score seen so far
maxIter = 0         # max_iter value that produced it
for i in range(100, 1000, 100):
    mlp = MLPClassifier(max_iter=i)
    fold_scores = cross_val_score(mlp, X, ydrop, cv=5)
    accuracy = fold_scores.mean()
    if accuracy > maxAccuracy:
        maxAccuracy, maxIter = accuracy, i





