# Uploading Data Set

In [1]:
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Applied-Modeling/master/data/'

# If you're working locally:
else:
    DATA_PATH = '../data/'
    
# Ignore this Numpy warning when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')

# Importing the necessary libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt

# Reading in movies dataframe

In [3]:
df = pd.read_csv('/Users/bradbrauser/Desktop/Data Science/MoviesOnStreamingPlatforms_updated.csv')

# Wrangle function

The dataset has two rating features - IMDb and Rotten Tomatoes.

IMDb is great for seeing what general audiences think of a movie. If you don’t care what the critics say and want to see what people like yourself think of a movie, then you should use IMDb. Just be aware that fans often skew the vote with 10-star ratings, which may inflate scores somewhat.

Rotten Tomatoes offers the best overall picture of whether a movie is worth seeing at a glance. If you only trust the opinions of top critics and just want to know if a movie is at least decent, you should use Rotten Tomatoes. While the Fresh/Rotten binary can oversimplify the often complex opinions of critics, it should still help you weed out lousy films.

My goal with this project is more in line with IMDb, as even though scores may be skewed a bit by fans of the movies, I still want to know what the public thinks, because it seems that more often than not critics do not always line up with the public opinion.

Not only that, but the Rotten Tomatoes category has over 11,000 null values (IMDb only has less than 600, so with only a little over 16,000 titles, it seemed to be more advantageous to just drop the column all together:

In [5]:
print(df['IMDb'].isnull().sum())
print(df['Rotten Tomatoes'].isnull().sum())

571
11586


In [12]:
def wrangle(df, thresh=500):
    df = df.copy()
    
    # Setting Title as index
    df.set_index(df['Title'], inplace = True)
    
    # Since Rotten Tomatoes feature has over 11,000 missing ratings, I'm going to just drop the Rotten Tomatoes column
    df = df.drop(['Rotten Tomatoes'], axis = 1)
    
    # Dropping rows if nulls exist in IMDb column
    df.dropna(subset=['IMDb'], how='all')
    
    # Creating new target column
    df['Worth Watching?'] = df['IMDb'] >= 6.0
    
    # Creating individual genre columns
    df['Action'] = df['Genres'].str.contains('Action')
    df['Adventure'] = df['Genres'].str.contains('Adventure')
    df['Animation'] = df['Genres'].str.contains('Animation')
    df['Biography'] = df['Genres'].str.contains('Biography')
    df['Comedy'] = df['Genres'].str.contains('Comedy')
    df['Crime'] = df['Genres'].str.contains('Crime')
    df['Documentary'] = df['Genres'].str.contains('Documentary')
    df['Drama'] = df['Genres'].str.contains('Drama')
    df['Family'] = df['Genres'].str.contains('Family')
    df['Fantasy'] = df['Genres'].str.contains('Fantasy')
    df['Film Noir'] = df['Genres'].str.contains('Film Noir')
    df['History'] = df['Genres'].str.contains('History')
    df['Horror'] = df['Genres'].str.contains('Horror')
    df['Music'] = df['Genres'].str.contains('Music')
    df['Musical'] = df['Genres'].str.contains('Musical')
    df['Mystery'] = df['Genres'].str.contains('Mystery')
    df['Romance'] = df['Genres'].str.contains('Romance')
    df['Sci-Fi'] = df['Genres'].str.contains('Sci-Fi')
    df['Short Film'] = df['Genres'].str.contains('Short Film')
    df['Sport'] = df['Genres'].str.contains('Sport')
    df['Superhero'] = df['Genres'].str.contains('Superhero')
    df['Thriller'] = df['Genres'].str.contains('Thriller')
    df['War'] = df['Genres'].str.contains('War')
    df['Western'] = df['Genres'].str.contains('Western')

    # Dropping unnecessary values
    df.drop(['Genres', 'Unnamed: 0', 'ID', 'Type', 'Title', 'IMDb'], axis=1, inplace=True)
    
    # Dropping other nulls
    df = df.dropna()
    
    # Turning boolean values into binary
    df = df*1
    
    # Split label and feature matrix
    # Target
    y = df['Worth Watching?']
    # Feature
    df.drop(['Worth Watching?'], axis=1, inplace=True)
    
    return df, y

# Wrangling data into features and target
X, y = wrangle(df)

# Baseline - Evaluation metric: Accuracy
print(y.value_counts(normalize = True))

# Train test split on years movies were released
cutoff = 2010
X_train = X[X['Year'] < cutoff]
y_train = y.loc[X_train.index]
X_val = X[X['Year'] > cutoff]
y_val = y.loc[X_val.index]

1    0.513514
0    0.486486
Name: Worth Watching?, dtype: float64


A few comments about the wrangling function above:

- I set the 'Title' column as the index because I wanted to be able to see the 'Title' of the movie rather than the index number when doing various slicing.

- My question for this project is simply: "Is this worth watching?" If you decide to watch a documentary on Netflix, which documentaries are going to be worth the time to sit through?

- The target is actually a bit of a combination between the IMDb and Rotten Tomatoes column. I use the IMDb rating as a basis for the target, but make it into a binary classification by setting the lower limit of what is "Worth Watching?" to 6.0. My rationale for choosing this lower limit is two-fold:

    - In most school systems, the grading scale is based on multiples of 10% (90% = A, 80% = B, etc.). 60%, although seemingly low, is still considered passing.
    - 

# Linear Model

In [None]:
# Logistic Model
log_model = Pipeline([
                ('oe', OrdinalEncoder()),
                ('imputer', SimpleImputer()),
                ('classifier', LogisticRegression(random_state = 42, max_iter = 100, 
                                                  verbose = 5, n_jobs = 4, ))        
])



log_model.fit(X_train, y_train);

print('Train accuracy:', log_model.score(X_train, y_train))
print('Val accuracy:', log_model.score(X_val, y_val))

# Random Forest Model

In [None]:
from scipy.stats import randint, uniform

rf_model = make_pipeline(
    ce.OneHotEncoder(), 
    SimpleImputer(strategy = 'median'), 
    StandardScaler(), 
    RandomForestClassifier(criterion='entropy', max_depth=20, n_estimators=77, random_state=42, 
                           max_features = 0.5234288634835691))

param_distributions = {   
    'simpleimputer__strategy': ['mean', 'median'], 
    'randomforestclassifier__n_estimators': randint(50, 500), 
    'randomforestclassifier__max_depth': [5, 10, 15, 20, None], 
    'randomforestclassifier__max_features': uniform(0, 1), 
}

search = RandomizedSearchCV(
    rf_model, 
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=3, 
    scoring='accuracy', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train, y_train);

# Fitting the model
rf_model.fit(X_train, y_train)

print('Training Accuracy:', rf_model.score(X_train, y_train))
print('Validation Accuracy:', rf_model.score(X_val, y_val))
print('Best hyperparameters', search.best_params_)
print('Cross-validation MAE', -search.best_score_)

# Visualizations

In [None]:
# Confusion Matrix - Random Forest
from sklearn.metrics import plot_confusion_matrix, classification_report
import matplotlib.pyplot as plt

plt.rcParams['figure.dpi'] = 100
plot_confusion_matrix(rf_model, X_val, y_val, values_format='.0f', xticks_rotation='vertical')

In [None]:
# Confusion Matrix - Logistic Regression
from sklearn.metrics import plot_confusion_matrix, classification_report
import matplotlib.pyplot as plt

plt.rcParams['figure.dpi'] = 100
plot_confusion_matrix(log_model, X_val, y_val, values_format='.0f', xticks_rotation='vertical')

In [None]:
# Permutation Importances - Random Forest
from sklearn.inspection import permutation_importance

result = permutation_importance(rf_model, X_val, y_val, 
                                n_repeats=5, random_state=0)

df = pd.DataFrame({'feature': X_val.columns,
                   'importances_mean': np.round(result['importances_mean'], 3),
                   'importances_std': result['importances_std']})

df.sort_values(by='importances_mean', ascending=False)

In [None]:
# Permutation Importances - Logistic Regression
from sklearn.inspection import permutation_importance

result = permutation_importance(log_model, X_val, y_val, 
                                n_repeats=5, random_state=0)

df = pd.DataFrame({'feature': X_val.columns,
                   'importances_mean': np.round(result['importances_mean'], 3),
                   'importances_std': result['importances_std']})

df.sort_values(by='importances_mean', ascending=False)