# Code 4: EDA and Model build logic 

### Importing necessary libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from numpy import size, array
%matplotlib inline
import matplotlib.lines as lines
import matplotlib.gridspec as gridspec

from scipy.stats import probplot
from warnings import filterwarnings
filterwarnings('ignore')

plt.rcParams['font.family'] = 'monospace'

from time import time
import numpy as np

from sklearn import linear_model
from sklearn import datasets
from sklearn.svm import l1_min_c

import math
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split #for split the data
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score #for accuracy_score
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean

pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Mount Google Drive at /content/gdrive; the pickle paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# read in data
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted location.
data = pd.read_pickle(r'/content/gdrive/MyDrive/MSDS_CAPSTONE/Data/master_data_v3.pkl')

### Defining color schemes

In [None]:
# Palette shared by the figures below. Note that '#b33a3a' appears twice (index 2 and index 4).
colors = ['#261421','#5e0000','#b33a3a','#c1071e','#b33a3a','#f2b0a5']
bg_color = '#fbfbfb'   # figure and axes background
txt_color = '#5c5c5c'  # titles, annotations and tick labels

# Preview the palette as a strip of swatches.
sns.palplot(colors)

## EDA 
### Distribution of the Movie Quality Score

In [None]:
# Three-panel view of the RT_comb_score distribution:
#   ax0 = KDE (left, full height), ax1 = box plot (top right), ax2 = probability plot (bottom right).
fig = plt.figure(tight_layout=True, figsize=(15,9))
gs = gridspec.GridSpec(nrows=2, ncols=2, width_ratios=[3,1])

fig.patch.set_facecolor(bg_color)

ax0 = fig.add_subplot(gs[:,0])
ax1 = fig.add_subplot(gs[0,1])
ax2 = fig.add_subplot(gs[1,1])

for panel in (ax0, ax1, ax2):
    panel.set_facecolor(bg_color)

# Deliberately not named `mean`/`median`: `mean` is imported from `statistics`
# in the first cell and would otherwise be overwritten by a float.
score_mean = data['RT_comb_score'].mean()
score_median = data['RT_comb_score'].median()

#################
#### KDE-PLOT####
#################
ax0.axvline(x=score_mean, ymin=0, ymax=1, zorder=2, color='#fff', alpha=0.5, lw=2, ls='--')
ax0.axvline(x=score_median, ymin=0, ymax=1, zorder=2, color='#fff', alpha=0.5, lw=2, ls=':')

# Styling shared by the two call-out boxes on the KDE panel.
callout_style = dict(
    color=txt_color,
    fontsize=14, fontweight='light',
    fontfamily='calibri', fontstyle='italic',
    va='center', ha='center',
    bbox=dict(
        boxstyle='square,pad=0.3',
        facecolor=bg_color, edgecolor=txt_color
    ),
)

# The annotation text is passed positionally: the keyword was renamed from
# `s` to `text` in Matplotlib 3.3, and positional use works on every version.
ax0.annotate(
    f"Mean: {np.round(score_mean,1)}",
    xy=(score_mean, 1),
    xytext=(score_mean - 0.2, 1.25),
    arrowprops=dict(
        arrowstyle='->',
        color='#000',
        connectionstyle='arc3, rad=0.5'
    ),
    **callout_style
)

ax0.annotate(
    f"Median: {np.round(score_median,1)}",
    xy=(score_median, 1),
    xytext=(score_median + 0.2, 1.25),
    arrowprops=dict(
        arrowstyle='->',
        color='#000',
        connectionstyle='arc3, rad=-0.25'
    ),
    **callout_style
)

# `fill=True` replaces the deprecated `shade=True`.
sns.kdeplot(
    data=data, x='RT_comb_score', fill=True, color=colors[0], edgecolor=colors[4], lw=1, alpha=0.8, ax=ax0, zorder=1
)

ax0.set_xlabel('')
ax0.set_ylabel('')
ax0.set_yticks([])

##################
#### BOX-PLOT ####
##################

ax1.boxplot(
    data=data, x='RT_comb_score',
    vert=False, patch_artist=True,
    boxprops=dict(facecolor=colors[4], color='#fff', lw=0),
    whiskerprops=dict(color='gray', lw=1, ls='--'),
    capprops=dict(color='gray', lw=1, ls='--'),
    medianprops=dict(color='#fff', lw=2),
    flierprops=dict(markerfacecolor=colors[0],alpha=0.75)
)

# Positions are in 'axes points', i.e. offsets from the lower-left corner of ax1.
ax1.annotate(
    'Right -outliers',
    xy=(200, 165),
    xytext=(190,225),
    color=txt_color,
    fontsize=14, fontweight='light',
    fontfamily='calibri', fontstyle='italic',
    xycoords='axes points',
    arrowprops=dict(arrowstyle="<-",connectionstyle="arc3, rad=-0.25")
)

ax1.set_xlabel('')
ax1.set_ylabel('')
ax1.set_xticks([])
ax1.set_yticks([])

###################
#### PROB-PLOT ####
###################

res = probplot(x=data['RT_comb_score'], plot=ax2)

# probplot adds two lines to ax2; the first is restyled as markers, the second as a thin dashed line.
l0 = ax2.get_lines()[0]
l1 = ax2.get_lines()[1]

l0.set_marker('D')
l0.set_alpha(0.25)
l0.set_color(colors[3])
l1.set_color(colors[4])
l1.set_linestyle('--')
l1.set_linewidth(0.5)
l1.set_alpha(0.75)

ax2.set_xlabel('')
ax2.set_ylabel('')
ax2.set_xticks([])
ax2.set_yticks([])
ax2.set_title('')

# Text & Titles
fig.text(
    s='Content Quality Score - Distribution',
    x=0, y=0.975,
    color=txt_color,
    fontsize=25, fontweight='bold'
)

#fig.text(
#    s='''
#    Our Content Quality Score is more or less normally distributed.
#    However we do have some outliers to 
#    the right of the curve, which can
#    be seen in the boxplot as well as 
#    on the probability distribution plot.
#    ''',
#    x=0, y=0.85,
#    color=txt_color
#)

fig.text(
    s='Box-Plot', rotation=90, 
    x=0.60, y=0.80,fontsize=20,
    color=txt_color , fontweight='bold'
)

fig.text(
    s='Probability-Plot', rotation=90, 
    x=0.60, y=0.175,
    color=txt_color,fontsize=20,fontweight='bold'
)

# separation lines, attached with add_artist because `fig.lines` can no longer be
# mutated like a list on recent Matplotlib releases
sl1 = lines.Line2D(xdata=[0.63,0.63], ydata=[0.05,0.5], lw=1, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
sl2 = lines.Line2D(xdata=[0.63,0.63], ydata=[0.6,0.95], lw=1, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
for separator in (sl1, sl2):
    fig.add_artist(separator)

# despine
for spine in ['top','left','right','bottom']:
    ax0.spines[spine].set_visible(False)
    ax1.spines[spine].set_visible(False)
    ax2.spines[spine].set_visible(False)

# show
plt.show()

In [None]:
# create a helper function
def group_df(data:pd.DataFrame, col:str) -> pd.DataFrame:
    """Summarise `data` per distinct value of `col`.

    Parameters
    ----------
    data : frame containing `col` plus the 'Title', 'Runtime' and 'RT_comb_score' columns.
    col : name of the column to group on.

    Returns
    -------
    One row per group with columns [col, 'Count', 'MeanRuntime', 'MeanScore'],
    where Count is the number of non-null titles and the other two are group means.
    Rows are ordered by Count, largest first, with a fresh integer index.
    """
    summary = data.groupby(col).agg(
        Count=('Title', 'count'),
        MeanRuntime=('Runtime', 'mean'),
        MeanScore=('RT_comb_score', 'mean'),
    )
    ordered = summary.sort_values(by='Count', ascending=False)
    return ordered.reset_index()

In [None]:
# create grouped dataframes for analysis
# group_df sorts by title count (descending), so head(5) keeps the five largest groups.
# .copy() makes each result an independent frame, so adding a column below does not
# write into a slice of the grouped table.
data_genre = group_df(data, 'Genre').head(5).copy()
data_language = group_df(data, 'Lang').head(5).copy()

# calculate ratio for alpha values: each genre's share of the top-5 title count.
# The total is computed once instead of once per row.
data_genre['Ratio'] = data_genre['Count'] / data_genre['Count'].sum()

In [None]:
# Number of rows per (release year, title) pair, largest first ...
title_year_counts = data.groupby(['Rel_year', 'Title']).size()
title_year_counts = title_year_counts.sort_values(ascending=False)

# ... flattened to columns, keeping only the first row seen for each title.
temp = title_year_counts.reset_index(name='count').drop_duplicates(subset='Title')

In [None]:
# Titles per release year as a two-column frame: Rel_year, Count.
tmp = (
    temp.groupby('Rel_year')['Title']
    .count()
    .rename('Count')
    .reset_index()
)
tmp.columns

In [None]:
# Work on a copy so the filtering in the next cell leaves `tmp` untouched.
data_time = tmp.copy()

In [None]:
# basic overview how many titles over time
data_time = data_time[data_time['Rel_year'] <= 2022]
# Only has an effect if the count column is still called 'Title'.
data_time = data_time.rename(columns={'Title':'Count'})
sum_titles = data_time['Count'].sum()

# plot
fig, ax = plt.subplots(figsize=(12,6))
fig.patch.set_facecolor(bg_color)
ax.set_facecolor(bg_color)

# x and y must be positional: the previous `plot(data=..., y1=...)` call carried no
# positional data, so Matplotlib returned without drawing the outline.
ax.plot(data_time['Rel_year'], data_time['Count'], color=colors[4], lw=0.5)
ax.fill_between(x=data_time['Rel_year'], y1=0, y2=data_time['Count'], color=colors[0], alpha=0.85)

ax.axhline(y=0, color=colors[4], lw=2, alpha=1)
ax.set_xlim(data_time['Rel_year'].min(), data_time['Rel_year'].max())

ax.yaxis.tick_right()
ax.tick_params(axis='both', which='both', length=0)

# Text & Titles
fig.text(
    s='Number of Titles over Time (until 2022)',
    x=0, y=0.975,
    color=txt_color,
    fontsize=25, fontweight='bold'
)

#fig.text(
#    s='''
#    The amount of titles added by Netflix
#    has steadily risen increased over the years. 
#    Note that 2022 data only includes Q1 releases
 #   ''',
 #   x=-0.01, y=0.85,
 #   color=txt_color
#)

fig.text(
    s='Total Movies:',
    x=0.770, y=0.97,
    color=txt_color,
    fontsize=15,
)

fig.text(
    s=sum_titles,
    x=0.85, y=0.93,
    color=txt_color,
    fontsize= 20,fontweight='bold'
)

# separation line: created but intentionally not attached to the figure
# (the attach step below is disabled)
sl1 = lines.Line2D(xdata=[0.75,0.75], ydata=[0.9,1], lw=1, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
#sl2 = lines.Line2D(xdata=[0.75,0.80], ydata=[0.9,0.92], lw=1, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
#fig.lines.extend([sl1,sl2])

# despine
for spine in ['top','left','right','bottom']:
    ax.spines[spine].set_visible(False)

plt.show()

There are 622 films represented in the dataset and 98 columns.

In [None]:
!pip install circlify

In [None]:
# figure, grid
# ax0 = runtime KDE (top left), ax1 = runtime box plot (bottom left),
# ax2 = runtime vs. quality score scatter (right, full height).
fig = plt.figure(tight_layout=True, figsize=(15,10))
gs = gridspec.GridSpec(nrows= 2, ncols=2)

fig.patch.set_facecolor(bg_color)

ax0 = fig.add_subplot(gs[0,0])
ax1 = fig.add_subplot(gs[1,0])
ax2 = fig.add_subplot(gs[:,1])

# plots
# `fill=True` replaces the deprecated `shade=True`.
ax0 = sns.kdeplot(
    data=data, x='Runtime', ax=ax0,
    fill=True, color=colors[0],
    edgecolor=colors[4], lw=1, alpha=0.8
)

ax1.boxplot(
    data=data, x='Runtime',
    vert=False, patch_artist=True,
    boxprops=dict(facecolor=colors[4], color='#fff', lw=0),
    whiskerprops=dict(color='gray', lw=1, ls='--'),
    capprops=dict(color='gray', lw=1, ls='--'),
    medianprops=dict(color='#fff', lw=2),
    flierprops=dict(markerfacecolor=colors[0],alpha=0.75)
)

# Marker area scales with the runtime itself.
ax2.scatter(
    y=data['Runtime'], x=data['RT_comb_score'],
    color=colors[3], alpha=0.5, s=1*data['Runtime']
)

# Text & Titles
fig.text(
    s='Runtime - Distribution and Relation with Quality Score',
    x=0, y=1.1,
    color=txt_color,
    fontsize=25, fontweight='bold'
)

#fig.text(
#    s='''
#    as we can see the runtime is negatively skewed with
#    outliers to the left. From the scatterplot we can tell
#    that there is no relation between runtime and Quality Score.
#    ''',
#    x=0, y=1.02,
#    color=txt_color
#)

fig.text(
    s='''
    Runtime vs. Quality Score
    ''',
    x=0.5, y= 1,
    color=txt_color,fontsize= 20,fontweight='bold'
)

fig.text(
    s='''
    Runtime Distribution
    ''',
    x=0, y= 1,
    color=txt_color,fontsize= 20,fontweight='bold'
)

fig.text(
    s='''
    Runtime Outliers
    ''',
    x=0, y=0.34,
    color=txt_color,fontsize= 20,fontweight='bold'
)

# separation lines, attached with add_artist because `fig.lines` can no longer be
# mutated like a list on recent Matplotlib releases
sl1 = lines.Line2D(xdata=[0.4,0.4], ydata=[1,1.05], lw=1, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
sl2 = lines.Line2D(xdata=[0,0.0], ydata=[1,1.05], lw=1, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
sl3 = lines.Line2D(xdata=[0,0.0], ydata=[0.35,0.4], lw=1, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
for separator in (sl1, sl2, sl3):
    fig.add_artist(separator)

# ax colors
for panel in (ax0, ax1, ax2):
    panel.set_facecolor(bg_color)

# labels & ticks
ax0.set_xlabel('')
ax0.set_ylabel('')
ax0.set_yticks([])

ax1.set_yticks([])
ax1.set_xticks([])

#ax2.set_yticks([])

ax0.tick_params(length=0, colors=txt_color)
#ax2.tick_params(length=0, colors=txt_color)

# despine
for spine in ['top','left','right','bottom']:
    ax1.spines[spine].set_visible(False)
    
for spine in ['top','left','right']:
    ax0.spines[spine].set_visible(False)
    ax2.spines[spine].set_visible(False)

ax2.spines['bottom'].set_color(txt_color)
ax2.spines['bottom'].set_alpha(0.25)

plt.show()

In [None]:
import circlify

# Two panels: ax0 = one packed circle per top-5 genre (sized by title count),
# ax1 = mean runtime per genre drawn as a lollipop chart.
fig = plt.figure(tight_layout=True, figsize=(15,10))
gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[1.5,0.5])

fig.patch.set_facecolor(bg_color)
fig.subplots_adjust(wspace=1, right=2)

ax0 = fig.add_subplot(gs[0,0])
ax1 = fig.add_subplot(gs[0,1])

ax0.set_facecolor(bg_color)
ax1.set_facecolor(bg_color)

# create circles based on title count
circles = circlify.circlify(
    data=data_genre['Count'].tolist(),
    show_enclosure=False,
    target_enclosure=circlify.Circle(x=0,y=0,r=1)
)

# find and set limit: the furthest extent of any circle from the origin
lim = max(
    max(
        abs(circle.x) + circle.r,
        abs(circle.y) + circle.r
    )
    for circle in circles
)

ax0.set_xlim(-lim, lim)
ax0.set_ylim(-lim, lim)

# labels, reversed to pair with `circles`
# NOTE(review): this assumes circlify returns circles smallest-first while data_genre
# is sorted largest-first; confirm against the circlify documentation.
labels = data_genre['Genre'][::-1]
scores = data_genre['MeanScore'][::-1]
ratios = data_genre['Ratio'][::-1]

# print circles
for label, score, ratio, circle in zip(labels, scores, ratios, circles):
    x,y,r = circle
    # Opacity grows with the genre's share; capped at 1.0 because Matplotlib
    # rejects alpha values above 1 (reached when one genre holds over half the titles).
    opacity = min(1.0, 1*ratio+0.5)
    ax0.add_patch(
        plt.Circle(
            (x,y), r, 
            alpha=opacity, lw=1, 
            fill=True, facecolor=colors[0]
            )
        )
    # Text is positional: the keyword was renamed from `s` to `text` in Matplotlib 3.3.
    ax0.annotate(
        f"{label}\n{np.round(score,1)}",
        xy=(x,y),
        va='center',ha='center', color='#fff'
    )

# average runtime per genre
# Right edge of the runtime axis, computed once and reused for the stem fractions below.
runtime_axis_max = data_genre['MeanRuntime'].max()+10
ax1.set_xlim(0, runtime_axis_max)

# Both scatter layers are bound to ax1 explicitly instead of relying on the current axes.
ax1 = sns.scatterplot(
    data=data_genre, x=10, y='Genre', color='#000', s=200, ax=ax1
)
ax1 = sns.scatterplot(
    data=data_genre, x='MeanRuntime', y='Genre', color=colors[0], s=2e3, ax=ax1
)

for genre, runtime in zip(data_genre['Genre'], data_genre['MeanRuntime']):
    # axhline takes xmin/xmax as fractions of the axis width, hence the division.
    ax1.axhline(
        y=genre, 
        xmin=10/runtime_axis_max, 
        xmax=runtime/runtime_axis_max,
        color=txt_color, zorder=0
    )

    ax1.annotate(
        f"{int(runtime)}\nmin",
        xy=(runtime, genre),
        va='center', ha='center',
        color='#fff'
    )

ax1.set_xticks([])
ax1.set_xlabel('')
ax1.set_ylabel('')
ax1.tick_params(axis='both', which='both', length=0)

# despine
for spine in ['top','left','right','bottom']:
    ax1.spines[spine].set_visible(False)

ax0.axis('off')

# Text & Titles
fig.text(
    s='TOP 5 - Content Genres',
    x=0, y=0.975,
    color=txt_color,
    fontsize=17, fontweight='bold'
)

fig.text(
    s='''
    by number of titles (size)
    average score and runtime
    ''',
    x=-0.01, y=0.925,
    color=txt_color
)

# NOTE(review): the title counts in this caption are hard-coded; re-check them when the data changes.
fig.text(
    s='''
    Dramas and Documentaries are  
    the biggest (44 and 36 Titles)
    in size and highest in the 
    quality score (on average).
    ''',
    x=0.51, y=0.7,
    color=txt_color,
    fontsize=9,alpha=0.5
)

fig.text(
    s='avg. Runtime',
    rotation=90,
    x=0.665, y=0.875,
    color=txt_color,
    fontsize=9,alpha=0.5
)

# separation lines, attached with add_artist because `fig.lines` can no longer be
# mutated like a list on recent Matplotlib releases
sl1 = lines.Line2D(xdata=[0.525,0.525], ydata=[0.68,0.78], lw=2, alpha=0.5, color='#aeaeae', transform=fig.transFigure, figure=fig)
sl2 = lines.Line2D(xdata=[0.675,0.675], ydata=[0.05,0.95], lw=1, alpha=0.25, color='#aeaeae', transform=fig.transFigure, figure=fig)
for separator in (sl1, sl2):
    fig.add_artist(separator)

plt.show()

In [None]:
# prepare data for top genre violin plot
cols = list(data_genre['Genre'].value_counts().index)

# Flag rows whose genre is one of the top genres (1) or not (0).
data_top_genre = data.copy()
data_top_genre['TopGenre'] = data_top_genre['Genre'].isin(cols).astype(int)

# NOTE(review): this rebinds `data` to the top-genre subset, so every later cell that
# reads `data` sees only those rows until the pickle is loaded again.
is_top_genre = data_top_genre['TopGenre'] == 1
data = data_top_genre[is_top_genre]

# violin plot: quality-score distribution per top genre
fig, ax = plt.subplots(figsize=(15,6))

fig.patch.set_facecolor(bg_color)
ax.set_facecolor(bg_color)

sns.violinplot(data=data, x='Genre', y='RT_comb_score', palette=colors, saturation=0.5, ax=ax)

ax.set_ylabel('')
ax.set_xlabel('')
ax.tick_params(axis='x',length=0)

# despine (the bottom spine is kept as a baseline)
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)

# Text & Titles
fig.text(
    s='Genres versus Quality Score',
    x=0.1, y=1,
    color=txt_color,
    fontsize=25, fontweight='bold'
)

#fig.text(
#    s='''
#    Dramas have are the highest scoring genre,
#    interestingly have more postive outliers.  
#    ''',
#    x=0.09, y=1.02,
#    color=txt_color
#)

plt.show()

In [None]:
!pip install squarify
import squarify

In [None]:
# Three panels: ax0 = treemap of the top-5 languages by title count (top row),
# ax1 = mean score per language (bottom left), ax2 = mean runtime per language (bottom right).
fig = plt.figure(figsize=(15,10))

gs = gridspec.GridSpec(nrows=2, ncols=2, height_ratios=[3,1])

ax0 = fig.add_subplot(gs[0,:])
ax1 = fig.add_subplot(gs[1,0])
ax2 = fig.add_subplot(gs[1,1])

fig.patch.set_facecolor(bg_color)
fig.subplots_adjust(wspace=0.2, hspace=0.1)
for panel in (ax0, ax1, ax2):
    panel.set_facecolor(bg_color)

# create labels for treemap
labels = [label +'\n#'+ str(count) +' Titles' for label, count in zip(data_language['Lang'],data_language['Count'])]

# create treemap
squarify.plot(
    sizes=data_language['Count'], label=labels, color=colors, 
    pad=True, ax=ax0, text_kwargs=dict(color='white', fontsize=16, fontweight='light'))

# average scores (this comment and the next were swapped relative to the data they label)
ax1.bar(
    x=data_language['Lang'], height=data_language['MeanScore'],
    color='#000', edgecolor='#000', lw=1, alpha=0.45
)


ax1.tick_params(length=0)
ax1.set_yticks([])
ax1.set_ylabel('')

# average runtime
ax2.bar(
    x=data_language['Lang'], height=data_language['MeanRuntime'],
    color='#000', edgecolor='#000', lw=1, alpha=0.45
)

ax2.tick_params(length=0)
ax2.set_yticks([])
ax2.set_ylabel('')

# annotations: value labels written inside each bar.
# Text is positional: the keyword was renamed from `s` to `text` in Matplotlib 3.3.
for lang, mean_score, mean_runtime in zip(
    data_language['Lang'], data_language['MeanScore'], data_language['MeanRuntime']
):
    ax1.annotate(
        f" {np.round(mean_score,2)} ",
        xy=(lang, 0.1),
        rotation=90,
        va='center', ha='center',
        color='#fff', fontsize=12
    )
    ax2.annotate(
        f" {int(mean_runtime)} min",
        xy=(lang, 60),
        rotation = 90,
        va='center', ha='center',
        color='#fff', fontsize=12
    )
    
# despine
ax0.axis('off')
for spine in ['top','left','right']:
    ax1.spines[spine].set_visible(False)
    ax2.spines[spine].set_visible(False)

# Text & Titles
fig.text(
    s='TOP 5 - Languages by number of titles (size), average score and runtime',
    x=0, y=0.975,
    color=txt_color,
    fontsize=25, fontweight='bold'
)

#fig.text(
#    s='''
#    by number of titles (size)
#    average score and runtime
#    ''',
#    x=0.09, y=0.925,
#    color=txt_color, font = 16
#)

fig.text(
    s='Avg. Score',
    rotation=90,
    x=0.1075, y=0.17,
    color=txt_color,
    fontsize=16,alpha=0.5,fontweight='bold'
)

fig.text(
    s='Avg. Runtime',
    rotation=90,
    x=0.5275, y=0.17,
    color=txt_color,
    fontsize=16,alpha=0.5, fontweight='bold'
)

# separation lines, attached with add_artist because `fig.lines` can no longer be
# mutated like a list on recent Matplotlib releases
sl1 = lines.Line2D(xdata=[0.125,0.125], ydata=[0.15,0.3], lw=1, alpha=0.25, color='#aeaeae', transform=fig.transFigure, figure=fig)
sl2 = lines.Line2D(xdata=[0.545,0.545], ydata=[0.15,0.3], lw=1, alpha=0.25, color='#aeaeae', transform=fig.transFigure, figure=fig)
for separator in (sl1, sl2):
    fig.add_artist(separator)

plt.show()

In [None]:
# Mean quality score per release month. The score column is selected before
# aggregating so non-numeric columns are never averaged (which raises on pandas 2).
data_month = data.groupby('Rel_month')['RT_comb_score'].mean().reset_index()

# Categorical plots place categories at 0, 1, 2, ...; the same positions are used for
# the average line and band, sized from the data instead of assuming 12 months.
month_pos = np.arange(data_month.shape[0])

fig, ax = plt.subplots(figsize=(15,6))

fig.patch.set_facecolor(bg_color)
ax.set_facecolor(bg_color)

# x and y must be positional: the previous `plot(data=..., y1=...)` call carried no
# positional data, so the average line was never drawn.
ax.plot(month_pos, data_month['RT_comb_score'], color=colors[4], lw=10)
ax.fill_between(x=month_pos, y1=data_month['RT_comb_score'], color=colors[0], alpha=0.05, label='Avg. Score')

# `order` pins the swarm categories to the same month order as data_month,
# so the points line up with the average line.
sns.swarmplot(
    data=data, x='Rel_month', y='RT_comb_score', palette=colors,
    order=data_month['Rel_month'].tolist(), ax=ax
)

ax.set_ylabel('')
ax.set_xlabel('')
ax.set_ylim(0,1.1)
ax.tick_params(axis='both',length=0)

# despine
for spine in ['top','left','right']:
    ax.spines[spine].set_visible(False)

# Text & Titles
fig.text(
    s='Quality Score by Month',
    x=0.1, y=1,
    color=txt_color,
    fontsize=25, fontweight='bold'
)

#fig.text(
#    s='''
#    The average score is distributed
#    evenly from Jan to Sep, however from
#    Oct (Q3) onwards we can see an 
#    improvement in the score.
#    ''',
#    x=0.09, y=0.94,
#    color=txt_color
#)

# 'upper center' is the valid Matplotlib location string ('Top center' is not).
ax.legend(loc='upper center', frameon=False, fontsize=12)
plt.show()

In [None]:
# Reload the full table: the EDA section above rebinds `data` to the top-genre subset.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted location.
data = pd.read_pickle(r'/content/gdrive/MyDrive/MSDS_CAPSTONE/Data/master_data_v3.pkl')

In [None]:
# Total number of missing cells across the whole table.
data.isnull().sum().sum()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# (rows, columns) of the reloaded table.
data.shape

## Dropping columns that will not be used or are collinear

In [None]:
# Columns removed before modelling, built in three groups.
general_cols = ['Lang_eng', 'Genre', 'Rel_year', 'Rating']
text_cols = ['RT_url', 'IMDb_url', 'Plot_summ', 'Plot_keywords']

# Each credited person (3 directors, 3 writers, 5 actors) contributes four columns:
# the name itself plus its `_films`, `_film_scores` and `_qual` companions.
people = (
    [f'Dir{i}' for i in range(1, 4)]
    + [f'Writer{i}' for i in range(1, 4)]
    + [f'Actor{i}' for i in range(1, 6)]
)
people_cols = [
    person + suffix
    for person in people
    for suffix in ('', '_films', '_film_scores', '_qual')
]

df_sub = data.drop(columns=general_cols + people_cols + text_cols)

In [None]:
# Remaining columns with their dtypes and non-null counts.
df_sub.info()

## Missing value treatment

In [None]:
# dtypes treated as numeric; 'uint8' is listed so dummy/indicator columns of that dtype are included.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64','uint8']
numeric_df = df_sub.select_dtypes(include=numerics)
# Missing values per numeric column.
numeric_df.isnull().sum()

In [None]:
# A missing flag is treated as 0. Each column is filled from itself: the second line
# previously copied 'Based_on_existing_material' into 'Based_on_real_life', which
# discarded the real values of that column.
numeric_df['Based_on_existing_material'] = numeric_df['Based_on_existing_material'].fillna(0)
numeric_df['Based_on_real_life'] = numeric_df['Based_on_real_life'].fillna(0)

In [None]:
# Missing values per numeric column after filling the two 'Based_on_*' flags.
numeric_df.isnull().sum()

In [None]:
# Label unknown actor/director/writer values with the sentinel -9999
# (every remaining NaN in the numeric frame is filled with it).
numeric_df= numeric_df.fillna(-9999)
# Should now be 0.
numeric_df.isnull().sum().sum()

In [None]:
# Minimum of 'Dir_avg_qual' after the sentinel fill above.
numeric_df['Dir_avg_qual'].min()

### One-hot encoding of the categorical columns

In [None]:
# Object-dtype (string) columns, handled separately from the numeric ones.
char = ['object']
char_df = df_sub.select_dtypes(include=char)
#char_df = char_df.fillna('NA')
# Total number of missing cells in the string columns.
char_df.isnull().sum().sum()

In [None]:
# Missing values per string column.
char_df.isnull().sum()

In [None]:
# Number of rows per language value.
char_df['Lang'].value_counts()

In [None]:
# One indicator column per language ('Lang_<value>'); the other string columns pass through unchanged.
# dtype is pinned to uint8 because newer pandas defaults to bool indicators, which the
# `numerics` dtype filter used later would silently drop from the model inputs.
char_df1 = pd.get_dummies(char_df, columns=['Lang'], dtype='uint8')

In [None]:
# Column names after encoding.
char_df1.columns

In [None]:
# dtypes and non-null counts after encoding.
char_df1.info()

In [None]:
# Total number of missing cells after encoding.
char_df1.isnull().sum().sum()

In [None]:
# (rows, columns) after encoding.
char_df1.shape

Reviewing the distributions of the encoded character columns and the numeric columns

In [None]:
# One 20-bin histogram per column of numeric_df.
numeric_df.hist(bins =20, figsize =(20,20), color = 'r')
plt.show()

In [None]:
# Same histogram grid for the encoded frame.
char_df1.hist(bins =20, figsize =(20,20), color = 'r')
plt.show()

In [None]:
#final dataset post missing value treatment
# Encoded string columns and numeric columns placed side by side (axis=1 aligns on the row index).
final_df = pd.concat([char_df1, numeric_df], axis = 1)
final_df.shape

### Correlation study

### Correlation limiting to top 5 Languages = Eng, Spanish, Hindi, French, Korean

In [None]:
#keeping only top 5 lang
# Drops 'Rating_reason' and every language indicator listed here.
# Some names end in a space ('Lang_Georgian ', 'Lang_Hungarian ', 'Lang_Khmer ').
# NOTE(review): these presumably mirror raw language values with a trailing space; drop() raises
# a KeyError if any listed name is absent, so verify them against char_df1.columns.

final_df1 = final_df.drop(columns = ['Rating_reason',  'Lang_Arabic', 'Lang_Cantonese',
       'Lang_Chinese', 'Lang_Danish', 'Lang_Dutch', 
       'Lang_Filipino',  'Lang_Georgian ', 'Lang_German',
        'Lang_Hungarian ', 'Lang_Indonesian', 'Lang_Italian',
       'Lang_Japanese', 'Lang_Khmer ', 'Lang_Malay',
       'Lang_Malayalam', 'Lang_Mandarin', 'Lang_Marathi', 'Lang_Norwegian',
       'Lang_Polish', 'Lang_Portuguese', 'Lang_Romanian', 'Lang_Russian', 'Lang_Swedish', 'Lang_Tamil', 'Lang_Thai',
       'Lang_Ukranian', 'Lang_Wolof'])

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only. Any remaining
# string column is excluded explicitly: older pandas skipped such columns silently,
# while pandas 2 raises on them.
corrMatrix = final_df1.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heatmap of the correlation matrix.
plt.figure(figsize= (20,20))
sns.heatmap(corrMatrix, annot = True , cmap = "Reds")
plt.show()

## Model Build
### Creating the dataframe suitable for model build

In [None]:
# Numeric dtypes accepted as model inputs: signed ints and floats of 16/32/64 bits, plus uint8.
numerics = (
    [f'int{bits}' for bits in (16, 32, 64)]
    + [f'float{bits}' for bits in (16, 32, 64)]
    + ['uint8']
)
# Keep only those columns of the top-5-language frame and replace any remaining NaN with 0.
numeric_df = final_df1.select_dtypes(include=numerics).fillna(0)

In [None]:
# Should be 0 after the fill above.
numeric_df.isnull().sum().sum()

In [None]:
# Columns available for modelling (targets still included).
(numeric_df.columns)

In [None]:
# Remove the score columns (including the target RT_comb_score) and the review count,
# leaving only the predictors.
numeric_df1 = numeric_df.drop(columns = ['RT_score', 'RT_comb_score','RT_rev_cnt'])
numeric_df1.columns
# Number of predictor columns. Stored as `n_features` rather than `len`: rebinding
# `len` to an int hides the builtin, so any later `len(...)` call (or re-running this
# cell) fails with "'int' object is not callable".
n_features = numeric_df1.shape[1]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor with StandardScaler, then wrap the resulting array
# back into a frame carrying the original column names.
# NOTE(review): the scaler is fitted on all rows, before any train/validation split.
scalar = StandardScaler()
numeric_df2 = pd.DataFrame(
    scalar.fit_transform(numeric_df1),
    columns=numeric_df1.columns,
)

### Multicollinearity check

In [None]:
#numeric_df3 = numeric_df2.drop(columns = ['Dir1_qual','Dir2_qual','Dir3_qual'])
# (rows, predictors) of the standardised frame.
numeric_df2.shape

In [None]:
#Checking for multicollinearity for the numeric variables
from statsmodels.stats.outliers_influence import variance_inflation_factor

# The design matrix is extracted once and reused for every feature.
design_matrix = numeric_df2.values

# VIF dataframe: one row per predictor
vif_data = pd.DataFrame()
vif_data["feature"] = numeric_df2.columns
# calculating VIF for each feature. The range follows the actual number of columns
# instead of a hard-coded 44, so it stays correct when the feature set changes.
vif_data["VIF"] = [
    variance_inflation_factor(design_matrix, i)
    for i in range(design_matrix.shape[1])
]
print(vif_data)

In [None]:
# Persist the VIF table to the current working directory.
vif_data.to_csv('vif.csv')

In [None]:
# Summary statistics of the VIF values.
vif_data['VIF'].describe()

In [None]:
# Keep only the features whose VIF does not exceed the cut-off of 12.
to_keep= vif_data[vif_data['VIF'] <= 12] # ideal value 10-12
to_keep.shape

In [None]:
# Names of the features that passed the VIF filter, as a plain Python list.
cols = to_keep['feature'].tolist()
#cols

### X and y separation

In [None]:
# Target: the (unscaled) combined score. Predictors: the standardised features that passed the VIF filter.
y = numeric_df['RT_comb_score']
#y1 = numeric_df['RT_score']
X = numeric_df2[cols]

### Checking for correlation in the remaining Xs

In [None]:
# Correlation between the retained predictors, shown as a heatmap.
corr = X.corr()
plt.figure(figsize= (10,10))
sns.heatmap(corr, cmap = 'RdGy')
plt.show()

## PCA 

In [None]:
from sklearn.decomposition import PCA

# Up to 12 components, but never more than X can supply: PCA raises when
# n_components exceeds min(n_samples, n_features), e.g. when few features survive the VIF filter.
n_pca_components = min(12, X.shape[0], X.shape[1])
pca = PCA(n_components=n_pca_components)
pca.fit(X)

# Share of variance explained by each component, and the running total in percent.
var= pca.explained_variance_ratio_
var1=np.cumsum(np.round(var, decimals=4)*100)
print(var1)

In [None]:
pca = PCA(.95) #Keep Principal Components that retain 95% variance
# Project X onto the retained components; the array is displayed below.
ProjectedDf = pca.fit_transform(X)
ProjectedDf

In [None]:
# Number of principal components retained by the 95%-variance PCA fitted above.
# The fitted estimator exposes this directly. The previous np.unique/count_nonzero
# construction counted the entries of both returned arrays (values and counts), giving
# twice the number of distinct variances, and stored the result in `len`, hiding the builtin.
n_components_kept = pca.n_components_

## Testing with Regression

## Train test split

In [None]:
#Splitting train dataset into train and validation datasets
# 70% train / 30% validation; random_state fixes the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=101)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
linear_regressor = LinearRegression()  # create object for the class
linear_regressor.fit(X_train, y_train)  # perform linear regression
# Predictions on the training rows (in-sample), used for the scatter plot below.
Y_pred = linear_regressor.predict(X_train)  # make predictions

In [None]:
# Actual (x) vs. predicted (y) score on the training rows.
plt.scatter(y_train, Y_pred, color = 'r')
plt.show()

In [None]:
# (train score, validation score) as returned by LinearRegression.score.
linear_regressor.score(X_train, y_train) , linear_regressor.score(X_valid, y_valid)

The score is the coefficient of determination $R^2 = 1 - \frac{u}{v}$, where $u$ is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and $v$ is the total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get an $R^2$ score of 0.0.

## Transforming y

In [None]:
# Summary statistics of the target, used to pick the class threshold below.
y.describe()

In [None]:
#RT_comb_score
# Binary target: 1 when the combined score is at least 0.4, otherwise 0 (1-D integer array).
meets_threshold = (y >= 0.4).to_numpy()
y_class = meets_threshold.astype(int)
# Class labels with their frequencies.
np.unique(y_class, return_counts=True)

In [None]:
# NOTE(review): hard-coded ratio, presumably one class count from the cell above divided by
# the 622 rows; recompute from y_class if the data changes.
241/622

## Variable Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Small forest used only to rank features. The seed is fixed (same value as the
# train/validation splits) so the importance ranking is identical on every run.
clf = RandomForestClassifier(n_estimators=20, random_state=101)

# Fit on all rows against the binary target.
clf.fit(X, y_class)

In [None]:
# Importances from the fitted classifier, labelled by feature name and sorted largest first.
feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10,6))
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.tight_layout()

## Train test split

In [None]:
# 70/30 split on the continuous target for the regression models below (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y , test_size=0.3, random_state=101)

In [None]:
# fit decisiontree based models
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# (label, unfitted estimator) pairs, all with library-default hyper-parameters.
models = [
    ('lgbm',LGBMRegressor()),
    ('catb',CatBoostRegressor(verbose=0)),
    ('xgb',XGBRegressor(verbosity=0)),
    ('RF',RandomForestRegressor(verbose=0))
]

# label -> validation RMSE
results = dict()

for name, model in models:
    model.fit(X_train, y_train)
    y_hat = model.predict(X_valid)
    # RMSE is the square root of the MSE. It is taken explicitly because the
    # `squared=False` keyword is deprecated and absent from recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_valid, y_hat))
    results[name] = rmse 

In [None]:
# One-row table: a column per model holding its validation RMSE.
df_results = pd.DataFrame([results])
df_results

In [None]:
# Actual vs. predicted on the validation rows. `y_hat` holds the predictions of the
# last model fitted in the loop above (the 'RF' entry).
plt.scatter(y_valid, y_hat, color = 'r')
plt.show()

## Random forest regressor with hyper parameter tuning 
This grid search takes a long time to run; don't run it unless necessary.

In [None]:
#Hyperparameter tuning 
from sklearn.ensemble import RandomForestRegressor
# Seeded so the search picks the same best estimator on every run.
rf = RandomForestRegressor(random_state=101)

# define gridsearch parameters
# max_features: 1.0 (use every feature) replaces the string "auto", which meant the
# same thing for a regressor but is no longer accepted by recent scikit-learn releases.
param_grid = {
              'n_estimators': [100,300, 500, 700],
              'max_features': [1.0, "sqrt", "log2"],
              'max_depth': [3, 5, 7],
              'min_samples_split': [8, 10, 12],
              'min_samples_leaf': [8, 10, 12]}

# instantiate gridsearchcv class: 5-fold CV over the full grid, using all cores
rfgs = GridSearchCV(estimator=rf,
                    param_grid=param_grid,
                    cv=5,
                    verbose=3,
                    n_jobs=-1)

# fit model
rfmodel= rfgs.fit(X_train, y_train)

In [None]:
#best fit model
# The estimator with the best cross-validated score from the grid search.
best_rfgs = rfgs.best_estimator_
best_rfgs

In [None]:
# In-sample (training) score of the tuned forest.
best_rfgs.score(X_train,y_train)

In [None]:
# Predictions of the tuned forest on the training and validation rows.
y_hat_train = best_rfgs.predict(X_train)
y_hat_valid = best_rfgs.predict(X_valid)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE is the square root of the MSE. It is taken explicitly because the
# `squared=False` keyword is deprecated and absent from recent scikit-learn releases.
print('RMSE for Random forest regressor Train :', np.sqrt(mean_squared_error(y_train, y_hat_train)))
print('RMSE for Random forest regressor Validation:', np.sqrt(mean_squared_error(y_valid, y_hat_valid)))

### Feature Importance - Impurity based

In [None]:
# Impurity-based importances of the tuned forest, one bar per training feature.
feature_names = X_train.columns
importances = best_rfgs.feature_importances_

# Error bars: spread of each feature's importance across the individual trees.
per_tree_importances = np.array(
    [estimator.feature_importances_ for estimator in best_rfgs.estimators_]
)
std = per_tree_importances.std(axis=0)

forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
from sklearn.inspection import permutation_importance
# Each feature is shuffled 10 times on the validation rows; the importance is the
# resulting drop in the estimator's score.
result = permutation_importance(
    best_rfgs, X_valid, y_valid, n_repeats=10, random_state=42, n_jobs=2
)

# Labelled with the columns of the frame that was actually permuted.
forest_importances = pd.Series(result.importances_mean, index=X_valid.columns)
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The estimator is a regressor, so the permuted metric is its R2 score, not accuracy.
ax.set_ylabel("Mean decrease in R2 score")
fig.tight_layout()
plt.show()

## Shap values

In [None]:
import shap
# SHAP values of the tuned forest for every training row, shown as a summary plot
# limited to at most 50 features.
explainer = shap.TreeExplainer(best_rfgs)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, features=X_train, feature_names=X_train.columns, max_display= 50)

## K Nearest Neighbour Regressors

In [None]:
# Re-split 50/50 for the KNN experiments. This overwrites the 70/30 split used by the models above.
X_train, X_valid, y_train, y_valid = train_test_split(X, y , test_size=0.5, random_state=101)

In [None]:
# `sqrt` and `mean_squared_error` are imported here so the cell runs on a fresh kernel
# (`sqrt` was previously only imported by a later cell).
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# Baseline KNN regressor with k=8, scored by RMSE on the validation rows.
knn = KNeighborsRegressor(n_neighbors = 8)
knnmodel = knn.fit(X_train, y_train)
y_hat_valid = knnmodel.predict(X_valid)
error = sqrt(mean_squared_error(y_valid,y_hat_valid ))

In [None]:
# Validation RMSE of the k=8 model.
error

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error
# Imported directly: the `neighbors` module referenced before is never imported in this notebook.
from sklearn.neighbors import KNeighborsRegressor

rmse_val = [] #to store rmse values for different k
for K in range(1, 21):  # k = 1 .. 20
    model = KNeighborsRegressor(n_neighbors = K)

    model.fit(X_train, y_train)  #fit the model
    pred=model.predict(X_valid) #make prediction on test set
    error = sqrt(mean_squared_error(y_valid,pred)) #calculate rmse
    rmse_val.append(error) #store rmse values
    print('RMSE value for k= ' , K , 'is:', error)

In [None]:
#plotting the rmse values against k values
# The frame has a 0-based index, so position i on the x-axis corresponds to k = i + 1.
curve = pd.DataFrame(rmse_val) #elbow curve 
curve.plot()

In [None]:
from sklearn.model_selection import GridSearchCV
# Imported directly: the `neighbors` module referenced before is never imported in this notebook.
from sklearn.neighbors import KNeighborsRegressor

# Candidate neighbourhood sizes for the search.
params = {'n_neighbors':[2,3,4,5,6,7,8,9]}

knn = KNeighborsRegressor()

# 5-fold cross-validated search over k on the training rows.
model = GridSearchCV(knn, params, cv=5)
model.fit(X_train,y_train)
model.best_params_

In [None]:
# Predictions of the grid-searched KNN model on the training and validation rows.
y_hat = model.predict(X_train)
y_hat_valid = model.predict(X_valid)

In [None]:
# Actual vs. predicted score on the training rows.
plt.scatter(y_train, y_hat, color = 'r')
plt.show()

In [None]:
# Actual vs. predicted score on the validation rows.
plt.scatter(y_valid, y_hat_valid, color = 'r')
plt.show()

## Random Forest-Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# The classifier needs class labels, but y_train/y_valid from the previous split still
# hold the continuous score, which a classifier rejects. Re-split against the binary
# target `y_class` so the classification cells below train and evaluate on 0/1 labels.
X_train, X_valid, y_train, y_valid = train_test_split(X, y_class, test_size=0.3, random_state=101)

# Shallow forest (max_depth=3) with a fixed seed.
clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)

In [None]:
# In-sample (training split) predictions of the random forest classifier.
pred_x= clf.predict(X_train)

In [None]:
# Training-split precision, recall and F1, printed as percentages with two decimals.
for label, scorer in (
    ("Precision :", precision_score),
    ("Recall  :", recall_score),
    ("F1 Score :", f1_score),
):
    print(label, round(scorer(y_train, pred_x) * 100, 2))

In [None]:
# Confusion matrix on the training split.
# scikit-learn puts the true labels on the rows and the predictions on the columns,
# so the heatmap's y-axis is "Actual" and its x-axis is "Predicted"
# (the two labels were previously swapped).
cm = confusion_matrix(y_train, pred_x)
sns.heatmap(cm, annot=True, fmt='3.0f', cmap="Blues")
plt.title('Confusion Matrix for RF', y=1.05, size=15)
plt.ylabel('Actual Survival')
plt.xlabel('Predicted Survival')

In [None]:
# Validation-split predictions, then precision, recall and F1 as percentages.
pred_xt = clf.predict(X_valid)
for label, scorer in (
    ("Precision :", precision_score),
    ("Recall  :", recall_score),
    ("F1 Score :", f1_score),
):
    print(label, round(scorer(y_valid, pred_xt) * 100, 2))

In [None]:
# Confusion matrix on the validation split.
# Rows of scikit-learn's matrix are the true labels and columns the predictions,
# so y-axis = Actual and x-axis = Predicted (the labels were previously swapped).
cm = confusion_matrix(y_valid, pred_xt)
sns.heatmap(cm, annot=True, fmt='3.0f', cmap="Blues")
plt.title('Confusion Matrix for  RF', y=1.05, size=15)
plt.ylabel('Actual Survival')
plt.xlabel('Predicted Survival')

## Gradient Boosted Trees

In [None]:
# Grid search for a scikit-learn GradientBoostingClassifier.
from sklearn.ensemble import GradientBoostingClassifier

# The grid has 4 * 2 * 4 * 4 * 4 * 3 = 1536 combinations; with cv=5 that is 7680 fits.
# 'sqrt' replaces 'auto': for a gradient boosting classifier 'auto' meant sqrt(n_features),
# and newer scikit-learn releases reject 'auto'.
param_grid = {
              'n_estimators': [100, 300, 500, 700],
              'max_features': ['sqrt', 'log2'],
              'max_depth': [3, 5, 7, 10],
              'min_samples_split': [5, 8, 10, 12],
              'min_samples_leaf': [5, 8, 10, 12],
              'learning_rate': [0.01, 0.1, 0.5]}

# 5-fold CV over the full grid, using all available cores
gb = GridSearchCV(estimator=GradientBoostingClassifier(),
                    param_grid=param_grid,
                    cv=5,
                    verbose=20,
                    n_jobs=-1)

# fit on the training split; the best refit estimator is read from `gbmodel` afterwards
gbmodel= gb.fit(X_train, y_train)

In [None]:
# Best estimator found by the grid search stored in `gbmodel`.
# NOTE(review): despite the name, the grid search visible above this cell wraps
# scikit-learn's GradientBoostingClassifier, not XGBoost.
best_xgb = gbmodel.best_estimator_
best_xgb

In [None]:
# Horizontal bar chart of `best_xgb.feature_importances_`, largest first.
feature_imp = (
    pd.Series(best_xgb.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
fig.tight_layout()

In [None]:
# Validation accuracy of `best_xgb`, as a percentage with two decimals.
y_hat_valid = best_xgb.predict(X_valid)
acc_xgb = round(100 * accuracy_score(y_valid, y_hat_valid), 2)
print('The accuracy of the Gradient Boosted Trees is', acc_xgb)

In [None]:
# Validation confusion matrix for the gradient boosted classifier.
# Rows of scikit-learn's matrix are true labels and columns are predictions,
# so y-axis = Actual and x-axis = Predicted (the labels were previously swapped).
cm = confusion_matrix(y_valid, y_hat_valid)
sns.heatmap(cm, annot=True, fmt='3.0f', cmap="Blues")
plt.title('Confusion Matrix for Gradient Boosted Trees Classification', y=1.05, size=15)
plt.ylabel('Actual Survival')
plt.xlabel('Predicted Survival')
# Validation precision / recall / F1 as percentages
print("Precision :", round((precision_score(y_valid, y_hat_valid))*100,2))
print("Recall  :",round((recall_score(y_valid, y_hat_valid))*100,2))
print("F1 Score :",round((f1_score(y_valid, y_hat_valid))*100,2))

In [None]:
from sklearn.metrics import precision_recall_curve


def plot_precision_and_recall(precision, recall, threshold):
    """Draw precision and recall as functions of the decision threshold.

    `precision` and `recall` hold one more element than `threshold`, so their
    last element is dropped before plotting.
    """
    series = (
        (precision, "r-", "Precision"),
        (recall, "b", "Recall"),
    )
    for values, fmt, label in series:
        plt.plot(threshold, values[:-1], fmt, label=label, linewidth=5)
    plt.xlabel("Threshold", fontsize=19)
    plt.legend(loc="upper right", fontsize=19)
    plt.ylim([0, 1])


# Scores are the second column of predict_proba on the validation split.
y_scores = best_xgb.predict_proba(X_valid)[:, 1]
precision, recall, threshold = precision_recall_curve(y_valid, y_scores)

plt.figure(figsize=(14, 7))
plot_precision_and_recall(precision, recall, threshold)
plt.show()

In [None]:
!pip install shap

In [None]:
import shap

# SHAP values of `best_xgb` on the training features; summary plot capped at 50 features.
explainer = shap.TreeExplainer(best_xgb)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(
    shap_values,
    features=X_train,
    feature_names=X_train.columns,
    max_display=50,
)

In [None]:
# Bar-type SHAP summary plot of the same `shap_values` over the training features.
shap.summary_plot(shap_values, X_train, plot_type = 'bar')

# Binning y variable

In [None]:
#shap_test = explainer.shap_values(X_valid)

In [None]:
#shap_test_val = np.array(shap_test).reshape(-1)

In [None]:
!pip install catboost

## Testing with 10 bins

In [None]:
# Cut y into 10 quantile-based bins (interval labels) and count the rows per bin.
y_decile = pd.qcut(y, q=10, precision=0)
y_decile.value_counts()

In [None]:
# Add an integer decile code (labels=False gives bin indices) of RT_comb_score to numeric_df.
# Note: this writes a new column into numeric_df in place.
numeric_df['y_decile'] = pd.qcut(numeric_df['RT_comb_score'], labels =False, q=10, precision=0)
numeric_df['y_decile'].describe()
# The name `y_declie` (sic) is what a later split cell reads, so it is left unchanged.
y_declie= numeric_df['y_decile'].astype('int')

In [None]:
# The unfinished assignment `y_bin =` that stood here was a SyntaxError and stopped
# "Run All". It is disabled because no visible cell in this part of the notebook reads `y_bin`.
# TODO: define y_bin here if it is still needed, otherwise delete this cell.

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
# 80/20 shuffled train/validation split with the decile codes as target (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(X, y_declie, test_size=0.2, shuffle=True, random_state=1)

# One-hot encode every column of X; the encoder is fitted on the training split only,
# and categories unseen during fit are ignored when transforming the validation split.
# NOTE(review): this treats each distinct value of every column as a category —
# confirm that is intended if X contains continuous features.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_en = encoder.fit_transform(X=X_train)
X_valid_en = encoder.transform(X=X_valid)

## Testing with Regressors

In [None]:
# Fit four tree-based regressors on the one-hot encoded training split and
# compare their RMSE on the validation split.
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

models = [
    ('lgbm',LGBMRegressor()),
    ('catb',CatBoostRegressor(verbose=0)),
    ('xgb',XGBRegressor(verbosity=0)),
    ('RF',RandomForestRegressor(verbose=0))
]

results = dict()  # model name -> validation RMSE

for name, model in models:
    model.fit(X_train_en, y_train)
    y_hat = model.predict(X_valid_en)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed in
    # newer scikit-learn releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_valid, y_hat)))
    results[name] = rmse

In [None]:
# One-row DataFrame with one column per key of `results`.
df_results = pd.DataFrame([results])
df_results

Gradient boosting classifier with hyperparameter tuning. Note: this grid search is very slow and may time out, which makes it difficult to run.

In [None]:
# Grid search for a scikit-learn GradientBoostingClassifier.
from sklearn.ensemble import GradientBoostingClassifier

# The grid has 4 * 2 * 4 * 4 * 4 * 3 = 1536 combinations; with cv=5 that is 7680 fits.
# 'sqrt' replaces 'auto': for a gradient boosting classifier 'auto' meant sqrt(n_features),
# and newer scikit-learn releases reject 'auto'.
param_grid = {
              'n_estimators': [100, 300, 500, 700],
              'max_features': ['sqrt', 'log2'],
              'max_depth': [3, 5, 7, 10],
              'min_samples_split': [5, 8, 10, 12],
              'min_samples_leaf': [5, 8, 10, 12],
              'learning_rate': [0.01, 0.1, 0.5]}

# 5-fold CV over the full grid, using all available cores
gb = GridSearchCV(estimator=GradientBoostingClassifier(),
                    param_grid=param_grid,
                    cv=5,
                    verbose=20,
                    n_jobs=-1)

# fit on the training split
gbmodel= gb.fit(X_train, y_train)

In [None]:
# Gradient boosting classifier with hand-fixed hyperparameters (no search in this cell).
best_xgb = GradientBoostingClassifier(
    n_estimators=700,
    learning_rate=0.01,
    max_depth=5,
    max_features='log2',
    min_samples_split=12,
    min_samples_leaf=10,
)

In [None]:
# Fit the fixed-parameter classifier on the training split; `gbmodel` is rebound to the fit result.
gbmodel= best_xgb.fit(X_train, y_train)

In [None]:
# Validation accuracy of `best_xgb`, as a percentage with two decimals.
y_hat_valid = best_xgb.predict(X_valid)
acc_xgb = round(100 * accuracy_score(y_valid, y_hat_valid), 2)
print('The accuracy of the Gradient Boosted Trees is', acc_xgb)

In [None]:
# Horizontal bar chart of `best_xgb.feature_importances_`, largest first.
feature_imp = (
    pd.Series(best_xgb.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
fig.tight_layout()

In [None]:
# Validation confusion matrix for the gradient boosted classifier.
# Rows of scikit-learn's matrix are true labels and columns are predictions,
# so y-axis = Actual and x-axis = Predicted (the labels were previously swapped).
cm = confusion_matrix(y_valid, y_hat_valid)
sns.heatmap(cm, annot=True, fmt='3.0f', cmap="Blues")
plt.title('Confusion Matrix for Gradient Boosted Trees Classification', y=1.05, size=15)
plt.ylabel('Actual Survival')
plt.xlabel('Predicted Survival')
# The precision / recall / F1 prints that were commented out here have been removed.
# NOTE(review): they use scikit-learn's default binary averaging, which presumably
# does not fit the binned target used in this section — confirm before restoring them.

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 and support on the validation split.
print(classification_report(y_valid, y_hat_valid))

# Testing with 4 bins 

In [None]:
# Integer quartile code of RT_comb_score (labels=False gives bin indices 0..3).
# q=4 matches this section's "4 bins" heading; the cell previously used q=10,
# which reproduced the decile target under a quartile name.
# Note: this writes a new column into numeric_df in place.
numeric_df['y_quar'] = pd.qcut(numeric_df['RT_comb_score'], labels=False, q=4, precision=0)
numeric_df['y_quar'].describe()
y_quar = numeric_df['y_quar'].astype('int')

In [None]:
# 80/20 shuffled train/validation split with `y_quar` as target (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y_quar, test_size=0.2, shuffle=True, random_state=1)

## Testing with Regressors 

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# 80/20 shuffled train/validation split with the binned target `y_quar`.
# This cell previously split on the raw `y`, which silently replaced the `y_quar`
# split made just above; the parallel 10-bin cell splits on its binned target.
X_train, X_valid, y_train, y_valid = train_test_split(X, y_quar, test_size=0.2, shuffle=True, random_state=1)

# One-hot encode every column of X; the encoder is fitted on the training split only,
# and categories unseen during fit are ignored when transforming the validation split.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_en = encoder.fit_transform(X=X_train)
X_valid_en = encoder.transform(X=X_valid)

In [None]:
# Fit four tree-based regressors on the one-hot encoded training split and record
# their RMSE on both the training and the validation split.
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

models = [
    ('lgbm',LGBMRegressor()),
    ('catb',CatBoostRegressor(verbose=0)),
    ('xgb',XGBRegressor(verbosity=0)),
    ('RF',RandomForestRegressor(verbose=0))
]

# Keys are '<model>_train' and '<model>_valid'. The cell previously stored only the
# training-set RMSE, which measures fit rather than generalisation.
results = dict()

for name, model in models:
    model.fit(X_train_en, y_train)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed in
    # newer scikit-learn releases, while this form works on every version.
    results[f'{name}_train'] = float(np.sqrt(mean_squared_error(y_train, model.predict(X_train_en))))
    results[f'{name}_valid'] = float(np.sqrt(mean_squared_error(y_valid, model.predict(X_valid_en))))

In [None]:
# One-row DataFrame with one column per key of `results`.
df_results = pd.DataFrame([results])
df_results

## Testing with Classifiers

In [None]:
#RT_comb_score
# Binary target: 1 where y >= 0.49 (the author's note marks this threshold as P75), else 0,
# flattened to one dimension; the second line shows the count of each class.
y_class = np.asarray(y >= 0.49).astype(int).reshape(-1)
np.unique(y_class, return_counts=True)

In [None]:
# NOTE(review): hard-coded ratio, presumably taken from the class counts printed above — verify.
159/622

In [None]:
# 80/20 shuffled train/validation split with the binary target `y_class` (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y_class, test_size=0.2, shuffle=True, random_state=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Four classifiers with library defaults (only logging is silenced where set).
models = [
    ('lgbm', LGBMClassifier()),
    ('catb', CatBoostClassifier(verbose=0)),
    ('xgb', XGBClassifier(verbosity=0)),
    ('RF', RandomForestClassifier(verbose=0)),
]

# model name -> validation accuracy in percent (two decimals)
results = {}

for name, model in models:
    y_hat_valid = model.fit(X_train, y_train).predict(X_valid)
    accuracy = round(100 * accuracy_score(y_valid, y_hat_valid), 2)
    results[name] = accuracy

In [None]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

class_weights = dict({0:1, 1:5})

# Boosted classifiers with the positive class up-weighted 5x and 500 boosting rounds.
# `iterations` and `eval_metric` are CatBoost keywords. LightGBM and XGBoost name the
# number of rounds `n_estimators`, so it is spelled that way for those two libraries
# and the CatBoost-only keywords are dropped there.
models = [
    ('lgbm', LGBMClassifier(n_estimators=500,
                            verbose=100,
                            scale_pos_weight=5)),
    ('catb', CatBoostClassifier(iterations=500,
                                verbose=100,
                                eval_metric="Precision",
                                scale_pos_weight=5)),
    ('xgb', XGBClassifier(n_estimators=500,
                          scale_pos_weight=5)),
]

# Keys are '<model>_accuracy' and '<model>_precision', both in percent (two decimals).
results = dict()

for name, model in models:
    model.fit(X_train, y_train)
    y_hat_valid = model.predict(X_valid)
    accuracy = round(accuracy_score(y_valid, y_hat_valid) * 100, 2)
    # `precision` is computed here; it was previously read without being defined in this
    # cell, so it failed or picked up a stale value left behind by another cell.
    precision = round(precision_score(y_valid, y_hat_valid) * 100, 2)
    results[f'{name}_accuracy'] = accuracy
    results[f'{name}_precision'] = precision

In [None]:
# One-row DataFrame with one column per key of `results`.
df_results = pd.DataFrame([results])
df_results

## RF with weighted class

In [None]:
# Random forest with class 1 weighted 5x relative to class 0.
model = RandomForestClassifier(class_weight={0: 1, 1: 5})

In [None]:
# Fit the current `model` on the training split.
model.fit(X_train,y_train)

In [None]:
# Imported here so the cell runs on a fresh kernel: the first visible import of
# `roc_auc_score` is in a later cell.
from sklearn.metrics import roc_auc_score

# Training-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_train)
accuracy = [round(accuracy_score(y_train, y_pred), 4)]
recall = [round(recall_score(y_train, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_train, y_pred), 4)]
precision = [round(precision_score(y_train, y_pred), 4)]

model_names = ['RF_Train']
result_df = pd.DataFrame({'Accuracy':accuracy,'Recall':recall, 'Roc_Auc':roc_auc, 'Precision':precision}, index=model_names)
result_df

In [None]:
# Training-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_train = pd.DataFrame(
    confusion_matrix(y_train, model.predict(X_train)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Validation-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_valid)
accuracy = [round(accuracy_score(y_valid, y_pred), 4)]
recall = [round(recall_score(y_valid, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_valid, y_pred), 4)]
precision = [round(precision_score(y_valid, y_pred), 4)]

model_names = ['RF_Validation']
result_df1 = pd.DataFrame(
    {'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision},
    index=model_names,
)
result_df1

In [None]:
# Validation-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_test = pd.DataFrame(
    confusion_matrix(y_valid, model.predict(X_valid)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_test)

### XGBoost

In [None]:
# XGBoost classifier with 500 boosting rounds and the positive class weighted 5x.
# XGBoost names the number of rounds `n_estimators`; `iterations` and `verbose`
# are CatBoost keywords that XGBClassifier does not define, so 500 rounds were
# not actually being requested before.
model = XGBClassifier(n_estimators=500,
                      scale_pos_weight=5)

In [None]:
# Fit the current `model` on the training split.
model.fit(X_train, y_train)

In [None]:
# Training-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_train)
accuracy = [round(accuracy_score(y_train, y_pred), 4)]
recall = [round(recall_score(y_train, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_train, y_pred), 4)]
precision = [round(precision_score(y_train, y_pred), 4)]

model_names = ['XGBoost_Train']
result_df = pd.DataFrame(
    {'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision},
    index=model_names,
)
result_df

In [None]:
# Training-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_train = pd.DataFrame(
    confusion_matrix(y_train, model.predict(X_train)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Validation-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_valid)
accuracy = [round(accuracy_score(y_valid, y_pred), 4)]
recall = [round(recall_score(y_valid, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_valid, y_pred), 4)]
precision = [round(precision_score(y_valid, y_pred), 4)]

model_names = ['XGBoost_Validation']
result_df1 = pd.DataFrame(
    {'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision},
    index=model_names,
)
result_df1

In [None]:
# Validation-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_test = pd.DataFrame(
    confusion_matrix(y_valid, model.predict(X_valid)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_test)

### LightGBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with 500 boosting rounds and the positive class weighted 5x.
# LightGBM names the number of rounds `n_estimators`; `iterations` and
# `eval_metric="Accuracy"` are CatBoost keywords that LGBMClassifier's constructor
# does not define, so 500 rounds were not actually being requested before.
# NOTE(review): in LightGBM `verbose` is a log level rather than a print interval —
# confirm that 100 is the verbosity you want.
model = LGBMClassifier(n_estimators=500,
                       verbose=100,
                       scale_pos_weight=5)

In [None]:
# Fit the current `model` on the training split.
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Training-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_train)
accuracy = [round(accuracy_score(y_train, y_pred), 4)]
recall = [round(recall_score(y_train, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_train, y_pred), 4)]
precision = [round(precision_score(y_train, y_pred), 4)]

model_names = ['Lgbm_Train']
result_df = pd.DataFrame(
    {'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision},
    index=model_names,
)
result_df

In [None]:
# Training-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_train = pd.DataFrame(
    confusion_matrix(y_train, model.predict(X_train)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_train)

In [None]:
# Horizontal bar chart of `model.feature_importances_`, largest first.
feature_imp = (
    pd.Series(model.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
fig.tight_layout()

In [None]:
# Table of the current model's feature importances indexed by the columns of X,
# displayed in descending order (the sorted copy is shown, `feat_imp` itself stays unsorted).
feat_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["Importance"])
feat_imp.sort_values(by="Importance", ascending=False)

In [None]:
!pip install shap

In [None]:
import shap
# SHAP values of the current `model` on the validation features, shown as a summary plot.
shap_values = shap.TreeExplainer(model).shap_values(X_valid)
shap.summary_plot(shap_values, X_valid)

In [None]:
from sklearn.metrics import roc_auc_score

# Validation-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_valid)
accuracy = [round(accuracy_score(y_valid, y_pred), 4)]
recall = [round(recall_score(y_valid, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_valid, y_pred), 4)]
precision = [round(precision_score(y_valid, y_pred), 4)]

model_names = ['Lgbm_Validation']
result_df1 = pd.DataFrame(
    {'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision},
    index=model_names,
)
result_df1

In [None]:
# Validation-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_test = pd.DataFrame(
    confusion_matrix(y_valid, model.predict(X_valid)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_test)

### Testing catboost with hyperparameter tuning

In [None]:
# 80/20 shuffled train/validation split with the binary target `y_class` (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y_class, test_size=0.2, shuffle=True, random_state=1)

In [None]:
! pip install catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier: 500 iterations, class 1 weighted 5x relative to class 0,
# Accuracy as the evaluation metric, progress logged every 100 iterations.
class_weights = {0: 1, 1: 5}
model = CatBoostClassifier(
    class_weights=class_weights,
    eval_metric="Accuracy",
    iterations=500,
    verbose=100,
)

In [None]:
# Fit the current `model` on the training split.
model.fit(X_train,y_train)

In [None]:
# Training-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_train = pd.DataFrame(
    confusion_matrix(y_train, model.predict(X_train)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_train)

In [None]:
# Training-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_train)
accuracy = [round(accuracy_score(y_train, y_pred), 4)]
recall = [round(recall_score(y_train, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_train, y_pred), 4)]
precision = [round(precision_score(y_train, y_pred), 4)]

model_names = ['Catboost_Train']
result_df = pd.DataFrame(
    {'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision},
    index=model_names,
)
result_df

In [None]:
# Validation-split confusion matrix as a labelled table (rows: actual, columns: predicted).
confusion_matrix_test = pd.DataFrame(
    confusion_matrix(y_valid, model.predict(X_valid)),
    index=["Actual_No", "Actual_Yes"],
    columns=["Predicted_No", "Predicted_Yes"],
)

display(confusion_matrix_test)

In [None]:
from sklearn.metrics import roc_auc_score

# Validation-split metrics of the current `model`, rounded to 4 decimals.
# ROC AUC is computed from hard class predictions, not from predicted probabilities.
y_pred = model.predict(X_valid)
accuracy = [round(accuracy_score(y_valid, y_pred), 4)]
recall = [round(recall_score(y_valid, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_valid, y_pred), 4)]
precision = [round(precision_score(y_valid, y_pred), 4)]

model_names = ['Catboost_Validation']
result_df1 = pd.DataFrame(
    {'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision},
    index=model_names,
)
result_df1

In [None]:
# Table of the current model's feature importances indexed by the columns of X;
# the 15 largest are displayed (`feat_imp` itself stays unsorted).
feat_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["Importance"])
feat_imp.sort_values(by="Importance", ascending=False).head(15)

In [None]:
! pip install shap

In [None]:
import shap

# SHAP values of the current `model` on the validation features; summary plot capped at 44 features.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_valid)
shap.summary_plot(
    shap_values,
    features=X_valid,
    feature_names=X_valid.columns,
    max_display=44,
)

In [None]:
# SHAP dependence plot for the "Genre_Comedy" feature on the validation split.
shap.dependence_plot("Genre_Comedy", shap_values, X_valid)

In [None]:
# SHAP dependence plot for the "Genre_LGBTQ" feature on the validation split.
shap.dependence_plot("Genre_LGBTQ", shap_values, X_valid)

In [None]:
# Load SHAP's JavaScript, then draw a force plot for the first five validation rows.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0:5,:],X_valid.iloc[0:5,:])
# Alternative kept by the author: per-class indexing for a single row.
#shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], X_valid.iloc[0,:])

In [None]:
# Show the explainer's expected (base) value.
explainer.expected_value