In [60]:
import numpy as np
import pandas as pd
import os
import collections as c
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler

#os.getcwd()


In [2]:
## Location of the Bollywood movie dataset (raw CSV hosted on GitHub)
url = (
    'https://raw.githubusercontent.com/abhishek116002/dmml_assignments/main/'
    'assignment1/bollywood_movie_data/bollywood_movie_data.csv'
)

In [3]:
## Read the raw movie table directly from the hosted CSV
data = pd.read_csv(url, sep=',')

## Spot-check one title to inspect its revenue / budget values
is_dhoom3 = data['Movie Name'] == 'Dhoom 3'
data[is_dhoom3]

Unnamed: 0,Movie Name,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,Number of Screens,Revenue(INR),Budget(INR)
17,Dhoom 3,Normal,No,Yes,action,No,No,No,Aamir Khan,Vijay Krishna Acharya,Pritam,3650,1750000000,5243760000


In [4]:
## The dataset's 'Revenue(INR)' and 'Budget(INR)' column names are interchanged:
## the column labelled 'Revenue(INR)' is kept below and renamed to 'Budget(INR)'.
## A movie is labelled a hit (1) when the value under 'Revenue(INR)' is smaller
## than the value under 'Budget(INR)', otherwise 0.
## 'Movie Name' and the original 'Budget(INR)' column are dropped afterwards.
data = (
    data
    .assign(hit=data['Revenue(INR)'].lt(data['Budget(INR)']).astype(int))
    .drop(columns=['Budget(INR)', 'Movie Name'])
    .rename(columns={'Revenue(INR)': 'Budget(INR)'})
)
data

Unnamed: 0,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,Number of Screens,Budget(INR),hit
0,Normal,No,No,suspense,Yes,No,No,Jeet Goswami,Ravi Varma,Baba Jagirdar,5,5000000,0
1,Holiday,No,No,drama,Yes,No,Yes,Karan Bhanushali,Sagar Ballary,Amardeep Nijjer,75,15000000,0
2,Holiday,No,No,thriller,No,No,No,Mahie Gill,Ram Gopal Verma,Sandeep Chowta,525,75000000,0
3,Holiday,No,No,drama,Yes,No,No,Aadar Jain,Habib Faisal,Amit Trivedi,800,210000000,0
4,Holiday,No,No,adult,Yes,Yes,Yes,Aadil Khan,Aadil Khan,Babloo Ustad,1,1000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1693,Holiday,No,No,action,No,Yes,No,Zayed Khan,Vikram Chopra,Pritam,375,82500000,1
1694,Normal,No,No,drama,No,Yes,Yes,Zeenat Aman,Sanghamitra Chaudhuri,Dev Sikdar,10,8000000,0
1695,Normal,No,No,drama,No,No,No,Zeenat Aman,Sanjay Sharma,Nikhil,20,12500000,0
1696,Normal,No,No,drama,No,Yes,No,Zulfi Sayed,Akbar Khan,Naushad,135,100000000,0


In [5]:
# Bucket the two numeric features into right-closed intervals.
# pd.cut leaves values outside the outermost edges (or equal to the lowest
# edge) as NaN, so the edges below must span the observed data.
screen_bins = [0, 1, 10, 100, 200, 400, 600, 1000, 2000, 3000, 5000]
budget_bins = [300000, 1000000, 5000000, 10000000, 50000000, 100000000,
               200000000, 500000000, 1000000000, 3000000000]

# 'hit' is re-added as 'hits' so that the target ends up as the last column,
# after the two new interval columns; the raw numeric columns are removed.
data = (
    data
    .assign(
        screens=pd.cut(data['Number of Screens'], bins=screen_bins),
        budget=pd.cut(data['Budget(INR)'], bins=budget_bins),
        hits=data['hit'],
    )
    .drop(columns=['Budget(INR)', 'Number of Screens', 'hit'])
)

In [6]:
## Separate the target from the features by column name instead of by the
## hard-coded position 12, so the split does not silently break if a column is
## added or reordered upstream. 'hits' is the label; every other column is a feature.
y = data['hits']
X = data.drop(columns=['hits'])
X

Unnamed: 0,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,screens,budget
0,Normal,No,No,suspense,Yes,No,No,Jeet Goswami,Ravi Varma,Baba Jagirdar,"(1, 10]","(1000000, 5000000]"
1,Holiday,No,No,drama,Yes,No,Yes,Karan Bhanushali,Sagar Ballary,Amardeep Nijjer,"(10, 100]","(10000000, 50000000]"
2,Holiday,No,No,thriller,No,No,No,Mahie Gill,Ram Gopal Verma,Sandeep Chowta,"(400, 600]","(50000000, 100000000]"
3,Holiday,No,No,drama,Yes,No,No,Aadar Jain,Habib Faisal,Amit Trivedi,"(600, 1000]","(200000000, 500000000]"
4,Holiday,No,No,adult,Yes,Yes,Yes,Aadil Khan,Aadil Khan,Babloo Ustad,"(0, 1]","(300000, 1000000]"
...,...,...,...,...,...,...,...,...,...,...,...,...
1693,Holiday,No,No,action,No,Yes,No,Zayed Khan,Vikram Chopra,Pritam,"(200, 400]","(50000000, 100000000]"
1694,Normal,No,No,drama,No,Yes,Yes,Zeenat Aman,Sanghamitra Chaudhuri,Dev Sikdar,"(1, 10]","(5000000, 10000000]"
1695,Normal,No,No,drama,No,No,No,Zeenat Aman,Sanjay Sharma,Nikhil,"(10, 100]","(10000000, 50000000]"
1696,Normal,No,No,drama,No,Yes,No,Zulfi Sayed,Akbar Khan,Naushad,"(100, 200]","(50000000, 100000000]"


In [7]:
## Ordinal-encode every feature column, one column at a time.
## NOTE: OrdinalEncoder sorts the categories it is fitted on, so the codes follow
## the sorted order of the category values (e.g. alphabetical for strings,
## interval order for the binned columns) - they do NOT reflect how often a
## value occurs.
cols = X.columns
enc = OrdinalEncoder()

# Encode into an independent copy: a plain `X_trans = X` would alias X and
# overwrite the raw features while encoding.
X_trans = X.copy()
for i in cols:
    column_values = np.asarray(X[i]).reshape(-1, 1)
    # Replace the whole column (rather than writing through .loc) so the result
    # is a clean float column regardless of the original object/categorical dtype.
    X_trans[i] = enc.fit_transform(column_values).ravel()

X_trans

Unnamed: 0,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,screens,budget
0,1.0,0.0,0.0,12.0,1.0,0.0,0.0,260.0,710.0,103.0,1.0,1.0
1,0.0,0.0,0.0,5.0,1.0,0.0,1.0,282.0,754.0,49.0,2.0,3.0
2,0.0,0.0,0.0,13.0,0.0,0.0,0.0,328.0,689.0,461.0,5.0,4.0
3,0.0,0.0,0.0,5.0,1.0,0.0,0.0,0.0,303.0,54.0,6.0,6.0
4,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,4.0,108.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1693,0.0,0.0,0.0,0.0,0.0,1.0,0.0,761.0,1000.0,386.0,4.0,4.0
1694,1.0,0.0,0.0,5.0,0.0,1.0,1.0,762.0,777.0,157.0,1.0,2.0
1695,1.0,0.0,0.0,5.0,0.0,0.0,0.0,762.0,795.0,336.0,2.0,3.0
1696,1.0,0.0,0.0,5.0,0.0,1.0,0.0,763.0,57.0,326.0,3.0,4.0


In [8]:
## Hold out 25% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=0.25, random_state=0)
## Randomly duplicate minority-class rows of the training split only.
## sampling_strategy=0.5 asks for a minority:majority ratio of 1:2 after
## resampling - the classes are NOT made equal in size.
## random_state is fixed so the resampled set (and every score computed from
## models trained on it) is reproducible across runs.
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=0)
X_overtrain, y_overtrain = oversample.fit_resample(X_train, y_train)

# Decision Trees

In [74]:


## Depth-limited decision tree fitted on the oversampled training split.
## class_weight weights class 1 five times as heavily as class 0.
## random_state is fixed: the tree permutes features at each split, so without
## a seed repeated runs can produce different trees and different scores.
clf = tree.DecisionTreeClassifier(
    class_weight={1: 5, 0: 1},
    max_depth=8,
    min_samples_split=2,
    random_state=0,
)
clf = clf.fit(X_overtrain, y_overtrain)
y_pred = clf.predict(X_test)

## Test-set metrics (f1 is reported for class 1)
print('Model accuracy score: {0:0.5f}'. format(accuracy_score(y_test, y_pred)))
print('Model f1 score: {0:0.5f}'. format(f1_score(y_test, y_pred, pos_label = 1)))

Model accuracy score: 0.72235
Model f1 score: 0.62179


In [10]:
## f1 on the original (non-oversampled) training split, to compare with the test f1
y_pred_train = clf.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train, pos_label=1)
print('Training-set f1 score: {0:0.5f}'.format(train_f1))

Training-set f1 score: 0.77049


In [11]:
## Depth actually reached by the fitted tree (max_depth=8 was set when the tree was built)
clf.get_depth()

8

In [12]:
## Confusion-matrix breakdown of the test-set predictions.
## Rows of cm are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]

print('Confusion matrix\n\n', cm)

for caption, count in (
    ('\nTrue Positives(TP) = ', TP),
    ('\nTrue Negatives(TN) = ', TN),
    ('\nFalse Positives(FP) = ', FP),
    ('\nFalse Negatives(FN) = ', FN),
):
    print(caption, count)

print(classification_report(y_test, y_pred))

## Both rates are taken over the actual-negative rows (TN + FP)
false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))

Confusion matrix

 [[207  95]
 [ 27  96]]

True Positives(TP) =  96

True Negatives(TN) =  207

False Positives(FP) =  95

False Negatives(FN) =  27
              precision    recall  f1-score   support

           0       0.88      0.69      0.77       302
           1       0.50      0.78      0.61       123

    accuracy                           0.71       425
   macro avg       0.69      0.73      0.69       425
weighted avg       0.77      0.71      0.73       425

False Positive Rate : 0.3146
Specificity : 0.6854


# Naive Bayes Algorithm

In [13]:
## Count the distinct values of every feature column, in column order;
## these counts are passed to CategoricalNB as min_categories.
cols = X.columns
cats = [X[name].unique().size for name in cols]
cats

[2, 2, 2, 14, 2, 2, 2, 764, 1048, 630, 10, 9]

In [73]:
## Categorical naive Bayes with smoothing alpha=1, fitted on the oversampled
## training split. min_categories gives the per-feature category counts so that
## codes absent from the training rows are still accounted for.
cnb = CategoricalNB(alpha=1, min_categories=cats)
cnb = cnb.fit(X_overtrain, y_overtrain)
y_pred = cnb.predict(X_test)

## Test-set metrics (f1 is reported for class 1)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print('Model accuracy score: {0:0.5f}'.format(test_accuracy))
print('Model f1 score: {0:0.5f}'.format(test_f1))

Model accuracy score: 0.76235
Model f1 score: 0.66445


In [15]:
## f1 on the original (non-oversampled) training split, to compare with the test f1
y_pred_train = cnb.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train, pos_label=1)
print('Training-set f1 score: {0:0.5f}'.format(train_f1))

Training-set f1 score: 0.77273


In [16]:
## Confusion-matrix breakdown of the test-set predictions.
## Rows of cm are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]

print('Confusion matrix\n\n', cm)

for caption, count in (
    ('\nTrue Positives(TP) = ', TP),
    ('\nTrue Negatives(TN) = ', TN),
    ('\nFalse Positives(FP) = ', FP),
    ('\nFalse Negatives(FN) = ', FN),
):
    print(caption, count)

print(classification_report(y_test, y_pred))

## Both rates are taken over the actual-negative rows (TN + FP)
false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))

Confusion matrix

 [[224  78]
 [ 23 100]]

True Positives(TP) =  100

True Negatives(TN) =  224

False Positives(FP) =  78

False Negatives(FN) =  23
              precision    recall  f1-score   support

           0       0.91      0.74      0.82       302
           1       0.56      0.81      0.66       123

    accuracy                           0.76       425
   macro avg       0.73      0.78      0.74       425
weighted avg       0.81      0.76      0.77       425

False Positive Rate : 0.2583
Specificity : 0.7417


## Random Forest

In [72]:


## Random forest fitted on the oversampled training split.
## class_weight weights class 1 five times as heavily as class 0.
## random_state is fixed: bootstrap sampling and per-split feature sampling are
## random, so without a seed every run yields a different forest and scores.
clf = RandomForestClassifier(
    class_weight={1: 5, 0: 1},
    max_depth=10,
    min_samples_split=2,
    random_state=0,
)
clf = clf.fit(X_overtrain, y_overtrain)
y_pred = clf.predict(X_test)

## Test-set metrics (f1 is reported for class 1)
print('Model accuracy score: {0:0.5f}'. format(accuracy_score(y_test, y_pred)))
print('Model f1 score: {0:0.5f}'. format(f1_score(y_test, y_pred, pos_label = 1)))

Model accuracy score: 0.77882
Model f1 score: 0.66901


In [58]:
## f1 on the original (non-oversampled) training split, to compare with the test f1
y_pred_train = clf.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train, pos_label=1)
print('Training-set f1 score: {0:0.5f}'.format(train_f1))

Training-set f1 score: 0.86979


In [59]:
## Confusion-matrix breakdown of the test-set predictions.
## Rows of cm are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]

print('Confusion matrix\n\n', cm)

for caption, count in (
    ('\nTrue Positives(TP) = ', TP),
    ('\nTrue Negatives(TN) = ', TN),
    ('\nFalse Positives(FP) = ', FP),
    ('\nFalse Negatives(FN) = ', FN),
):
    print(caption, count)

print(classification_report(y_test, y_pred))

## Both rates are taken over the actual-negative rows (TN + FP)
false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))

Confusion matrix

 [[235  67]
 [ 30  93]]

True Positives(TP) =  93

True Negatives(TN) =  235

False Positives(FP) =  67

False Negatives(FN) =  30
              precision    recall  f1-score   support

           0       0.89      0.78      0.83       302
           1       0.58      0.76      0.66       123

    accuracy                           0.77       425
   macro avg       0.73      0.77      0.74       425
weighted avg       0.80      0.77      0.78       425

False Positive Rate : 0.2219
Specificity : 0.7781


## AdaBoost Classifier

In [71]:


## AdaBoost with default settings and a fixed seed, fitted on the oversampled
## training split.
clf = AdaBoostClassifier(random_state=0)
clf = clf.fit(X_overtrain, y_overtrain)
y_pred = clf.predict(X_test)

## Test-set metrics (f1 is reported for class 1)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print('Model accuracy score: {0:0.5f}'.format(test_accuracy))
print('Model f1 score: {0:0.5f}'.format(test_f1))

Model accuracy score: 0.77882
Model f1 score: 0.63281


In [69]:
## f1 on the original (non-oversampled) training split, to compare with the test f1
y_pred_train = clf.predict(X_train)
train_f1 = f1_score(y_train, y_pred_train, pos_label=1)
print('Training-set f1 score: {0:0.5f}'.format(train_f1))

Training-set f1 score: 0.69928


In [70]:
## Confusion-matrix breakdown of the test-set predictions.
## Rows of cm are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]

print('Confusion matrix\n\n', cm)

for caption, count in (
    ('\nTrue Positives(TP) = ', TP),
    ('\nTrue Negatives(TN) = ', TN),
    ('\nFalse Positives(FP) = ', FP),
    ('\nFalse Negatives(FN) = ', FN),
):
    print(caption, count)

print(classification_report(y_test, y_pred))

## Both rates are taken over the actual-negative rows (TN + FP)
false_positive_rate = FP / float(FP + TN)
print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

specificity = TN / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))

Confusion matrix

 [[250  52]
 [ 42  81]]

True Positives(TP) =  81

True Negatives(TN) =  250

False Positives(FP) =  52

False Negatives(FN) =  42
              precision    recall  f1-score   support

           0       0.86      0.83      0.84       302
           1       0.61      0.66      0.63       123

    accuracy                           0.78       425
   macro avg       0.73      0.74      0.74       425
weighted avg       0.78      0.78      0.78       425

False Positive Rate : 0.1722
Specificity : 0.8278
