In [1]:
import numpy as np
import pandas as pd
import imdb
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn import svm
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

In [2]:
successLevel = pd.read_csv('movies-revenue-classification.csv')
voice_actors = pd.read_csv('movie-voice-actors.csv')
directors = pd.read_csv('movie-director.csv')

features = ['release_date', 'genre', 'MPAA_rating', 'director', 'character', 'voice-actor']

In [3]:
"""Converts release-date data type to datetime instead of string."""
print("\nParsing Date: ")
print("-" * 25)

# Checking date format consistency.
date_lengths = successLevel.release_date.str.len()

print("Date Lengths :")
print(date_lengths.value_counts())
print("-" * 25)

print("Release-Date datatype before Parsing: ", successLevel.release_date.dtype)

# Fixing Parsing Wrong Dates
for i in range(successLevel.shape[0]):
    date = successLevel.loc[i, 'release_date']
    if 2 < int(date[-2]) < 7:
        new_date = date[:-2] + "19" + date[-2:]
        successLevel.loc[i, 'release_date'] = new_date
    elif int(date[-2]) == 2 and int(date[-1]) > 2:
        new_date = date[:-2] + "19" + date[-2:]
        successLevel.loc[i, 'release_date'] = new_date

successLevel.release_date = pd.to_datetime(successLevel.release_date)

print("Release-Date datatype after Parsing: ", successLevel.release_date.dtype)
print("-" * 50)


Parsing Date: 
-------------------------
Date Lengths :
9    331
8    132
Name: release_date, dtype: int64
-------------------------
Release-Date datatype before Parsing:  object
Release-Date datatype after Parsing:  datetime64[ns]
--------------------------------------------------


In [4]:
directors.rename(columns={'name': 'movie_title'}, inplace=True)
voice_actors.rename(columns={'movie': 'movie_title'}, inplace=True)

In [5]:
MovieSuccessLevels_actors = pd.merge(successLevel, voice_actors, on="movie_title", how="outer")
data = pd.merge(MovieSuccessLevels_actors, directors, on="movie_title", how="outer")
data = data.dropna(axis=0, subset=['MovieSuccessLevel'])
print("Shape after Joining :", data.shape)

Shape after Joining : (897, 8)


In [6]:
data.head(10)

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,MovieSuccessLevel,character,voice-actor,director
0,Recess: School's Out,2001-02-16,Comedy,G,C,,,
1,D2: The Mighty Ducks,1994-03-25,Comedy,PG,B,,,
2,Home on the Range,2004-04-02,Comedy,PG,C,Alameda Slim,Randy Quaid,Will Finn
3,Home on the Range,2004-04-02,Comedy,PG,C,Audrey the Chicken,Estelle Harris,Will Finn
4,Home on the Range,2004-04-02,Comedy,PG,C,Buck,Cuba Gooding Jr.,Will Finn
5,Home on the Range,2004-04-02,Comedy,PG,C,Grace,Jennifer Tilly,Will Finn
6,Home on the Range,2004-04-02,Comedy,PG,C,Jeb the Goat,Joe Flaherty,Will Finn
7,Home on the Range,2004-04-02,Comedy,PG,C,Junior the Buffalo,Lance LeGault,Will Finn
8,Home on the Range,2004-04-02,Comedy,PG,C,Larry the Duck,Marshall Efron,Will Finn
9,Home on the Range,2004-04-02,Comedy,PG,C,Lucky Jack,Charles Haid,Will Finn


# Training Models

## Polynomial Classification

In [7]:
from category_encoders.m_estimate import MEstimateEncoder

# Y Encoding

In [18]:
#labelEncoder = LabelEncoder()
#encodedLabel = labelEncoder.fit_transform(data["MovieSuccessLevel"])
#encodedLabel = pd.DataFrame(encodedLabel)
encodedLabel = []
# 0 1 2 ...
for i in data['MovieSuccessLevel']:
    if(i == 'S'):
        encodedLabel.append(0)
    elif(i == 'A'):
        encodedLabel.append(1)
    elif(i == 'B'):
        encodedLabel.append(2)
    elif(i == 'C'):
        encodedLabel.append(3)
    elif(i == 'D'):
        encodedLabel.append(4)

encodedLabel = pd.DataFrame(encodedLabel)
encodedLabel.head(10)

Unnamed: 0,0
0,3
1,2
2,3
3,3
4,3
5,3
6,3
7,3
8,3
9,3


In [9]:
data.head(10)

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,MovieSuccessLevel,character,voice-actor,director
0,Recess: School's Out,2001-02-16,Comedy,G,C,,,
1,D2: The Mighty Ducks,1994-03-25,Comedy,PG,B,,,
2,Home on the Range,2004-04-02,Comedy,PG,C,Alameda Slim,Randy Quaid,Will Finn
3,Home on the Range,2004-04-02,Comedy,PG,C,Audrey the Chicken,Estelle Harris,Will Finn
4,Home on the Range,2004-04-02,Comedy,PG,C,Buck,Cuba Gooding Jr.,Will Finn
5,Home on the Range,2004-04-02,Comedy,PG,C,Grace,Jennifer Tilly,Will Finn
6,Home on the Range,2004-04-02,Comedy,PG,C,Jeb the Goat,Joe Flaherty,Will Finn
7,Home on the Range,2004-04-02,Comedy,PG,C,Junior the Buffalo,Lance LeGault,Will Finn
8,Home on the Range,2004-04-02,Comedy,PG,C,Larry the Duck,Marshall Efron,Will Finn
9,Home on the Range,2004-04-02,Comedy,PG,C,Lucky Jack,Charles Haid,Will Finn


In [10]:
y = encodedLabel
y = np.squeeze(y)
y.shape

(897,)

# Target SECOND BEST

In [11]:
targetEncoder = TargetEncoder()
encodedData = targetEncoder.fit_transform(data[features], encodedLabel)
encodedData['release_date'] = encodedData['release_date'].dt.year

# M Estimate

In [32]:
MEE_encoder = MEstimateEncoder()
encodedData = MEE_encoder.fit_transform(data[features], encodedLabel)
encodedData['release_date'] = encodedData['release_date'].dt.year

# JAMES STEIN BEST SO FAR

In [19]:
JSE_encoder = JamesSteinEncoder()
encodedData = JSE_encoder.fit_transform(data[features], encodedLabel)
encodedData['release_date'] = encodedData['release_date'].dt.year

# CatBoost

In [72]:
CBE_encoder = CatBoostEncoder()
encodedData = CBE_encoder.fit_transform(data[features], encodedLabel)
encodedData['release_date'] = encodedData['release_date'].dt.year

# LOO

In [12]:
Loo_encoder = LeaveOneOutEncoder()
encodedData = Loo_encoder.fit_transform(data[features], encodedLabel)
encodedData['release_date'] = encodedData['release_date'].dt.year
encodedData

Unnamed: 0,release_date,genre,MPAA_rating,director,character,voice-actor
0,2001,2.589189,1.478114,2.670429,2.702128,2.702128
1,1994,2.594595,1.986799,2.672686,2.704492,2.704492
2,2004,2.589189,1.983498,3.000000,2.074693,2.074693
3,2004,2.589189,1.983498,3.000000,2.074693,2.074693
4,2004,2.589189,1.983498,3.000000,2.074693,2.074693
...,...,...,...,...,...,...
892,1997,1.796992,1.481481,1.908163,2.074693,1.750000
893,1997,1.796992,1.481481,1.908163,2.074693,2.074693
894,1997,1.796992,1.481481,1.908163,2.074693,2.074693
895,1997,1.796992,1.481481,1.908163,2.074693,2.074693


In [20]:
x = encodedData[features]
x.head()

Unnamed: 0,release_date,genre,MPAA_rating,director,character,voice-actor
0,2001,2.416774,1.642946,2.426852,2.422996,2.423259
1,1994,2.416774,2.009569,2.426852,2.422996,2.423259
2,2004,2.416774,2.009569,3.0,3.0,3.0
3,2004,2.416774,2.009569,3.0,3.0,3.0
4,2004,2.416774,2.009569,3.0,3.0,3.0


In [26]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True)

# POLY Models

In [22]:
Polynomial_svc = svm.SVC(kernel='poly', degree=40, C=0.4, tol=0.001).fit(X_train, y_train)

trainPredictions = Polynomial_svc.predict(X_train)
trainAccuracy = np.mean(trainPredictions == y_train)
print("\nPolynomial SVC with degree 4 Train Accuracy:", "{:.2f}".format(trainAccuracy * 100), "\b%")

testPredictions = Polynomial_svc.predict(X_test)
testAccuracy = np.mean(testPredictions == y_test)
print("\nPolynomial SVC with degree 4 Accuracy:", "{:.2f}".format(testAccuracy * 100), "\b%\n")


Polynomial SVC with degree 4 Train Accuracy: 63.04 %

Polynomial SVC with degree 4 Accuracy: 58.33 %



# LINEAR Best So Far   2  with james stein 4,0.4,1.5,0.8 Consistant

In [27]:
def_svc = svm.SVC(kernel='linear',degree=4, C=0.4, tol=1.5, gamma=0.8).fit(X_train, y_train)

trainPredictions = def_svc.predict(X_train)
trainAccuracy = np.mean(trainPredictions == y_train)
print("\nPolynomial SVC with degree 4 Train Accuracy:", "{:.2f}".format(trainAccuracy * 100), "\b%")

testPredictions = def_svc.predict(X_test)
testAccuracy = np.mean(testPredictions == y_test)

print("\nPolynomial SVC with degree 4 Accuracy:", "{:.2f}".format(testAccuracy * 100), "\b%\n")


Polynomial SVC with degree 4 Train Accuracy: 69.32 %

Polynomial SVC with degree 4 Accuracy: 68.33 %



# RPF BEST   1 with james stein 0.8,1

In [28]:
rbf_svc = svm.SVC(kernel='rbf', gamma=1, C=0.1, tol=1.5, degree=4).fit(X_train, y_train)


trainPredictions = rbf_svc.predict(X_train)
trainAccuracy = np.mean(trainPredictions == y_train)
print("\nPolynomial SVC with degree 4 Train Accuracy:", "{:.2f}".format(trainAccuracy * 100), "\b%")

testPredictions = rbf_svc.predict(X_test)
testAccuracy = np.mean(testPredictions == y_test)

print("\nPolynomial SVC with degree 4 Accuracy:", "{:.2f}".format(testAccuracy * 100), "\b%\n")


Polynomial SVC with degree 4 Train Accuracy: 73.64 %

Polynomial SVC with degree 4 Accuracy: 69.44 %



# LOO 0.2 0.3 

In [29]:
rbf_svc = svm.SVC(kernel='rbf', gamma=0.2, C=0.3).fit(X_train, y_train)


trainPredictions = rbf_svc.predict(X_train)
trainAccuracy = np.mean(trainPredictions == y_train)
print("\nPolynomial SVC with degree 4 Train Accuracy:", "{:.2f}".format(trainAccuracy * 100), "\b%")

testPredictions = rbf_svc.predict(X_test)
testAccuracy = np.mean(testPredictions == y_test)

print("\nPolynomial SVC with degree 4 Accuracy:", "{:.2f}".format(testAccuracy * 100), "\b%\n")


Polynomial SVC with degree 4 Train Accuracy: 71.97 %

Polynomial SVC with degree 4 Accuracy: 70.56 %



In [30]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier

svm_kernel_ovo = OneVsOneClassifier(SVC(kernel='linear', C=1)).fit(X_train, y_train)
#svm_kernel_ovr = OneVsRestClassifier(SVC(kernel='linear', C=0.5)).fit(X_train, y_train)

#svm_linear_ovo = OneVsOneClassifier(LinearSVC(C=0.5),).fit(X_train, y_train)
#svm_linear_ovr = OneVsRestClassifier(LinearSVC(C=0.5)).fit(X_train, y_train)

# model accuracy for svc model
#accuracy = svm_kernel_ovr.score(X_test, y_test)
#print('Linear Kernel OneVsRest SVM accuracy: ' + str(accuracy))
accuracy1 = svm_kernel_ovo.score(X_train, y_train)
accuracy = svm_kernel_ovo.score(X_test, y_test)
print('Linear Kernel OneVsOne SVM Train accuracy: ' + str(accuracy1*100) + "%")
print('Linear Kernel OneVsOne SVM accuracy: ' + str(accuracy*100) + "%")

# model accuracy for svc model
#accuracy = svm_linear_ovr.score(X_test, y_test)
#print('LinearSVC OneVsRest SVM accuracy: ' + str(accuracy))
#accuracy = svm_linear_ovo.score(X_test, y_test)
#print('LinearSVC OneVsOne SVM accuracy: ' + str(accuracy))

Linear Kernel OneVsOne SVM Train accuracy: 70.43235704323571%
Linear Kernel OneVsOne SVM accuracy: 75.0%
