In [1]:
import numpy as np
import pandas as pd
import imdb
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn import svm
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
successLevel = pd.read_csv('../Datasets/Classification Datasets/movies-revenue-classification.csv')
voice_actors = pd.read_csv('../Datasets/Classification Datasets/movie-voice-actors.csv')
directors = pd.read_csv('../Datasets/Classification Datasets/movie-director.csv')

features = ['release_date', 'genre', 'MPAA_rating', 'director', 'character', 'voice-actor']

In [3]:
"""Converts release-date data type to datetime instead of string."""
print("\nParsing Date: ")
print("-" * 25)

# Checking date format consistency.
date_lengths = successLevel.release_date.str.len()

print("Date Lengths :")
print(date_lengths.value_counts())
print("-" * 25)

print("Release-Date datatype before Parsing: ", successLevel.release_date.dtype)

# Fixing Parsing Wrong Dates
for i in range(successLevel.shape[0]):
    date = successLevel.loc[i, 'release_date']
    if 2 < int(date[-2]) < 7:
        new_date = date[:-2] + "19" + date[-2:]
        successLevel.loc[i, 'release_date'] = new_date
    elif int(date[-2]) == 2 and int(date[-1]) > 2:
        new_date = date[:-2] + "19" + date[-2:]
        successLevel.loc[i, 'release_date'] = new_date

successLevel.release_date = pd.to_datetime(successLevel.release_date)

print("Release-Date datatype after Parsing: ", successLevel.release_date.dtype)
print("-" * 50)


Parsing Date: 
-------------------------
Date Lengths :
9    331
8    132
Name: release_date, dtype: int64
-------------------------
Release-Date datatype before Parsing:  object
Release-Date datatype after Parsing:  datetime64[ns]
--------------------------------------------------


In [4]:
directors.rename(columns={'name': 'movie_title'}, inplace=True)
voice_actors.rename(columns={'movie': 'movie_title'}, inplace=True)

In [5]:
MovieSuccessLevels_actors = pd.merge(successLevel, voice_actors, on="movie_title", how="outer")
data = pd.merge(MovieSuccessLevels_actors, directors, on="movie_title", how="outer")
data = data.dropna(axis=0, subset=['MovieSuccessLevel'])
print("Shape after Joining :", data.shape)

Shape after Joining : (897, 8)


In [6]:
data.head(1)

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,MovieSuccessLevel,character,voice-actor,director
0,Recess: School's Out,2001-02-16,Comedy,G,C,,,


In [7]:
data.to_csv("Preprocessed-Dataset/preprocessed_data.csv", index=False)

# Training Models

## Polynomial Classification

In [8]:
labelEncoder = LabelEncoder()
encodedLabel = labelEncoder.fit_transform(data["MovieSuccessLevel"])
encodedLabel = pd.DataFrame(encodedLabel)

targetEncoder = TargetEncoder()
encodedData = targetEncoder.fit_transform(data[features], encodedLabel)

encodedData.head()

Unnamed: 0,release_date,genre,MPAA_rating,director,character,voice-actor
0,2001-02-16,1.61828,1.305369,1.795045,1.79717,1.79717
1,1994-03-25,1.61828,0.986842,1.795045,1.79717,1.79717
2,2004-04-02,1.61828,0.986842,2.0,1.381271,1.381271
3,2004-04-02,1.61828,0.986842,2.0,1.381271,1.381271
4,2004-04-02,1.61828,0.986842,2.0,1.381271,1.381271


In [9]:
encodedData['release_date'] = encodedData['release_date'].dt.year
x = encodedData[features]
y = encodedLabel

y = np.squeeze(y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True)

# Target Encoder

In [12]:
Polynomial_svc = svm.SVC(kernel='poly', degree=10, C=0.02).fit(xTrain, yTrain)

trainPredictions = Polynomial_svc.predict(xTrain)
trainAccuracy = np.mean(trainPredictions == yTrain)
print("\nPolynomial SVC with degree 4 Train Accuracy:", "{:.2f}".format(trainAccuracy * 100), "\b%")

testPredictions = Polynomial_svc.predict(xTest)
testAccuracy = np.mean(testPredictions == YTest)
print("\nPolynomial SVC with degree 4 Accuracy:", "{:.2f}".format(testAccuracy * 100), "\b%\n")


Polynomial SVC with degree 4 Train Accuracy: 29.99 %

Polynomial SVC with degree 4 Accuracy: 21.67 %



In [31]:
def_svc = svm.SVC(kernel='linear', C=0.4).fit(xTrain, yTrain)

trainPredictions = def_svc.predict(xTrain)
trainAccuracy = np.mean(trainPredictions == yTrain)
print("\nPolynomial SVC with degree 4 Train Accuracy:", "{:.2f}".format(trainAccuracy * 100), "\b%")

testPredictions = def_svc.predict(xTest)
testAccuracy = np.mean(testPredictions == YTest)

print("\nPolynomial SVC with degree 4 Accuracy:", "{:.2f}".format(testAccuracy * 100), "\b%\n")


Polynomial SVC with degree 4 Train Accuracy: 63.04 %

Polynomial SVC with degree 4 Accuracy: 55.56 %



In [15]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier

svm_kernel_ovo = OneVsOneClassifier(SVC(kernel='linear', C=2)).fit(X_train, y_train)
svm_kernel_ovr = OneVsRestClassifier(SVC(kernel='linear', C=2)).fit(X_train, y_train)

svm_linear_ovo = OneVsOneClassifier(LinearSVC(C=1),).fit(X_train, y_train)
svm_linear_ovr = OneVsRestClassifier(LinearSVC(C=1)).fit(X_train, y_train)

# model accuracy for svc model
accuracy = svm_kernel_ovr.score(X_test, y_test)
print('Linear Kernel OneVsRest SVM accuracy: ' + str(accuracy))
accuracy = svm_kernel_ovo.score(X_test, y_test)
print('Linear Kernel OneVsOne SVM accuracy: ' + str(accuracy))

# model accuracy for svc model
accuracy = svm_linear_ovr.score(X_test, y_test)
print('LinearSVC OneVsRest SVM accuracy: ' + str(accuracy))
accuracy = svm_linear_ovo.score(X_test, y_test)
print('LinearSVC OneVsOne SVM accuracy: ' + str(accuracy))



Linear Kernel OneVsRest SVM accuracy: 0.2777777777777778
Linear Kernel OneVsOne SVM accuracy: 0.5166666666666667
LinearSVC OneVsRest SVM accuracy: 0.2
LinearSVC OneVsOne SVM accuracy: 0.4222222222222222


