In [23]:
import pandas as pd
import imdb
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder

In [2]:
successLevel = pd.read_csv('../Datasets/Classification Datasets/movies-revenue-classification.csv')
voice_actors = pd.read_csv('../Datasets/Classification Datasets/movie-voice-actors.csv')
directors = pd.read_csv('../Datasets/Classification Datasets/movie-director.csv')

features = ['release_date', 'genre', 'MPAA_rating', 'director', 'character', 'voice-actor']

In [3]:
"""Converts release-date data type to datetime instead of string."""
print("\nParsing Date: ")
print("-" * 25)

# Checking date format consistency.
date_lengths = successLevel.release_date.str.len()

print("Date Lengths :")
print(date_lengths.value_counts())
print("-" * 25)

print("Release-Date datatype before Parsing: ", successLevel.release_date.dtype)

# Fixing Parsing Wrong Dates
for i in range(successLevel.shape[0]):
    date = successLevel.loc[i, 'release_date']
    if 2 < int(date[-2]) < 7:
        new_date = date[:-2] + "19" + date[-2:]
        successLevel.loc[i, 'release_date'] = new_date
    elif int(date[-2]) == 2 and int(date[-1]) > 2:
        new_date = date[:-2] + "19" + date[-2:]
        successLevel.loc[i, 'release_date'] = new_date

successLevel.release_date = pd.to_datetime(successLevel.release_date)

print("Release-Date datatype after Parsing: ", successLevel.release_date.dtype)
print("-" * 50)


Parsing Date: 
-------------------------
Date Lengths :
9    331
8    132
Name: release_date, dtype: int64
-------------------------
Release-Date datatype before Parsing:  object
Release-Date datatype after Parsing:  datetime64[ns]
--------------------------------------------------


In [4]:
directors.rename(columns={'name': 'movie_title'}, inplace=True)
voice_actors.rename(columns={'movie': 'movie_title'}, inplace=True)

In [7]:
MovieSuccessLevels_actors = pd.merge(successLevel, voice_actors, on="movie_title", how="outer")
data = pd.merge(MovieSuccessLevels_actors, directors, on="movie_title", how="outer")
data = data.dropna(axis=0, subset=['MovieSuccessLevel'])
print("Shape after Joining :", data.shape)

Shape after Joining : (897, 8)


In [8]:
data.head(1)

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,MovieSuccessLevel,character,voice-actor,director
0,Recess: School's Out,2001-02-16,Comedy,G,C,,,


In [9]:
data.to_csv("Preprocessed-Dataset/preprocessed_data.csv", index=False)

# Training Models

In [26]:
y = data["MovieSuccessLevel"]
x = data[features]
x.head(1)

Unnamed: 0,release_date,genre,MPAA_rating,director,character,voice-actor
0,2001-02-16,Comedy,G,,,


## Polynomial Classification

In [28]:
label = asarray([y])
encoder = OrdinalEncoder()
result = encoder.fit_transform(label)
print(result)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [None]:
xTrain, xTest, yTrain, YTest = train_test_split(x, y, train_size=0.8, shuffle=True)

# Target Encoder

In [14]:
Polynomial_svc = svm.SVC(kernel='poly', degree=4, C=0.008).fit(xTrain, yTrain)

trainPredictions = Polynomial_svc.predict(xTrain)
trainAccuracy = np.mean(trainPredictions == yTrain)
print("\nPolynomial SVC with degree 4 Train Accuracy:", "{:.2f}".format(trainAccuracy * 100), "\b%")

testPredictions = Polynomial_svc.predict(xTest)
testAccuracy = np.mean(testPredictions == yTest)
print("\nPolynomial SVC with degree 4 Accuracy:", "{:.2f}".format(testAccuracy * 100), "\b%\n")

TypeError: float() argument must be a string or a number, not 'Timestamp'