In [28]:
import math
import re
import string
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sklearn
import sklearn.feature_extraction.text as extraction
from sklearn.decomposition import IncrementalPCA, PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from wordcloud import WordCloud, STOPWORDS
warnings.filterwarnings('ignore')

In [29]:
# Base feature table; it carries a 'title' column that later cells read and then remove.
shortX = pd.read_csv("./imdbmovies/features.csv")
# Feature table used as the "Full Title Data" variant in the modelling cells.
fullX = pd.read_csv("./imdbmovies/features_vectorized.csv")
# Read without a header row, so its columns are labelled by integer position.
# Used as the "50% Covar Title Data" variant in the modelling cells.
words50X = pd.read_csv("./imdbmovies/vectorization50.csv", header=None)
# Genre indicator table with one column per genre (e.g. 'Drama', 'Comedy', 'Romance').
labelsY = pd.read_csv("./imdbmovies/labels.csv")

# Preprocessing

Preprocessing:
- Removed rows whose feature vector is irregular (values spilling past the expected 44 columns)
- Removed all commas in the CSV so that they do not interfere with `read_csv`
- Dropped rows with missing feature values
- Deleted non-movies
- Threw out rows whose title contains multiple parenthesised groups
- Translated all titles to English
- Removed the year and other parentheses from the title
- Chose the Snowball (Porter2) stemmer
- Removed punctuation
- Parsed titles and created TF-IDF features

The following function requires Google Cloud API credentials, which are not attached, so it cannot be run as-is.

In [None]:
def load_data():
    """Clean the raw IMDB export and split it into features and genre labels.

    Reads ``./imdbmovies/imdb.csv`` and then:

    * drops rows that have a value in ``'Unnamed: 44'`` (irregular rows),
    * keeps only rows whose ``type`` is ``'video.movie'``,
    * drops rows whose title contains two opening parentheses,
    * drops the ``Unnamed: 44``..``Unnamed: 47`` columns and any row with a
      missing value,
    * drops the ``url``, ``tid``, ``fn``, ``wordsInTitle`` and ``type`` columns,
    * normalises every title: removes a parenthesised four-digit year (keeping
      any letters that precede it inside the parentheses), strips punctuation,
      translates the text, removes ``&#39;`` entities and stems each token.

    NOTE(review): ``translate``, ``stem`` and ``tokenize`` are not imported in
    the visible part of this notebook -- presumably the Google Cloud translate
    client, ``nltk.stem`` and an NLTK tokenizer; confirm and import them before
    running. The translation client also needs Google Cloud credentials.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the first 11 remaining columns and
        ``y`` is every column after them.
    """
    client = translate.Client()
    stemmer = stem.snowball.EnglishStemmer()
    punctuation_table = str.maketrans('', '', string.punctuation)
    raw = pd.read_csv("./imdbmovies/imdb.csv")

    # A value in 'Unnamed: 44' marks an irregular row. isna() also copes with a
    # non-numeric column, where np.isnan would raise.
    cleaned = raw[raw['Unnamed: 44'].isna()]
    cleaned = cleaned[cleaned["type"] == 'video.movie']

    # Raw string avoids invalid-escape warnings; na=False keeps the mask boolean
    # when a title is missing (such rows are removed by dropna below anyway).
    has_two_parens = cleaned["title"].str.contains(r"\(.*\(", na=False)
    cleaned = cleaned[~has_two_parens]

    cleaned = (
        cleaned
        .drop(columns=["Unnamed: 44", "Unnamed: 45", "Unnamed: 46", "Unnamed: 47"])
        .dropna(axis=0)
        .reset_index(drop=True)
        .drop(columns=["url", "tid", "fn", "wordsInTitle", "type"])
    )

    def _normalise_title(title):
        """Strip year and punctuation, translate, then stem one title."""
        title = re.sub(r'\(([ a-zA-Z]*)([0-9]{4})\)', r'\1', title)
        title = title.translate(punctuation_table)
        title = client.translate(title)["translatedText"].replace("&#39;", "")
        return " ".join([stemmer.stem(word) for word in tokenize(title)])

    cleaned["title"] = cleaned["title"].apply(_normalise_title)

    X = cleaned.iloc[:, :11]
    y = cleaned.iloc[:, 11:]

    return X, y

In [27]:
# Fit a TF-IDF model on the movie titles and expose the matrix as a DataFrame
# whose columns are named after the vocabulary terms.
tfidfVector = extraction.TfidfVectorizer()
tfidf = tfidfVector.fit_transform(shortX['title'])
# vocabulary_ maps term -> column index; invert it so columns can be renamed.
key = dict(zip(tfidfVector.vocabulary_.values(), tfidfVector.vocabulary_.keys()))
words = pd.DataFrame(tfidf.todense()).rename(columns=key)

KeyError: 'title'

# Data Visualization

#### Word Cloud

Word clouds for the Drama, Romance and Comedy genres, which are the genres that occur more frequently in the current dataset

#### Based on Drama

In [None]:
# Pair every title with its Drama indicator and keep only the Drama movies.
word_cloud = pd.concat([shortX['title'], labelsY['Drama']], axis=1)
movies_for_wordcloud = word_cloud[word_cloud.Drama != 0]
# Join the *filtered* titles; joining the unfiltered frame would produce the
# same cloud for every genre.
values = " ".join(map(str, movies_for_wordcloud["title"].tolist()))
WordCloud(stopwords=STOPWORDS, width=1500, height=500).generate(values).to_image()

#### Based on Comedy

In [None]:
# Pair every title with its Comedy indicator and keep only the Comedy movies.
word_cloud = pd.concat([shortX['title'], labelsY['Comedy']], axis=1)
movies_for_wordcloud = word_cloud[word_cloud.Comedy != 0]
# Join the *filtered* titles; joining the unfiltered frame would produce the
# same cloud for every genre.
values = " ".join(map(str, movies_for_wordcloud["title"].tolist()))
WordCloud(stopwords=STOPWORDS, width=1500, height=500).generate(values).to_image()

#### Based on Romance

In [None]:
# Pair every title with its Romance indicator and keep only the Romance movies.
word_cloud = pd.concat([shortX['title'], labelsY['Romance']], axis=1)
movies_for_wordcloud = word_cloud[word_cloud.Romance != 0]
# Join the *filtered* titles; joining the unfiltered frame would produce the
# same cloud for every genre.
values = " ".join(map(str, movies_for_wordcloud["title"].tolist()))
WordCloud(stopwords=STOPWORDS, width=1500, height=500).generate(values).to_image()

#### Genre Frequency

Plotting a bar graph that shows each genre against its frequency of occurrence in the current dataset

Frequency Calculation:

In [22]:
# Total of each label column, keyed by the column (genre) name.
frequency = {genre: sum(labelsY[genre]) for genre in labelsY.columns.values}

In [None]:
# Bar chart of the per-genre totals collected in `frequency`.
plt.rcParams["figure.figsize"] = [12,10]
df = pd.DataFrame.from_dict(frequency, orient="index")
genre_ax = df.plot(kind='bar', ylim=(10,5050), legend=False, title='Frequencies for each Genre')
genre_ax.set_xlabel('Genre')
genre_ax.set_ylabel('Frequency')
plt.show()

# Modeling

In [None]:
# Remove the raw title text before modelling. Rebinding with errors='ignore'
# (instead of `del`) keeps this cell safe to re-run once the column is gone.
shortX = shortX.drop(columns=['title'], errors='ignore')

## Logistic Regression

In [None]:
# Logistic Regression without PCA

# Ten candidate C values drawn as e**u with u uniform on [0, 1).
C = np.power(np.e, np.random.uniform(0, 1, 10))
start_time = time.time()
# Both dicts map a C value to the list of results from its 5 random splits.
dict_loss = {}
dict_accuracy = {}
for c in C:
    dict_loss[c] = []
    dict_accuracy[c] = []
    for i in range(5):
        x_train, x_test, y_train, y_test = train_test_split(shortX, labelsY, test_size=.3)
        lr = OneVsRestClassifier(LogisticRegression(class_weight='balanced', C=c, solver='sag', max_iter=2500), n_jobs=-1)
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_test)
        # Score the predictions already computed; lr.score() would call
        # predict() a second time to obtain the same accuracy.
        dict_accuracy[c].append(accuracy_score(y_test, y_pred))
        dict_loss[c].append(hamming_loss(y_test, y_pred))

# This timer wraps model fitting and evaluation, not data loading.
print("Time to train models: {} seconds".format(time.time() - start_time))


# Plotting the accuracy and hamming loss vs C values

# Average the repeated splits once per C value, then order the points by C.
avg_loss = {k: np.mean(v) for k, v in dict_loss.items()}
avg_score = {k: np.mean(v) for k, v in dict_accuracy.items()}
c_for_loss, mean_loss = zip(*sorted(avg_loss.items()))
c_for_score, mean_score = zip(*sorted(avg_score.items()))

fig, ax = plt.subplots(figsize=(24,12))
ax2 = ax.twinx()
# Left axis shows accuracy and right axis shows Hamming loss, so each series
# sits on the axis whose label and legend describe it.
ax.plot(c_for_score, mean_score, color='green', marker='d', markersize=10)
ax2.plot(c_for_loss, mean_loss, color='orange', marker='d', markersize=10)

ax.set_ylabel('Accuracy',fontsize=10)
ax2.set_ylabel('Hamming Loss',fontsize=10)
ax.legend(['Accuracy'],loc=2,fontsize=15)
ax2.legend(['Loss'],loc=1,fontsize=15)
ax.set_xlabel('C- values',fontsize=10)
plt.title("Accuracy and Hamming Loss vs C- values without PCA")
ax.set_xscale('log')
plt.show()


# Logistic Regression with PCA

start_time = time.time()
# A float n_components asks PCA to keep enough components for that share of variance.
pca = PCA(n_components=0.95)

# Both dicts map a C value to the list of results from its 5 random splits.
dict_loss_pca = {}
dict_accuracy_pca = {}
for c in C:
    dict_loss_pca[c] = []
    dict_accuracy_pca[c] = []
    for i in range(5):
        x_train, x_test, y_train, y_test = train_test_split(shortX, labelsY, test_size=.3)
        # Fit the projection on the training split only, then apply it to both splits.
        pca.fit(x_train)
        x_train_reduced = pca.transform(x_train)
        x_test_reduced = pca.transform(x_test)
        lr_pca = OneVsRestClassifier(LogisticRegression(class_weight='balanced', C=c, solver='sag', max_iter=5000), n_jobs=-1)
        lr_pca.fit(x_train_reduced, y_train)
        y_pred = lr_pca.predict(x_test_reduced)
        # Score the predictions already computed; lr_pca.score() would call
        # predict() a second time to obtain the same accuracy.
        dict_accuracy_pca[c].append(accuracy_score(y_test, y_pred))
        dict_loss_pca[c].append(hamming_loss(y_test, y_pred))


# This timer wraps PCA fitting, model fitting and evaluation, not data loading.
print("Time to train models with PCA: {} seconds".format(time.time() - start_time))


# Plotting LR with PCA

# Average the repeated splits once per C value, then order the points by C.
avg_loss = {k: np.mean(v) for k, v in dict_loss_pca.items()}
avg_score = {k: np.mean(v) for k, v in dict_accuracy_pca.items()}
c_for_loss, mean_loss = zip(*sorted(avg_loss.items()))
c_for_score, mean_score = zip(*sorted(avg_score.items()))

fig, ax = plt.subplots(figsize=(24,12))
ax2 = ax.twinx()
# Left axis shows accuracy and right axis shows Hamming loss, so each series
# sits on the axis whose label and legend describe it.
ax.plot(c_for_score, mean_score, color='green', marker='d', markersize=10)
ax2.plot(c_for_loss, mean_loss, color='orange', marker='d', markersize=10)

ax.set_ylabel('Accuracy',fontsize=10)
ax2.set_ylabel('Hamming Loss',fontsize=10)
ax.legend(['Accuracy'],loc=2,fontsize=15)
ax2.legend(['Loss'],loc=1,fontsize=15)
ax.set_xlabel('C- values',fontsize=10)
plt.title("Accuracy and Hamming Loss vs C- values with PCA")
ax.set_xscale('log')
plt.show()

## SVM

### Given a prediction matrix, if no predicted value in a row is > 0, mark the maximum value as the predicted class; if one or more values are > 0, mark all of those values as the predicted classes

In [None]:
def get_pred(arr):
    """Convert one row of decision scores into a 0/1 indicator vector.

    Args:
        arr: 1-D array-like of decision scores.

    Returns:
        numpy.ndarray: if no score is greater than 0, a float array of zeros
        with a single 1 at the position of the largest score; otherwise an
        array of the input's dtype holding 1 where the score is > 0 and 0
        where it is <= 0.

    The input is never modified: the result is always a new array.
    """
    arr = np.asarray(arr)
    positive = arr > 0
    if not positive.any():
        # Nothing crosses the threshold: fall back to the single best score.
        result = np.zeros(arr.shape)
        result[np.argmax(arr)] = 1
        return result
    # Work on a copy so the caller's scores are left untouched.
    result = arr.copy()
    result[positive] = 1
    result[arr <= 0] = 0
    return result

In [None]:
# Evaluate a one-vs-rest LinearSVC on three feature sets, 15 random 90/10 splits each.
titleKey = {0: "No Title Data", 1: "50% Covar Title Data", 2: "Full Title Data"}
Loss = {name: [] for name in titleKey.values()}
Score = {name: [] for name in titleKey.values()}
for t in range(15):
    for i, X in enumerate((shortX, shortX.join(words50X), fullX)):
        title = titleKey[i]
        X_train, X_test, y_train, y_test = train_test_split(X, labelsY, test_size=.1)
        clf = OneVsRestClassifier(LinearSVC(dual=False, max_iter=10000))
        clf.fit(X_train, y_train)
        # Turn each row of decision scores into 0/1 label indicators.
        y_pred = np.apply_along_axis(get_pred, 1, clf.decision_function(X_test))
        Score[title].append(accuracy_score(y_test, y_pred))
        Loss[title].append(hamming_loss(y_test, y_pred))
        
# Per-trial accuracy for each feature set, read from this cell's own `Score`
# dict rather than the Random Forest cell's `trialScore`.
# The stray plt.clf() calls were dropped: with no figure open they only
# create an extra empty one.
_, ax = plt.subplots(figsize=(24,12))
for k, v in Score.items():
    ax.plot(v, label=k)
ax.set_ylabel("Accuracy")
ax.set_xlabel("Trials")
ax.set_title("Accuracy Of Datasets")
ax.legend(loc='lower right')
plt.show()

# Per-trial Hamming loss. figsize is an argument of plt.subplots();
# dict.items() accepts no arguments.
_, ax = plt.subplots(figsize=(24,12))
for k, v in Loss.items():
    ax.plot(v, label=k)
ax.set_ylabel("Loss")
ax.set_xlabel("Trials")
ax.set_title("Hamming Loss Of Datasets")
ax.legend(loc='upper right')
plt.show()

## KNN

In [None]:
def _knn_cross_validate(model, X, y, splitter):
    """Fit and evaluate ``model`` on every fold produced by ``splitter``.

    Returns two arrays with one entry per fold: the accuracy of the
    predictions and their Hamming loss.
    """
    n_folds = splitter.get_n_splits()
    fold_accuracy = np.zeros(n_folds)
    fold_loss = np.zeros(n_folds)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X)):
        model.fit(X[train_idx], y[train_idx])
        y_hat = model.predict(X[test_idx])
        # Reuse one prediction for both metrics; model.score() would predict again.
        fold_accuracy[fold] = accuracy_score(y[test_idx], y_hat)
        fold_loss[fold] = hamming_loss(y[test_idx], y_hat)
    return fold_accuracy, fold_loss


features = fullX.to_numpy()
labels = labelsY.to_numpy()
words50 = words50X.to_numpy()
# k-fold crossvalidation with 10 folds
kf = KFold(n_splits=10, random_state=1, shuffle=True)
genre_neigh = KNeighborsClassifier(n_neighbors=13)

# calculate accuracy and loss on the full feature table
result, loss = _knn_cross_validate(genre_neigh, features, labels, kf)

# calculate accuracy and loss with 50% covar, 10 fold CV
result_50, loss_50 = _knn_cross_validate(genre_neigh, words50, labels, kf)
    
# plot subset accuracy and Hamming loss
# NOTE(review): only `result`/`loss` are drawn; `result_50`/`loss_50` are
# computed above but never plotted -- confirm whether that is intended.
folds = np.arange(1, len(result) + 1)

fig, ax1 = plt.subplots(figsize=(24,12))
ax1.set_xlabel('Folds',fontsize=20)
ax1.set_ylabel('Subset Accuracy',fontsize=20)
ax1.plot(folds, result, color='red',marker='o',markersize=10)
ax1.legend(['Accuracy'],loc=2,fontsize=20)

# Second y-axis sharing the fold axis, used for the loss curve.
ax2 = ax1.twinx()

ax2.set_ylabel('Hamming Loss',fontsize=20)
ax2.plot(folds, loss, color='blue',marker='s',markersize=10)
ax2.legend(['Loss'],loc=1,fontsize=20)

plt.title('10-fold crossvalidation, KNN',fontsize=20)
fig.tight_layout()
plt.grid(True)
# plt.show must be called; the bare attribute reference did nothing.
plt.show()

## Random Forest

In [None]:
# Evaluate a one-vs-rest random forest on two feature sets, 10 random 90/10 splits each.
# Keys must match the positions produced by enumerate() below (0 and 1);
# the previous key 2 made titleKey[1] raise KeyError on the second dataset.
titleKey = {0: "No Title Data", 1: "Full Title Data"}
trialLoss = {v: [] for k, v in titleKey.items()}
trialScore = {v: [] for k, v in titleKey.items()}
for t in range(10):
    print("Starting trial ", t)
    for i, X in enumerate((shortX, fullX)):
        title = titleKey[i]
        X_train, X_test, y_train, y_test = train_test_split(X, labelsY, test_size=.1)
        RFC = RandomForestClassifier(n_estimators=100, oob_score=True)
        clf = OneVsRestClassifier(RFC)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        ham_loss = hamming_loss(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        trialScore[title].append(acc)
        trialLoss[title].append(ham_loss)
        
# One figure per metric: (per-dataset series, y-axis label, title, legend corner).
for series_by_dataset, y_label, plot_title, legend_loc in (
    (trialScore, "Accuracy", "Accuracy Of Datasets", 'lower right'),
    (trialLoss, "Loss", "Hamming Loss Of Datasets", 'upper right'),
):
    plt.clf()
    _, ax = plt.subplots(figsize=(24,12))
    # Draw one line per dataset, with the trial number along the x-axis.
    for k, v in series_by_dataset.items():
        ax.plot(v, label=k)
    plt.ylabel(y_label)
    plt.xlabel("Trials")
    plt.title(plot_title)
    plt.legend(loc=legend_loc)
    plt.show()