In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score, hamming_loss
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the target labels and the three feature tables used below.
# vectorization50.csv is read with header=None, so its first row is treated
# as data and its columns get integer names.
labelsY = pd.read_csv("../imdbmovies/labels.csv")
words50X = pd.read_csv("../imdbmovies/vectorization50.csv", header=None)
fullX = pd.read_csv("../imdbmovies/features_vectorized.csv")
shortX = pd.read_csv("../imdbmovies/features.csv")

In [None]:
# Remove the "title" column from the base feature table before modelling.
# drop(..., errors="ignore") returns a new frame and does nothing when the
# column is already gone, so this cell can be re-run safely; the previous
# in-place shortX.pop("title") raised KeyError on a second execution.
shortX = shortX.drop(columns=["title"], errors="ignore")

# Optimizing the Number of Estimators

In [None]:
# Candidate n_estimators values for the random-forest sweep below.
estimators = [
    10,
    50,
    100,
    500,
    1000,
]

In [None]:
# Sweep over forest sizes: for each n_estimators value run three train/test
# trials and record accuracy_score and hamming_loss on the held-out 10%.
score = {}
loss = {}
# The joined feature table is identical for every trial, so build it once
# instead of re-joining inside the inner loop.
sweepX = shortX.join(words50X)
for e in estimators:
    score[e] = []
    loss[e] = []
    print("Testing estimator ", e)
    for trial in range(3):
        # Seeding with the trial number makes the sweep reproducible and gives
        # every forest size the same three splits, so differences between
        # sizes are not just split-to-split noise.
        X_train, X_test, y_train, y_test = train_test_split(
            sweepX, labelsY, test_size=.1, random_state=trial
        )
        RFC = RandomForestClassifier(n_estimators=e, oob_score=True, random_state=trial)
        clf = OneVsRestClassifier(RFC)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # Compute each metric once and reuse it (previously both were
        # evaluated twice and the first results were discarded).
        acc = accuracy_score(y_test, y_pred)
        ham_loss = hamming_loss(y_test, y_pred)
        score[e].append(acc)
        loss[e].append(ham_loss)
        print("Finished Trial ", trial)

In [None]:
# Single fit of a default-sized forest on the fullX feature table.
# Fixed seeds make the split and the forest reproducible across
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    fullX, labelsY, test_size=.1, random_state=0
)
RFC = RandomForestClassifier(oob_score=True, random_state=0)
clf = OneVsRestClassifier(RFC)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
ham_loss = hamming_loss(y_test, y_pred)
# Show the two metrics as the cell output; previously they were computed
# but never displayed.
acc, ham_loss

In [None]:
# Mean accuracy per n_estimators value from the sweep, on a log-scaled x axis.
avgScore = {k: np.mean(v) for k, v in score.items()}
lists = sorted(avgScore.items())
x_plot, y_plot = zip(*lists)

# No plt.clf() here: with no figure open it creates an extra empty figure
# that is then shown alongside the real one.
_, ax = plt.subplots()
# The label gives legend() something to show; without it the legend is empty.
ax.plot(x_plot, y_plot, label="Mean accuracy")
ax.set_xscale('log')
ax.set_xlabel("Number of estimators")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs. Number of Estimators")
ax.legend(loc='lower right')
# plt.savefig("./figures/EstimatorsvAccuracy.png")
plt.show()

In [None]:
# Mean Hamming loss per n_estimators value from the sweep, on a log-scaled x axis.
avgLoss = {k: np.mean(v) for k, v in loss.items()}
lists = sorted(avgLoss.items())
x_plot, y_plot = zip(*lists)

# No plt.clf() here: with no figure open it creates an extra empty figure
# that is then shown alongside the real one.
_, ax = plt.subplots()
# The label gives legend() something to show; without it the legend is empty.
ax.plot(x_plot, y_plot, label="Mean Hamming loss")
ax.set_xscale('log')
ax.set_xlabel("Number of estimators")
# This axis shows loss, not accuracy (the old label was a copy-paste error).
ax.set_ylabel("Hamming Loss")
ax.set_title("Hamming Loss vs. Number of Estimators")
ax.legend(loc='lower right')
# Distinct file name so saving this figure cannot overwrite the accuracy plot.
# plt.savefig("./figures/EstimatorsvHammingLoss.png")
plt.show()

# Modeling

In [None]:
# Compare the three feature tables over ten repeated train/test trials,
# recording accuracy_score and hamming_loss per trial for each table.
titleKey = {0: "No Title Data", 1: "50% Covar Title Data", 2: "Full Title Data"}
trialLoss = {name: [] for name in titleKey.values()}
trialScore = {name: [] for name in titleKey.values()}
# Build the candidate tables once; their order matches the keys of titleKey.
featureSets = (shortX, shortX.join(words50X), fullX)
for t in range(10):
    print("Starting trial ", t)
    for i, X in enumerate(featureSets):
        title = titleKey[i]
        # Every table is split against the same labelsY, so they all have the
        # same number of rows; reusing the trial number as the seed therefore
        # holds out the same rows for all three tables within a trial. That
        # makes the comparison paired and the whole experiment reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, labelsY, test_size=.1, random_state=t
        )
        RFC = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=t)
        clf = OneVsRestClassifier(RFC)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        ham_loss = hamming_loss(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        trialScore[title].append(acc)
        trialLoss[title].append(ham_loss)

## Plotting Accuracy

In [None]:
# Accuracy per trial, one line per feature table.
# No plt.clf() here: with no figure open it creates an extra empty figure
# that is then shown alongside the real one.
_, ax = plt.subplots()
for name, accs in trialScore.items():
    ax.plot(accs, label=name)
ax.set_xlabel("Trials")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy Of Datasets")
ax.legend(loc='lower right')
# plt.savefig("../figures/DatasetvAccuracy.png")
plt.show()

In [None]:
# Box plot of per-trial accuracy, one box per feature table.
# column_stack takes its shape from the recorded results, so this cell keeps
# working if the number of trials or feature tables changes (the old
# np.zeros((10, 3)) buffer hard-coded both).
colScore = np.column_stack(list(trialScore.values()))

# No plt.clf() here: with no figure open it creates an extra empty figure.
_, ax = plt.subplots()
ax.boxplot(colScore)
# Tick labels are set on the axes because boxplot's `labels` keyword is
# deprecated in newer matplotlib; this form works on old and new versions.
ax.set_xticklabels(list(trialScore.keys()))
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy Of Datasets")
# plt.savefig("../figures/DatasetvAccuracyBoxPlot.png")
plt.show()

## Plotting Hamming Loss

In [None]:
# Hamming loss per trial, one line per feature table.
# No plt.clf() here: with no figure open it creates an extra empty figure
# that is then shown alongside the real one.
_, ax = plt.subplots()
for name, losses in trialLoss.items():
    ax.plot(losses, label=name)
ax.set_xlabel("Trials")
ax.set_ylabel("Loss")
ax.set_title("Hamming Loss Of Datasets")
ax.legend(loc='upper right')
# plt.savefig("../figures/DatasetvHammingLoss.png")
plt.show()

In [None]:
# Box plot of per-trial Hamming loss, one box per feature table.
# The name colScore is kept from the original cell even though it holds
# losses here. column_stack takes its shape from the recorded results rather
# than a hard-coded (10, 3) buffer.
colScore = np.column_stack(list(trialLoss.values()))

# No plt.clf() here: with no figure open it creates an extra empty figure.
_, ax = plt.subplots()
ax.boxplot(colScore)
# Tick labels are set on the axes because boxplot's `labels` keyword is
# deprecated in newer matplotlib; this form works on old and new versions.
ax.set_xticklabels(list(trialLoss.keys()))
ax.set_ylabel("Loss")
ax.set_title("Hamming Loss Of Datasets")
# plt.savefig("../figures/DatasetvHammingLossBoxPlot.png")
plt.show()