# Machine Learning Assignment 2

### By Alexander de Vryer (1356034)

## Necessary Imports and Helper Functions:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from statistics import mean


In [2]:
# load the csv files into a df
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset.csv')

# test prediction format used for final model output
test_predictions = pd.DataFrame(test['id'])


# helper functions used throughout
def one_hot_encoder(train, test, att):
    ohc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    train_element = pd.DataFrame(ohc.fit_transform(train[att].to_numpy().reshape(-1, 1)))
    test_element = pd.DataFrame(ohc.transform(test[att].to_numpy().reshape(-1, 1)))
    return train_element, test_element

# draw a scatter plot of the data
def draw_plot(att1):
    sns.scatterplot(x = train[att1], y = train['imdb_score_binned'], alpha=0.4)
    plt.xlabel(att1)
    plt.ylabel('imdb_score_binned')
    plt.title(f'{att1} vs imdb_score')
    plt.savefig(f'Scatter {att1} vs imdb_score.png')
    plt.close("all")
    

## Read Data:

In [3]:
# load the np arrays
train_actor_1_name = pd.DataFrame(np.load('train_countvec_features_actor_1_name.npy'))
test_actor_1_name = pd.DataFrame(np.load('test_countvec_features_actor_1_name.npy'))

train_actor_2_name = pd.DataFrame(np.load('train_countvec_features_actor_2_name.npy'))
test_actor_2_name = pd.DataFrame(np.load('test_countvec_features_actor_2_name.npy'))

train_director_name = pd.DataFrame(np.load('train_countvec_features_director_name.npy'))
test_director_name = pd.DataFrame(np.load('test_countvec_features_director_name.npy'))

train_genres = pd.DataFrame(np.load('train_doc2vec_features_genre.npy'))
test_genres = pd.DataFrame(np.load('test_doc2vec_features_genre.npy'))

train_plot_keywords = pd.DataFrame(np.load('train_doc2vec_features_plot_keywords.npy'))
test_plot_keywords = pd.DataFrame(np.load('test_doc2vec_features_plot_keywords.npy'))

train_title_embedding = pd.DataFrame(np.load('train_fasttext_title_embeddings.npy'))
test_title_embedding = pd.DataFrame(np.load('test_fasttext_title_embeddings.npy'))

# apply one-hot encoding to categorical features
train_content_rating, test_content_rating = one_hot_encoder(train, test, 'content_rating')
train_language, test_language = one_hot_encoder(train, test, 'language')
train_country, test_country = one_hot_encoder(train, test, 'country')


# drop all non-numeric features from the main df
features_to_drop = ['id', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name', 'movie_title', 
                    'genres', 'language', 'country', 'content_rating', 'title_year', 'plot_keywords', 
                    'title_embedding']

for feature in features_to_drop:
    train = train.drop(feature, axis='columns')
    test = test.drop(feature, axis='columns')


# Feature Engineering:

## Normalisation:

In [4]:
# the numerical attributes we want to standardise
numerical_attributes = ['num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 
                        'actor_1_facebook_likes', 'gross', 'num_voted_users', 'cast_total_facebook_likes', 
                        'facenumber_in_poster', 'num_user_for_reviews', 'actor_2_facebook_likes', 
                        'movie_facebook_likes', 'average_degree_centrality']

# standardise all numeric attributes to have mean 0 and std dev 1
std = StandardScaler()
train[numerical_attributes] = std.fit_transform(train[numerical_attributes])
test[numerical_attributes] = std.transform(test[numerical_attributes])

train_genres = std.fit_transform(train_genres)
test_genres = std.transform(test_genres)

train_plot_keywords = std.fit_transform(train_plot_keywords)
test_plot_keywords = std.transform(test_plot_keywords)

train_title_embedding = std.fit_transform(train_title_embedding)
test_title_embedding = std.transform(test_title_embedding)


## Feature Selection using Mutual Information:

In [5]:
# evaluate the mutual information scores of each feature in the dataset and print the highest in sorted order

mi_scores = mutual_info_classif(train.iloc[:,:-1], train.iloc[:,-1], discrete_features=False, n_neighbors=9)
print(f"Original continuous features: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_actor_1_name, train.iloc[:,-1], discrete_features=True, n_neighbors=9)
print(f"Actor 1: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_actor_2_name, train.iloc[:,-1], discrete_features=True, n_neighbors=9)
print(f"Actor 2: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_director_name, train.iloc[:,-1], discrete_features=True, n_neighbors=9)
print(f"Director: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_genres, train.iloc[:,-1], discrete_features=False, n_neighbors=9)
print(f"Genres: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_plot_keywords, train.iloc[:,-1], discrete_features=False, n_neighbors=9)
print(f"Plot keywords: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_title_embedding, train.iloc[:,-1], discrete_features=False, n_neighbors=9)
print(f"Title embeddings: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_content_rating, train.iloc[:,-1], discrete_features=True, n_neighbors=9)
print(f"Content rating: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_language, train.iloc[:,-1], discrete_features=True, n_neighbors=9)
print(f"Language: {sorted(mi_scores, reverse=True)[0:10]}\n")

mi_scores = mutual_info_classif(train_country, train.iloc[:,-1], discrete_features=True, n_neighbors=9)
print(f"Country: {sorted(mi_scores, reverse=True)[0:10]}")


Original continuous features: [0.17680579485444792, 0.09349919324041922, 0.0829956328122119, 0.08072565833912648, 0.0650280251135027, 0.0633598594307263, 0.038647284262994575, 0.0377363311929404, 0.03723657609921416, 0.03482002316325028]

Actor 1: [0.003534217741481222, 0.0034572371302196755, 0.0033873906249415435, 0.0033873906249415435, 0.002951848946812396, 0.002639922618862164, 0.002550624480618293, 0.0023502125095054035, 0.0022063069268572657, 0.002179560531694472]

Actor 2: [0.002934984662683963, 0.002934984662683963, 0.0028764549617678447, 0.002403405733233436, 0.0023317962851738617, 0.0023150125224301404, 0.0023137696470028495, 0.0021510663378194227, 0.002100763620702814, 0.002100763620702814]

Director: [0.004820623907788771, 0.004503269364648477, 0.0043202268406079165, 0.003520564833607431, 0.0034568063407268445, 0.0034568063407268445, 0.003154900682523741, 0.003154900682523741, 0.002625567762462727, 0.0024742706677133713]

Genres: [0.049570624646464, 0.04418245142969557, 0.04

## Feature Combinations of High-MI Features using Polynomial Features:

In [9]:
# SOME OF THE CODE BELOW HAS BEEN ADAPTED FROM ANSWERS POSTED ON STACKOVERFLOW, LINK IS BELOW:
# https://stackoverflow.com/questions/39839112/the-easiest-way-for-getting-feature-names-after-running-selectkbest-in-scikit-le

# select the 6-highest MI features and create degree-6 polynomial features using these

mi = SelectKBest(mutual_info_classif, k=6)
poly = PolynomialFeatures(degree=6)


# ORIGINAL CONTINUOUS FEATURES
# get the highest MI features
mi.fit(train.iloc[:,:-1], train.iloc[:,-1])
print(f"Highest MI continuous features: {mi.get_feature_names_out()}\n")
col_index = mi.get_support(indices=True)
train_poly = pd.DataFrame(train.iloc[:,:-1]).iloc[:,col_index]
test_poly = pd.DataFrame(test).iloc[:,col_index]

# create the polynomial features
train_poly = poly.fit_transform(train_poly)
test_poly = poly.transform(test_poly)
col_names = poly.get_feature_names_out()

# standardise the features
train_poly = std.fit_transform(train_poly)
test_poly = std.transform(test_poly)

# create a df for these features and calculate the mi scores for them
train_poly = pd.DataFrame(train_poly, columns=col_names)
test_poly = pd.DataFrame(test_poly, columns=col_names)

mi_scores = mutual_info_classif(train_poly, train.iloc[:,-1], discrete_features=False, n_neighbors=9)
print(f"Original continuous feature combinations: {sorted(mi_scores, reverse=True)[0:20]}\n")

# ABOVE PROCESS IS REPEATED FOR THE BELOW TWO DATA COLLECTIONS
# GENRES
mi.fit(train_genres, train.iloc[:,-1])
print(f"Highest MI genre features: {mi.get_feature_names_out()}\n")
col_index = mi.get_support(indices=True)
train_genre_poly = pd.DataFrame(train_genres).iloc[:,col_index]
test_genre_poly = pd.DataFrame(test_genres).iloc[:,col_index]

train_genre_poly = poly.fit_transform(train_genre_poly)
test_genre_poly = poly.transform(test_genre_poly)
col_names = poly.get_feature_names_out()

train_genre_poly = std.fit_transform(train_genre_poly)
test_genre_poly = std.transform(test_genre_poly)

train_genre_poly = pd.DataFrame(train_genre_poly, columns=col_names)
test_genre_poly = pd.DataFrame(test_genre_poly, columns=col_names)

mi_scores = mutual_info_classif(train_genre_poly, train.iloc[:,-1], discrete_features=False, n_neighbors=9)
print(f"Genre feature combinations: {sorted(mi_scores, reverse=True)[0:10]}\n")


# TITLE EMBEDDINGS
mi.fit(train_title_embedding, train.iloc[:,-1])
print(f"Highest MI title embedding features: {mi.get_feature_names_out()}\n")
col_index = mi.get_support(indices=True)
train_title_poly = pd.DataFrame(train_title_embedding).iloc[:,col_index]
test_title_poly = pd.DataFrame(test_title_embedding).iloc[:,col_index]

train_title_poly = poly.fit_transform(train_title_poly)
test_title_poly = poly.transform(test_title_poly)
col_names = poly.get_feature_names_out()

train_title_poly = std.fit_transform(train_title_poly)
test_title_poly = std.transform(test_title_poly)

train_title_poly = pd.DataFrame(train_title_poly, columns=col_names)
test_title_poly = pd.DataFrame(test_title_poly, columns=col_names)

mi_scores = mutual_info_classif(train_title_poly, train.iloc[:,-1], discrete_features=False, n_neighbors=9)
print(f"Title embedding feature combinations: {sorted(mi_scores, reverse=True)[0:5]}")


Highest MI continuous features: ['num_critic_for_reviews' 'duration' 'director_facebook_likes'
 'num_voted_users' 'num_user_for_reviews' 'movie_facebook_likes']

Original continuous feature combinations: [0.17667988759507036, 0.17455997823467895, 0.17288838232017945, 0.1629240446020579, 0.16235615896105182, 0.15996964717325834, 0.1565325868557288, 0.15451985892431352, 0.1531338214089062, 0.15284811836171164, 0.15152185023508258, 0.15023885207372967, 0.14993124058245577, 0.14898916976860432, 0.1488764264169964, 0.14863916530124088, 0.14689367742615245, 0.1466123009535938, 0.14287772653195896, 0.1424995773872202]

Highest MI genre features: ['x18' 'x31' 'x32' 'x40' 'x85' 'x89']

Genre feature combinations: [0.05037355885700068, 0.048849410097768775, 0.04814406698732654, 0.04808201970921244, 0.04800306123349252, 0.047558777411201625, 0.04753448458271725, 0.0469442188376652, 0.04661407831241737, 0.046389028642880525]

Highest MI title embedding features: ['x15' 'x20' 'x24' 'x44' 'x50' 'x63

In [10]:
# find and select the 20 best original continuous polynomial features
mi_poly = SelectKBest(mutual_info_classif, k=20)
mi_poly.fit(train_poly, train.iloc[:,-1])
col_index = mi_poly.get_support(indices=True)
train_poly_best = pd.DataFrame(train_poly).iloc[:,col_index]
test_poly_best = pd.DataFrame(test_poly).iloc[:,col_index]
print(f"Continuous features selected: {mi_poly.get_feature_names_out()}\n")

# find and select the 10 best genre polynomial features
mi_poly = SelectKBest(mutual_info_classif, k=10)
mi_poly.fit(train_genre_poly, train.iloc[:,-1])
col_index = mi_poly.get_support(indices=True)
train_genre_best = pd.DataFrame(train_genre_poly).iloc[:,col_index]
test_genre_best = pd.DataFrame(test_genre_poly).iloc[:,col_index]
print(f"Genre features selected: {mi_poly.get_feature_names_out()}\n")

# find and select the 5 best title embedding polynomial features
mi_poly = SelectKBest(mutual_info_classif, k=5)
mi_poly.fit(train_title_poly, train.iloc[:,-1])
col_index = mi_poly.get_support(indices=True)
train_title_best = pd.DataFrame(train_title_poly).iloc[:,col_index]
test_title_best = pd.DataFrame(test_title_poly).iloc[:,col_index]
print(f"Title embedding features selected: {mi_poly.get_feature_names_out()}")

# combine the features into a dataframe
train_overall = pd.concat([train_poly_best, train_genre_best, train_title_best], axis=1)
test_overall = pd.concat([test_poly_best, test_genre_best, test_title_best], axis=1)


Continuous features selected: ['num_voted_users' 'num_voted_users^2'
 'director_facebook_likes^2 num_voted_users' 'num_voted_users^3'
 'duration director_facebook_likes num_voted_users^2'
 'director_facebook_likes^3 num_voted_users'
 'director_facebook_likes num_voted_users^3' 'num_voted_users^4'
 'num_critic_for_reviews^2 num_voted_users^3'
 'duration^2 num_voted_users^3'
 'duration^2 num_voted_users movie_facebook_likes^2'
 'duration director_facebook_likes num_voted_users^2 movie_facebook_likes'
 'duration num_voted_users^4' 'director_facebook_likes^4 num_voted_users'
 'director_facebook_likes^2 num_voted_users^3' 'num_voted_users^5'
 'num_voted_users^4 movie_facebook_likes' 'duration num_voted_users^5'
 'director_facebook_likes num_voted_users^4 movie_facebook_likes'
 'num_voted_users^6']

Genre features selected: ['x0^2 x2 x3' 'x3^3 x5' 'x1 x3^4' 'x4 x5^4' 'x0^3 x2^2 x4'
 'x0^2 x1 x2 x4 x5' 'x0^2 x2 x4^2 x5' 'x0 x3 x4 x5^3' 'x1 x3^3 x4 x5'
 'x3^4 x4 x5']

Title embedding features 

# Classifiers:

## Hyperparameter Tuning:

## AdaBoost Classifier

In [11]:
# Tune the learning rate and number of base estimators used for ABC

for i in [25, 50, 75, 100]:
    for j in [1.0, 1.25, 1.5, 1.75, 2.0, 2.25]:
        abc = AdaBoostClassifier(n_estimators=i, learning_rate=j)
        abc.fit(train_overall, train.iloc[:,-1])
        abc_acc = cross_val_score(abc, train_overall, train.iloc[:,-1], cv = 10)
        print(f"{i} estimators and {j} learning rate: {mean(abc_acc)}")


25 estimators and 1.0 learning rate: 0.590875968992248
25 estimators and 1.25 learning rate: 0.4873565891472868
25 estimators and 1.5 learning rate: 0.4344141749723145
25 estimators and 1.75 learning rate: 0.32825359911406427
25 estimators and 2.0 learning rate: 0.5995980066445182
25 estimators and 2.25 learning rate: 0.49265227021040975
50 estimators and 1.0 learning rate: 0.506
50 estimators and 1.25 learning rate: 0.4364595791805094
50 estimators and 1.5 learning rate: 0.36286932447397563
50 estimators and 1.75 learning rate: 0.3269091915836102
50 estimators and 2.0 learning rate: 0.13111627906976744
50 estimators and 2.25 learning rate: 0.15576190476190477
75 estimators and 1.0 learning rate: 0.4663798449612403
75 estimators and 1.25 learning rate: 0.381516057585825
75 estimators and 1.5 learning rate: 0.33156921373200443
75 estimators and 1.75 learning rate: 0.31621483942414175
75 estimators and 2.0 learning rate: 0.572922480620155
75 estimators and 2.25 learning rate: 0.144073089

## Random Forest Classifier

In [12]:
# Tune the number of trees in the forest for RFC

for i in [25, 50, 75, 100, 125, 150]:
    rfc = RandomForestClassifier(n_estimators=i)
    rfc.fit(train_overall, train.iloc[:,-1])
    rfc_acc = cross_val_score(rfc, train_overall, train.iloc[:,-1], cv = 10)
    print(f"{i} estimators: {mean(rfc_acc)}")
    

25 estimators: 0.6810874861572536
50 estimators: 0.6854163898117386
75 estimators: 0.6934119601328903
100 estimators: 0.6880841638981174
125 estimators: 0.6914119601328904
150 estimators: 0.6894119601328904


## Bagging Classifier

In [14]:
# Tune the base classifier and number of base estimators used for BC

for classifier in [DecisionTreeClassifier(), SVC(), GaussianNB()]:
    for i in [10, 20, 30, 40 , 50]:
        bc = BaggingClassifier(estimator=classifier, n_estimators=i)
        bc.fit(train_overall, train.iloc[:,-1])
        bc_acc = cross_val_score(rfc, train_overall, train.iloc[:,-1], cv = 10)
        print(f"{classifier} base classifier and {i} estimators: {mean(bc_acc)}")
        

DecisionTreeClassifier() base classifier and 10 estimators: 0.6880808416389812
DecisionTreeClassifier() base classifier and 20 estimators: 0.6900797342192692
DecisionTreeClassifier() base classifier and 30 estimators: 0.692407530454042
DecisionTreeClassifier() base classifier and 40 estimators: 0.6924152823920265
DecisionTreeClassifier() base classifier and 50 estimators: 0.691750830564784
SVC() base classifier and 10 estimators: 0.6910874861572536
SVC() base classifier and 20 estimators: 0.6934119601328904
SVC() base classifier and 30 estimators: 0.694079734219269
SVC() base classifier and 40 estimators: 0.6954119601328903
SVC() base classifier and 50 estimators: 0.6947452934662237
GaussianNB() base classifier and 10 estimators: 0.695078626799557
GaussianNB() base classifier and 20 estimators: 0.69174861572536
GaussianNB() base classifier and 30 estimators: 0.6927541528239203
GaussianNB() base classifier and 40 estimators: 0.6934174972314507
GaussianNB() base classifier and 50 estimat

# Final Models:

## Model Construction

In [15]:
# ABC
abc = AdaBoostClassifier(n_estimators=25, learning_rate=2.0)

# RFC
rfc = RandomForestClassifier(n_estimators=75)

# BC
bc = BaggingClassifier(estimator=SVC(), n_estimators=40)


## Confusion Matrix using Holdout

In [21]:
# create a train-test split with a 20% holdout
X_train, X_test, y_train, y_test = train_test_split(train_overall, train.iloc[:,-1], test_size=0.2)

for classifier in [(abc, "AdaBoostClassifier"), (rfc, "RandomForestClassifier"), (bc, "BaggingClassifier")]:
    # train each model on the 80% data
    classifier[0].fit(X_train, y_train)
    y_pred = classifier[0].predict(X_test)
    
    # create a confusion matrix for the predictions
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f"Confusion Matrix for {classifier[1]}")
    plt.savefig(f"confusionMatrix {classifier[1]}.png")
    plt.close("all")
    

## Final Predictions Output

In [23]:
for classifier in [(abc, "AdaBoostClassifier"), (rfc, "RandomForestClassifier"), (bc, "BaggingClassifier")]:
    # train the model on the entire training set and output the predictions to a csv file
    classifier[0].fit(train_overall, train.iloc[:,-1])
    test_predictions['imdb_score_binned'] = classifier[0].predict(test_overall)
    test_predictions.to_csv(f'test_predictions_{classifier[1]}.csv', index = False)
