In [594]:
import pandas as pd

train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset.csv')

In [595]:
import numpy as np

names_train = ['features_countvec/train_countvec_features_actor_1_name.npy', 'features_countvec/train_countvec_features_actor_2_name.npy', 
               'features_countvec/train_countvec_features_director_name.npy', 'features_doc2vec/train_doc2vec_features_genre.npy',
               'features_doc2vec/train_doc2vec_features_plot_keywords.npy', 'features_fasttext/train_fasttext_title_embeddings.npy']
names_test = ['features_countvec/test_countvec_features_actor_1_name.npy', 'features_countvec/test_countvec_features_actor_2_name.npy', 
               'features_countvec/test_countvec_features_director_name.npy', 'features_doc2vec/test_doc2vec_features_genre.npy',
               'features_doc2vec/test_doc2vec_features_plot_keywords.npy', 'features_fasttext/test_fasttext_title_embeddings.npy']

for name in names_train:
    print(name)
    print(len(np.load(name)[0]))

# The count vectors have too many columns and need to be trimmed

features_countvec/train_countvec_features_actor_1_name.npy
2063
features_countvec/train_countvec_features_actor_2_name.npy
2919
features_countvec/train_countvec_features_director_name.npy
2113
features_doc2vec/train_doc2vec_features_genre.npy
100
features_doc2vec/train_doc2vec_features_plot_keywords.npy
100
features_fasttext/train_fasttext_title_embeddings.npy
100


In [596]:
def create_columns(len, unique):
    columns = []
    for i in range(len):
        columns.append(str(i + unique))
    return columns

In [597]:
temp1 = np.load('features_countvec/train_countvec_features_actor_1_name.npy')
df_act_1 = pd.DataFrame(temp1, columns=create_columns(len(temp1[0]), 0))

temp2 = np.load('features_countvec/train_countvec_features_actor_2_name.npy')
df_act_2 = pd.DataFrame(temp2, columns=create_columns(len(temp2[0]), len(temp1[0])))

temp3 = np.load('features_countvec/train_countvec_features_director_name.npy')
df_dir = pd.DataFrame(temp3, columns=create_columns(len(temp3[0]), len(temp1[0]) + len(temp2[0])))

temp4 = np.load('features_doc2vec/train_doc2vec_features_genre.npy')
df_genre = pd.DataFrame(temp4, columns=create_columns(len(temp4[0]), len(temp1[0]) + len(temp2[0]) + 100))

temp5 = np.load('features_doc2vec/train_doc2vec_features_plot_keywords.npy')
df_keyword = pd.DataFrame(temp5, columns=create_columns(len(temp5[0]), len(temp1[0]) + len(temp2[0]) + 200))

temp6 = np.load('features_fasttext/train_fasttext_title_embeddings.npy')
df_title = pd.DataFrame(temp6, columns=create_columns(len(temp6[0]), len(temp1[0]) + len(temp2[0]) + 300))

In [598]:
# Implement PCA to cut down columns further
from sklearn.decomposition import PCA

pca = PCA(n_components=1000)

df_list = [df_act_1, df_act_2, df_dir, df_genre, df_keyword, df_title]
train_pre_data = pd.concat(df_list, axis=1)

columns = []
for i in range(1000):
    columns.append(str(i))

principalComponents = pca.fit_transform(train_pre_data)

df_pca = pd.DataFrame(data=principalComponents, columns=columns)

df_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-0.04087,-0.020972,-0.000291,-0.019807,-0.067842,-0.039268,0.021977,0.009322,-0.022308,-0.05209,...,0.022151,0.062721,0.002703,0.009476,-0.024666,-0.011129,-8e-06,0.012796,0.023309,-0.006035
1,-0.033925,0.020134,-0.035483,0.005658,-0.060051,-0.015004,-0.044098,-0.028775,0.066608,-0.01912,...,-0.073046,0.038297,-0.044502,-0.02714,0.041589,0.113115,0.003074,-0.070862,-0.003725,-0.001442
2,-0.107427,0.041082,1.065319,0.087488,0.233568,-0.097163,-0.085989,-0.001578,0.078709,-0.038435,...,-0.02159,-0.036569,-0.066415,-0.018222,-0.038458,-0.053032,0.004183,0.052738,0.036958,-0.009681
3,0.954659,-0.382909,0.10239,-0.11821,-0.026847,-0.015419,-0.038244,-0.01905,0.041273,0.030041,...,0.009866,0.021702,-0.072854,0.023565,0.025225,-0.00184,-0.007941,0.004382,0.001915,0.025532
4,-0.039269,-0.068133,-0.053963,-0.056048,-0.171098,-0.109997,-0.184301,-0.69509,-0.328191,0.731292,...,-0.000423,-0.005158,-0.019132,-0.006927,-0.016068,-0.01627,-0.005062,0.026129,-0.033984,-0.041692


In [599]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=100, step=10)
test_x = selector.fit_transform(df_pca, train['imdb_score_binned'])

In [600]:
columns = []
for i in range(100):
    columns.append(str(i))

df_selected_train = pd.DataFrame(data=test_x, columns=columns)

In [601]:
# Drop old columns and unnecessary columns
titles = ['actor_1_name', 'actor_2_name', 'director_name', 'genres', 'plot_keywords', 'language', 'country', 'id',
            'title_embedding', 'movie_title', 'actor_3_name', 'actor_3_facebook_likes']
train_new = pd.concat([train.drop(columns=titles), df_selected_train], axis=1)

print(len(train_new.columns))


115


In [602]:
# Use OHE
train['content_rating'].value_counts()

content_rating
R            1362
PG-13        1039
PG            458
G              68
Not Rated      27
Unrated        19
Approved       14
X               9
NC-17           3
M               2
Passed          2
GP              1
Name: count, dtype: int64

In [603]:
# Combine ratings
train.loc[train.content_rating == 'GP', 'content_rating'] = 'G'
train.loc[train.content_rating == 'Passed', 'content_rating'] = 'Approved'
train.loc[train.content_rating == 'NC-17', 'content_rating'] = 'X'
train.loc[train.content_rating == 'Not Rated', 'content_rating'] = 'Unrated'
train.loc[train.content_rating == 'M', 'content_rating'] = 'PG-13'

train['content_rating'].value_counts()


content_rating
R           1362
PG-13       1041
PG           458
G             69
Unrated       46
Approved      16
X             12
Name: count, dtype: int64

In [604]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
encoded = enc.fit_transform(train[['content_rating']])

df_temp = pd.DataFrame(encoded.toarray(), columns=['Approved', 'G', 'PG', 'PG-13', 'R', 'Unrated', 'X'])

train_final = pd.concat([train_new.drop(columns=['content_rating']), df_temp], axis=1)

In [605]:
y = train_final['imdb_score_binned']
X = train_final.drop(columns=['imdb_score_binned'])

In [606]:
# Apply post processing, ie normalisation, standardisation
# Remove outliers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#scaler = MinMaxScaler()
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [607]:
# Get test data
temp1_test = np.load('features_countvec/test_countvec_features_actor_1_name.npy')
df_act_1_test = pd.DataFrame(temp1_test, columns=create_columns(len(temp1_test[0]), 0))

temp2_test = np.load('features_countvec/test_countvec_features_actor_2_name.npy')
df_act_2_test = pd.DataFrame(temp2_test, columns=create_columns(len(temp2_test[0]), len(temp1_test[0])))

temp3_test = np.load('features_countvec/test_countvec_features_director_name.npy')
df_dir_test = pd.DataFrame(temp3_test, columns=create_columns(len(temp3_test[0]), len(temp1_test[0]) + len(temp2_test[0])))

temp4_test = np.load('features_doc2vec/test_doc2vec_features_genre.npy')
df_genre_test = pd.DataFrame(temp4_test, columns=create_columns(len(temp4_test[0]), len(temp1_test[0]) + len(temp2_test[0]) + 100))

temp5_test = np.load('features_doc2vec/test_doc2vec_features_plot_keywords.npy')
df_keyword_test = pd.DataFrame(temp5_test, columns=create_columns(len(temp5_test[0]), len(temp1_test[0]) + len(temp2_test[0]) + 200))

temp6_test = np.load('features_fasttext/test_fasttext_title_embeddings.npy')
df_title_test = pd.DataFrame(temp6_test, columns=create_columns(len(temp6_test[0]), len(temp1_test[0]) + len(temp2_test[0]) + 300))

#df_act_1_test = drop_col(df_act_1_test)
#df_act_2_test = drop_col(df_act_2_test)
#df_dir_test = drop_col(df_dir_test)

In [608]:
df_list = [df_act_1_test, df_act_2_test, df_dir_test, df_genre_test, df_keyword_test, df_title_test]
test_pre_data = pd.concat(df_list, axis=1)

In [609]:
# Apply fitted PCA to test data
test_pcs = pca.transform(test_pre_data)

columns = []
for i in range(1000):
    columns.append(str(i))

df_pca_test = pd.DataFrame(data=test_pcs, columns=columns)

In [610]:
x_test_100 = selector.transform(df_pca_test)

In [611]:
columns = []
for i in range(100):
    columns.append(str(i))

df_selected_test = pd.DataFrame(data=x_test_100, columns=columns)

In [612]:
test_new = pd.concat([test.drop(columns=titles), df_selected_test], axis=1)

print(len(test_new.columns))

114


In [624]:
# Remove low frequency ratings
#count = test_new['content_rating'].value_counts()
#test_new = test_new[~test_new['content_rating'].isin(count[count < 15].index)]  

# Combine ratings
test.loc[test.content_rating == 'GP', 'content_rating'] = 'G'
test.loc[test.content_rating == 'Passed', 'content_rating'] = 'Approved'
test.loc[test.content_rating == 'NC-17', 'content_rating'] = 'X'
test.loc[test.content_rating == 'Not Rated', 'content_rating'] = 'Unrated'
test.loc[test.content_rating == 'M', 'content_rating'] = 'PG-13'

encoded = enc.transform(test[['content_rating']])

df_temp = pd.DataFrame(encoded.toarray(), columns=['Approved', 'G', 'PG', 'PG-13', 'R', 'Unrated', 'X'])

test_new = pd.concat([test_new.drop(columns='content_rating'), df_temp], axis=1)

KeyError: "['content_rating'] not found in axis"

In [622]:
X_norm_test = scaler.transform(test_new)

In [625]:
# Remove features that were hurting our model
remove_negative_features = ['num_critic_for_reviews', 'director_facebook_likes', 'actor_1_facebook_likes', 'facenumber_in_poster', 
                            'average_degree_centrality', '0', '1', '3', '5', '6', '9', '10', '12', '13', '14', '15', 
                            '16', '17', '19', '21', '23', '24', '25', '27', '28', '29', '30', '31', '33', '36', '37', '38', '44', 
                            '47', '48', '50', '51', '54', '55', '58', '62', '63', '64', '66', '68', '72', '73', '74', '76', '77', 
                            '79', '81', '82', '85', '88', '89', '90', '92', '93', '95', '96', '98', 'G', 'PG', 'R']

X_norm_test_dropped = pd.DataFrame(X_norm_test, columns=test_new.columns).drop(columns=remove_negative_features)

X_norm_dropped = pd.DataFrame(X_norm, columns=test_new.columns).drop(columns=remove_negative_features)

In [626]:
from sklearn.ensemble import GradientBoostingClassifier
GTB = GradientBoostingClassifier(n_estimators=100, learning_rate=0.3,
                                 max_depth=5, random_state=0).fit(X_norm_dropped, y)
y_GTB = GTB.predict(X_norm_test_dropped)
df_GTB = pd.DataFrame(y_GTB, columns=['imdb_score_binned'])
df_GTB.insert(0, 'id', range(1, len(df_GTB) + 1))
df_GTB['imdb_score_binned'].value_counts()

imdb_score_binned
2    586
3    129
4     24
1     12
0      1
Name: count, dtype: int64

In [616]:
df_GTB.to_csv('submission.csv', index=False) 