In [1]:
import pandas as pd

train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset.csv')
train.head()

Unnamed: 0,id,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,title_year,actor_2_facebook_likes,movie_facebook_likes,title_embedding,average_degree_centrality,imdb_score_binned
0,1,Roger Allers,186,73,28,847,Nathan Lane,2000,422783777,Adventure|Animation|Drama|Family|Musical,...,656,English,USA,G,1994,886,17000,[-4.1984697e-03 4.2941985e-03 -1.1961063e-03 ...,0.001576,4
1,2,George A. Romero,252,97,0,233,Shawn Roberts,654,20433940,Horror,...,662,English,Canada,R,2005,529,0,[-4.7586653e-03 2.6511205e-03 -3.7954253e-04 ...,0.000675,2
2,3,David Gordon Green,232,117,234,221,Tye Sheridan,12000,371897,Drama,...,118,English,USA,R,2013,1000,11000,[ 2.78131524e-03 -3.15494463e-03 -6.38332494e-...,0.003002,2
3,4,John Carpenter,297,109,0,145,Richard Masur,957,13782838,Horror|Mystery|Sci-Fi,...,911,English,USA,R,1982,163,23000,[-5.32674184e-03 3.60742491e-03 7.91795843e-...,0.001726,4
4,5,Peter Jackson,297,171,0,857,Orlando Bloom,16000,313837577,Action|Adventure|Drama|Fantasy,...,5060,English,New Zealand,PG-13,2001,5000,21000,[-4.2586620e-03 3.6257182e-03 -1.5326265e-03 ...,0.001876,4


In [2]:
# Remove any less than 10
train['country'].value_counts()

country
USA               2382
UK                 255
France              86
Germany             61
Canada              46
Australia           32
Spain               21
Japan               12
New Zealand         10
China                9
Hong Kong            9
Italy                8
South Korea          7
Ireland              6
Mexico               6
Denmark              6
Brazil               4
Thailand             3
South Africa         3
Argentina            3
Iran                 3
Netherlands          3
India                3
Norway               3
Taiwan               2
Czech Republic       2
Russia               2
Romania              2
Hungary              2
Colombia             1
Israel               1
West Germany         1
Official site        1
Afghanistan          1
Aruba                1
Chile                1
Finland              1
Iceland              1
Poland               1
Belgium              1
New Line             1
Indonesia            1
Name: count, dtype: int64

In [3]:
# Remove language column
train['language'].value_counts()

language
English       2872
French          32
Spanish         19
Mandarin         9
Japanese         9
German           9
Italian          5
Cantonese        5
Korean           4
Portuguese       4
Hindi            3
Norwegian        3
Dutch            3
Danish           3
Aboriginal       2
Thai             2
Indonesian       2
Dari             2
Persian          2
Kazakh           1
Maya             1
Aramaic          1
Filipino         1
Bosnian          1
Czech            1
Zulu             1
Vietnamese       1
Hungarian        1
Hebrew           1
Mongolian        1
Romanian         1
Arabic           1
Name: count, dtype: int64

In [5]:
import numpy as np

names_train = ['features_countvec/train_countvec_features_actor_1_name.npy', 'features_countvec/train_countvec_features_actor_2_name.npy', 
               'features_countvec/train_countvec_features_director_name.npy', 'features_doc2vec/train_doc2vec_features_genre.npy',
               'features_doc2vec/train_doc2vec_features_plot_keywords.npy', 'features_fasttext/train_fasttext_title_embeddings.npy']
names_test = ['features_countvec/test_countvec_features_actor_1_name.npy', 'features_countvec/test_countvec_features_actor_2_name.npy', 
               'features_countvec/test_countvec_features_director_name.npy', 'features_doc2vec/test_doc2vec_features_genre.npy',
               'features_doc2vec/test_doc2vec_features_plot_keywords.npy', 'features_fasttext/test_fasttext_title_embeddings.npy']

for name in names_train:
    print(name)
    print(len(np.load(name)[0]))

# The count vectors have too many columns and need to be trimmed

features_countvec/train_countvec_features_actor_1_name.npy
2063
features_countvec/train_countvec_features_actor_2_name.npy
2919
features_countvec/train_countvec_features_director_name.npy
2113
features_doc2vec/train_doc2vec_features_genre.npy
100
features_doc2vec/train_doc2vec_features_plot_keywords.npy
100
features_fasttext/train_fasttext_title_embeddings.npy
100


In [6]:
def create_columns(len, unique):
    columns = []
    for i in range(len):
        columns.append(str(i + unique))
    return columns

def drop_col(df):
    for col in df.columns:
        if (df[col].sum() < 10):
            df = df.drop(col, axis=1)
    return df

In [7]:
temp1 = np.load('features_countvec/train_countvec_features_actor_1_name.npy')
df_act_1 = pd.DataFrame(temp1, columns=create_columns(len(temp1[0]), 0))

temp2 = np.load('features_countvec/train_countvec_features_actor_2_name.npy')
df_act_2 = pd.DataFrame(temp2, columns=create_columns(len(temp2[0]), len(temp1[0])))

temp3 = np.load('features_countvec/train_countvec_features_director_name.npy')
df_dir = pd.DataFrame(temp3, columns=create_columns(len(temp3[0]), len(temp1[0]) + len(temp2[0])))

temp4 = np.load('features_doc2vec/train_doc2vec_features_genre.npy')
df_genre = pd.DataFrame(temp4, columns=create_columns(len(temp4[0]), len(temp1[0]) + len(temp2[0]) + 100))

temp5 = np.load('features_doc2vec/train_doc2vec_features_plot_keywords.npy')
df_keyword = pd.DataFrame(temp5, columns=create_columns(len(temp5[0]), len(temp1[0]) + len(temp2[0]) + 200))

temp6 = np.load('features_fasttext/train_fasttext_title_embeddings.npy')
df_title = pd.DataFrame(temp6, columns=create_columns(len(temp6[0]), len(temp1[0]) + len(temp2[0]) + 300))

df_act_1 = drop_col(df_act_1)
df_act_2 = drop_col(df_act_2)
df_dir = drop_col(df_dir)

In [8]:
print(len(df_act_1.columns))
print(len(df_act_2.columns))
print(len(df_dir.columns))

# Columns are now down to appropriate number

144
72
110


In [68]:
#test_pre_data = pd.DataFrame()

df_list = [df_act_1, df_act_2, df_dir, df_genre, df_keyword, df_title]
train_pre_data = pd.concat(df_list, axis=1)

# Drop old columns and unnecessary columns
titles = ['actor_1_name', 'actor_2_name', 'director_name', 'genres', 'plot_keywords', 'language', 'country',
            'title_embedding', 'movie_title', 'actor_3_name', 'actor_3_facebook_likes']
train_new = pd.concat([train.drop(columns=titles), train_pre_data], axis=1)

print(len(train_new.columns))
train_new.head()


641


Unnamed: 0,id,num_critic_for_reviews,duration,director_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,...,5372,5373,5374,5375,5376,5377,5378,5379,5380,5381
0,1,186,73,28,2000,422783777,644348,6458,0,656,...,0.004746,0.00377,0.00139,0.003076,-0.008132,0.001598,0.006643,0.0024,-0.003933,-0.005527
1,2,252,97,0,654,20433940,78883,1876,8,662,...,0.005036,0.003941,0.003205,0.001162,-0.005804,0.002159,0.003575,-0.00071,-0.003438,-0.006788
2,3,232,117,234,12000,371897,36494,13607,2,118,...,-0.000173,0.000985,-0.004908,0.000435,0.002376,0.000944,0.003305,0.000132,-0.002589,-0.00218
3,4,297,109,0,957,13782838,258078,1757,0,911,...,0.005158,0.00543,0.003289,0.002486,-0.010253,0.003423,0.006926,0.001835,-0.005065,-0.008602
4,5,297,171,0,16000,313837577,1238746,22342,2,5060,...,0.004583,0.005102,0.002862,0.000808,-0.007791,0.002772,0.005651,-0.000516,-0.004016,-0.006797


In [69]:
# Remove less than 50
train['content_rating'].value_counts()

content_rating
R            1362
PG-13        1039
PG            458
G              68
Not Rated      27
Unrated        19
Approved       14
X               9
NC-17           3
M               2
Passed          2
GP              1
Name: count, dtype: int64

In [70]:
# Remove low frequency ratings
count = train_new['content_rating'].value_counts()
train_new = train_new[~train_new['content_rating'].isin(count[count < 50].index)]  

# Combine PG ratings
train_new.loc[train_new.content_rating == 'PG-13', 'content_rating'] = 'PG'

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
encoded = enc.fit_transform(train_new[['content_rating']])

df_temp = pd.DataFrame(encoded.toarray(), columns=['G', 'PG', 'R'])

train_new = pd.concat([train_new.drop(columns='content_rating'), df_temp], axis=1)

train_new.head()


Unnamed: 0,id,num_critic_for_reviews,duration,director_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,...,5375,5376,5377,5378,5379,5380,5381,G,PG,R
0,1.0,186.0,73.0,28.0,2000.0,422783777.0,644348.0,6458.0,0.0,656.0,...,0.003076,-0.008132,0.001598,0.006643,0.002400,-0.003933,-0.005527,1.0,0.0,0.0
1,2.0,252.0,97.0,0.0,654.0,20433940.0,78883.0,1876.0,8.0,662.0,...,0.001162,-0.005804,0.002159,0.003575,-0.000710,-0.003438,-0.006788,0.0,0.0,1.0
2,3.0,232.0,117.0,234.0,12000.0,371897.0,36494.0,13607.0,2.0,118.0,...,0.000435,0.002376,0.000944,0.003305,0.000132,-0.002589,-0.002180,0.0,0.0,1.0
3,4.0,297.0,109.0,0.0,957.0,13782838.0,258078.0,1757.0,0.0,911.0,...,0.002486,-0.010253,0.003423,0.006926,0.001835,-0.005065,-0.008602,0.0,0.0,1.0
4,5.0,297.0,171.0,0.0,16000.0,313837577.0,1238746.0,22342.0,2.0,5060.0,...,0.000808,-0.007791,0.002772,0.005651,-0.000516,-0.004016,-0.006797,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2922,2923.0,376.0,104.0,37.0,12000.0,268488329.0,235025.0,14863.0,0.0,265.0,...,0.000713,-0.003138,0.002156,0.001613,-0.000120,-0.001954,-0.001803,0.0,1.0,0.0
2923,2924.0,218.0,103.0,11.0,2000.0,80021740.0,47968.0,5497.0,4.0,154.0,...,0.000092,-0.000938,0.001605,0.000265,0.001896,-0.000025,-0.000267,0.0,0.0,1.0
2924,2925.0,49.0,92.0,13.0,11000.0,28972187.0,28805.0,12687.0,1.0,159.0,...,0.001385,-0.006023,0.001189,0.003271,0.002730,-0.002297,-0.007346,0.0,0.0,1.0
2925,2926.0,181.0,81.0,670.0,486.0,7267324.0,161471.0,623.0,1.0,709.0,...,0.000406,-0.003227,0.002605,-0.001219,-0.002485,-0.001602,0.001218,0.0,1.0,0.0


In [72]:
# Added NaN values were added by one hot encoding, they should be removed
train_final = train_new.dropna()
y = train_final['imdb_score_binned']
X = train_final.drop(columns=['imdb_score_binned'])
len(train_final)

2855

In [None]:
# Apply post processing, ie normalisation, standardisation

In [73]:
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(train_final, y)


ValueError: Found input variables with inconsistent numbers of samples: [2855, 3004]