In [1]:
#import basic library
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import r2_score
import xgboost as xgb

%matplotlib inline

In [2]:
df = pd.read_csv('imdb.csv')

In [3]:
df.columns = df.columns.str.lower().str.replace(' ','_')

In [4]:
df.loc[df.rate == 'No Rate', 'rate'] = 0
df.rate = pd.to_numeric(df['rate'])

In [5]:
df.loc[df.votes == 'No Votes', 'votes'] = 0
df.votes = pd.to_numeric(df.votes.str.replace(",",""), downcast="integer")

In [6]:
df.loc[df.duration == 'None', 'duration'] = 0
df.duration = pd.to_numeric(df.duration)

In [7]:
categorical_cols = ['genre', 'type','certificate','nudity','violence','profanity','alcohol','frightening']
numeric_cols = ['rate','votes','duration']

In [9]:
imdb_data = df[categorical_cols+numeric_cols].copy()

In [10]:
for col in categorical_cols:
    imdb_data[col] = imdb_data[col].str.lower()

In [11]:
imdb_data.votes = imdb_data.votes.fillna(imdb_data.votes.mean())

In [13]:
imdb_data.genre=imdb_data.genre.str.replace(' ','')
genre_cols = imdb_data.genre.str.get_dummies(sep=',')

In [14]:
data = pd.concat([imdb_data, genre_cols], axis=1, join='inner')

In [15]:
del data['genre']

In [16]:
data.votes = data.votes.astype(int)

In [17]:
from sklearn.model_selection import train_test_split

In [24]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [18]:
df_train, df_test = train_test_split(data, test_size=0.2, random_state=1)

In [20]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = np.log1p(df_train.rate.values)
y_test = np.log1p(df_test.rate.values)

del df_train['rate']
del df_test['rate']

In [21]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_test.to_dict(orient='records')
X_test = dv.transform(val_dict)

In [22]:
features = dv.get_feature_names()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_test, label=y_test, feature_names=features)

In [25]:
xgb_params = {
        'eta': 0.3, 
        'max_depth': 8,
        'min_child_weight': 2,

        'objective': 'reg:squarederror',
        'nthread': 8,

        'seed': 1,
        'verbosity': 1,
    }
model = xgb.train(xgb_params, dtrain, num_boost_round=100)
y_pred = model.predict(dval)
score_mean = rmse(y_test, y_pred)
xgb_r2_score = r2_score(y_test , y_pred)

In [26]:
score_mean, xgb_r2_score

(0.1152777508856016, 0.919437396105891)

In [30]:
import pickle

In [31]:
C = 1.0
output_file = f'model_{C}.bin'

In [32]:
# Save the model
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

print(f'the model is saved to {output_file}')

the model is saved to model_1.0.bin
