## Build your own Recommendation Systems !!!

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

Looking in links: http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html
[0m

# Load Data

In [2]:
train = pd.read_csv('../input/ctrl-shift-intelligence-2k22/train.csv')
test = pd.read_csv('../input/ctrl-shift-intelligence-2k22/test.csv')

train.shape, test.shape

((8001, 15), (1825, 14))

# Understanding Data

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8001 entries, 0 to 8000
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8001 non-null   int64  
 1   userID      8001 non-null   int64  
 2   title       8000 non-null   object 
 3   year        7946 non-null   float64
 4   kind        7971 non-null   object 
 5   genre       7916 non-null   object 
 6   vote        7228 non-null   float64
 7   country     7415 non-null   object 
 8   language    7305 non-null   object 
 9   cast        7694 non-null   object 
 10  director    6368 non-null   object 
 11  composer    4852 non-null   object 
 12  writer      6532 non-null   object 
 13  runtime     7073 non-null   float64
 14  rating      7228 non-null   float64
dtypes: float64(4), int64(2), object(9)
memory usage: 937.7+ KB


In [4]:
train.describe()

Unnamed: 0.1,Unnamed: 0,userID,year,vote,runtime,rating
count,8001.0,8001.0,7946.0,7228.0,7073.0,7228.0
mean,4000.0,4000.0,1995.146111,19026.52,97.64414,6.716146
std,2309.834085,2309.834085,15.82904,99510.44,64.906578,1.291765
min,0.0,0.0,1905.0,5.0,1.0,1.2
25%,2000.0,2000.0,1990.0,242.75,75.0,6.1
50%,4000.0,4000.0,1999.0,1266.5,93.0,6.9
75%,6000.0,6000.0,2003.0,6138.5,109.0,7.6
max,8000.0,8000.0,2023.0,2462087.0,1620.0,9.6


In [5]:
train.head()

Unnamed: 0.1,Unnamed: 0,userID,title,year,kind,genre,vote,country,language,cast,director,composer,writer,runtime,rating
0,0,0,Dinosaur Planet,2003.0,tv mini series,"['Documentary', 'Animation', 'Family']",474.0,['United States'],['English'],"['Christian Slater', 'Scott Sampson']",,,"['Mike Carrol', 'Mike Carroll', 'Georgann Kane']",50.0,7.7
1,1,1,Character,2021.0,movie,"['Crime', 'Horror', 'Thriller']",46.0,['Japan'],['Japanese'],"['Masaki Suda', 'Fukase', 'Mitsuki Takahata', ...",['Akira Nagai'],['Youki Kojima'],"['Takashi Nagasaki', 'Takashi Nagasaki', 'Anna...",125.0,8.3
2,2,2,Get Up and Dance!,1994.0,video movie,['Family'],18.0,['United States'],['English'],"['Paula Abdul', 'Aurorah Allain', 'Bill Bohl',...",['Steve Purcell'],,,54.0,8.1
3,3,3,The Rise and Fall of El Chapo,2016.0,tv movie,['Documentary'],42.0,['United States'],,,,,,85.0,6.9
4,4,4,Sick - IMDb,,,['Thriller'],,['United States'],,"['Marc Menchaca', 'Gideon Adlon', 'Dylan Spray...",['John Hyams'],,"['Katelyn Crabb', 'Kevin Williamson']",,


In [6]:
train['director'].value_counts()

['Kevin Dunn']                                    33
['Akira Kurosawa']                                16
['Tom Clegg']                                     14
['Werner Herzog']                                 14
['Takashi Miike']                                 14
                                                  ..
['Clive Donner']                                   1
['Margaret Selby', 'Tex Avery', 'Chuck Jones']     1
['Elio Petri']                                     1
['Gary Dauberman']                                 1
['Arthur Wong', 'Brandy Yuen']                     1
Name: director, Length: 4241, dtype: int64

## Check Missing Values

In [7]:
train.isnull().sum()

Unnamed: 0       0
userID           0
title            1
year            55
kind            30
genre           85
vote           773
country        586
language       696
cast           307
director      1633
composer      3149
writer        1469
runtime        928
rating         773
dtype: int64

In [8]:
test.isnull().sum()

Unnamed: 0      0
userID          0
title           0
year           15
kind            8
genre           8
vote          104
country        79
language       97
cast           37
director      227
composer      473
writer        187
runtime       128
dtype: int64

# Data Clean

In [9]:
verif = train['Unnamed: 0'] == train['userID']
verif.value_counts()

True    8001
dtype: int64

So, we are going to delete this feature

In [10]:
del train['Unnamed: 0']
del test['Unnamed: 0']

### Object cols

In [11]:
obj_cols = []

for i in train.columns:
    if train[i].dtypes == 'object':
        obj_cols.append(i)
        
        
obj_cols

['title',
 'kind',
 'genre',
 'country',
 'language',
 'cast',
 'director',
 'composer',
 'writer']

### Encode Label

In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [13]:
for col in obj_cols:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])
    
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8001 entries, 0 to 8000
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   userID    8001 non-null   int64  
 1   title     8001 non-null   int64  
 2   year      7946 non-null   float64
 3   kind      8001 non-null   int64  
 4   genre     8001 non-null   int64  
 5   vote      7228 non-null   float64
 6   country   8001 non-null   int64  
 7   language  8001 non-null   int64  
 8   cast      8001 non-null   int64  
 9   director  8001 non-null   int64  
 10  composer  8001 non-null   int64  
 11  writer    8001 non-null   int64  
 12  runtime   7073 non-null   float64
 13  rating    7228 non-null   float64
dtypes: float64(4), int64(10)
memory usage: 875.2 KB


### Imputer Data

In [14]:
from sklearn.impute import KNNImputer

knni = KNNImputer(n_neighbors=5)

In [15]:
null_train_cols = []
for col in train.columns:
    if train[col].isnull().sum().any():
        null_train_cols.append(col)
        
null_test_cols = []
for col in test.columns:
    if test[col].isnull().sum().any():
        null_test_cols.append(col)
        
len(null_train_cols), len(null_test_cols)

(4, 3)

In [16]:
null_train_cols

['year', 'vote', 'runtime', 'rating']

- Impute Train Data

In [17]:
imput_train = pd.DataFrame(knni.fit_transform(train[null_train_cols]))

iterate = 0
for col in null_train_cols:
    train[col] = imput_train[iterate]
    iterate +=1
    
del imput_train
train.isnull().sum()

userID      0
title       0
year        0
kind        0
genre       0
vote        0
country     0
language    0
cast        0
director    0
composer    0
writer      0
runtime     0
rating      0
dtype: int64

- Impute Test Data

In [18]:
imput_test = pd.DataFrame(knni.fit_transform(test[null_test_cols]))

iterate = 0
for col in null_test_cols:
    test[col] = imput_test[iterate]
    iterate +=1
    
test.isnull().sum()

userID      0
title       0
year        0
kind        0
genre       0
vote        0
country     0
language    0
cast        0
director    0
composer    0
writer      0
runtime     0
dtype: int64

# Modeling

### Split Data

In [19]:
del train['userID']
del test['userID']

X = train.copy()
y = X.pop('rating')

X.shape

(8001, 12)

### Try H2O

In [20]:
## import packages
import h2o
from h2o.automl import H2OAutoML

In [21]:
## prepare data
h2o.init()

h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)
target = train['rating'].to_numpy().ravel()
h2o_train['rating'] = h2o.H2OFrame(target)

#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.14.1" 2022-02-08; OpenJDK Runtime Environment (build 11.0.14.1+1-Ubuntu-0ubuntu1.20.04); OpenJDK 64-Bit Server VM (build 11.0.14.1+1-Ubuntu-0ubuntu1.20.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpz3drhftb
  JVM stdout: /tmp/tmpz3drhftb/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpz3drhftb/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.4
H2O_cluster_version_age:,"21 days, 17 hours and 34 minutes"
H2O_cluster_name:,H2O_from_python_unknownUser_h9a6tn
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4.396 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### Train Model

In [22]:
## run model
features = [x for x in h2o_train.columns if x != 'rating']

model_h2o = H2OAutoML(stopping_metric='rmse', max_runtime_secs=1200)
model_h2o.train(x=features, y='rating', training_frame=h2o_train)

AutoML progress: |██████████████████████████████████████████████████████████████
12:29:39.301: GBM_lr_annealing_selection_AutoML_1_20220421_120959 [GBM lr_annealing] failed: water.exceptions.H2OIllegalArgumentException: Can only convert jobs producing a single Model or ModelContainer.

█| (done) 100%
Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_3_AutoML_1_20220421_120959

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.14353123744092985
RMSE: 0.3788551668394267
MAE: 0.2576695637377759
RMSLE: 0.06115564895207093
R^2: 0.9067805686232867
Mean Residual Deviance: 0.14353123744092985
Null degrees of freedom: 8000
Residual degrees of freedom: 7990
Null deviance: 12319.249471966426
Residual deviance: 1148.3934307648797
AIC: 7198.292526102844

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.6576141015814659
RMSE: 0.8109340920083863




In [23]:
## check leaderboard
model_h2o.leaderboard

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_3_AutoML_1_20220421_120959,0.657614,0.810934,0.657614,0.575834,0.122766
StackedEnsemble_AllModels_4_AutoML_1_20220421_120959,0.657908,0.811116,0.657908,0.575888,0.122797
StackedEnsemble_BestOfFamily_4_AutoML_1_20220421_120959,0.670165,0.818636,0.670165,0.582948,0.123836
StackedEnsemble_BestOfFamily_5_AutoML_1_20220421_120959,0.670245,0.818685,0.670245,0.582975,0.123839
StackedEnsemble_BestOfFamily_7_AutoML_1_20220421_120959,0.671203,0.81927,0.671203,0.581809,0.124151
StackedEnsemble_AllModels_2_AutoML_1_20220421_120959,0.678093,0.823464,0.678093,0.587891,0.124539
StackedEnsemble_AllModels_1_AutoML_1_20220421_120959,0.678437,0.823673,0.678437,0.587553,0.124593
StackedEnsemble_BestOfFamily_2_AutoML_1_20220421_120959,0.679864,0.824539,0.679864,0.587369,0.124788
StackedEnsemble_BestOfFamily_3_AutoML_1_20220421_120959,0.6806,0.824985,0.6806,0.588722,0.124771
XGBoost_grid_1_AutoML_1_20220421_120959_model_6,0.691719,0.831697,0.691719,0.594304,0.125731




# Test

In [24]:
## generate predictions
preds_h2o = model_h2o.leader.predict(h2o_test).as_data_frame()
preds_h2o

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


Unnamed: 0,predict
0,6.224972
1,5.825169
2,6.575671
3,5.253024
4,6.888264
...,...
1820,6.294739
1821,6.562093
1822,6.480877
1823,6.444521


# Submission

In [25]:
sub = pd.read_csv('../input/ctrl-shift-intelligence-2k22/sample_sub.csv')
print(sub['rating'].value_counts(),'\n')
sub.head()

1    1825
Name: rating, dtype: int64 



Unnamed: 0,userID,rating
0,8001,1
1,8002,1
2,8003,1
3,8004,1
4,8005,1


Save Submission File

In [26]:
sub['rating'] = preds_h2o

sub.to_csv('submission.csv', index=False)
sub

Unnamed: 0,userID,rating
0,8001,6.224972
1,8002,5.825169
2,8003,6.575671
3,8004,5.253024
4,8005,6.888264
...,...,...
1820,9821,6.294739
1821,9822,6.562093
1822,9823,6.480877
1823,9824,6.444521
