## Build your own Recommendation Systems !!!

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Load Data

In [2]:
train = pd.read_csv('../input/ctrl-shift-intelligence-2k22/train.csv')
test = pd.read_csv('../input/ctrl-shift-intelligence-2k22/test.csv')

train.shape, test.shape

((8001, 15), (1825, 14))

# Understanding Data

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8001 entries, 0 to 8000
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8001 non-null   int64  
 1   userID      8001 non-null   int64  
 2   title       8000 non-null   object 
 3   year        7946 non-null   float64
 4   kind        7971 non-null   object 
 5   genre       7916 non-null   object 
 6   vote        7228 non-null   float64
 7   country     7415 non-null   object 
 8   language    7305 non-null   object 
 9   cast        7694 non-null   object 
 10  director    6368 non-null   object 
 11  composer    4852 non-null   object 
 12  writer      6532 non-null   object 
 13  runtime     7073 non-null   float64
 14  rating      7228 non-null   float64
dtypes: float64(4), int64(2), object(9)
memory usage: 937.7+ KB


In [4]:
train.describe()

Unnamed: 0.1,Unnamed: 0,userID,year,vote,runtime,rating
count,8001.0,8001.0,7946.0,7228.0,7073.0,7228.0
mean,4000.0,4000.0,1995.146111,19026.52,97.64414,6.716146
std,2309.834085,2309.834085,15.82904,99510.44,64.906578,1.291765
min,0.0,0.0,1905.0,5.0,1.0,1.2
25%,2000.0,2000.0,1990.0,242.75,75.0,6.1
50%,4000.0,4000.0,1999.0,1266.5,93.0,6.9
75%,6000.0,6000.0,2003.0,6138.5,109.0,7.6
max,8000.0,8000.0,2023.0,2462087.0,1620.0,9.6


In [5]:
train.head()

Unnamed: 0.1,Unnamed: 0,userID,title,year,kind,genre,vote,country,language,cast,director,composer,writer,runtime,rating
0,0,0,Dinosaur Planet,2003.0,tv mini series,"['Documentary', 'Animation', 'Family']",474.0,['United States'],['English'],"['Christian Slater', 'Scott Sampson']",,,"['Mike Carrol', 'Mike Carroll', 'Georgann Kane']",50.0,7.7
1,1,1,Character,2021.0,movie,"['Crime', 'Horror', 'Thriller']",46.0,['Japan'],['Japanese'],"['Masaki Suda', 'Fukase', 'Mitsuki Takahata', ...",['Akira Nagai'],['Youki Kojima'],"['Takashi Nagasaki', 'Takashi Nagasaki', 'Anna...",125.0,8.3
2,2,2,Get Up and Dance!,1994.0,video movie,['Family'],18.0,['United States'],['English'],"['Paula Abdul', 'Aurorah Allain', 'Bill Bohl',...",['Steve Purcell'],,,54.0,8.1
3,3,3,The Rise and Fall of El Chapo,2016.0,tv movie,['Documentary'],42.0,['United States'],,,,,,85.0,6.9
4,4,4,Sick - IMDb,,,['Thriller'],,['United States'],,"['Marc Menchaca', 'Gideon Adlon', 'Dylan Spray...",['John Hyams'],,"['Katelyn Crabb', 'Kevin Williamson']",,


In [6]:
train['director'].value_counts()

['Kevin Dunn']                                    33
['Akira Kurosawa']                                16
['Tom Clegg']                                     14
['Werner Herzog']                                 14
['Takashi Miike']                                 14
                                                  ..
['Clive Donner']                                   1
['Margaret Selby', 'Tex Avery', 'Chuck Jones']     1
['Elio Petri']                                     1
['Gary Dauberman']                                 1
['Arthur Wong', 'Brandy Yuen']                     1
Name: director, Length: 4241, dtype: int64

## Check Missing Values

In [7]:
train.isnull().sum()

Unnamed: 0       0
userID           0
title            1
year            55
kind            30
genre           85
vote           773
country        586
language       696
cast           307
director      1633
composer      3149
writer        1469
runtime        928
rating         773
dtype: int64

In [8]:
test.isnull().sum()

Unnamed: 0      0
userID          0
title           0
year           15
kind            8
genre           8
vote          104
country        79
language       97
cast           37
director      227
composer      473
writer        187
runtime       128
dtype: int64

# Data Clean

In [9]:
verif = train['Unnamed: 0'] == train['userID']
verif.value_counts()

True    8001
dtype: int64

So, we are going to delete this feature

In [10]:
del train['Unnamed: 0']
del test['Unnamed: 0']

### Object cols

In [11]:
obj_cols = []

for i in train.columns:
    if train[i].dtypes == 'object':
        obj_cols.append(i)
        
        
obj_cols

['title',
 'kind',
 'genre',
 'country',
 'language',
 'cast',
 'director',
 'composer',
 'writer']

### Encode Label

In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [13]:
for col in obj_cols:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])
    
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8001 entries, 0 to 8000
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   userID    8001 non-null   int64  
 1   title     8001 non-null   int64  
 2   year      7946 non-null   float64
 3   kind      8001 non-null   int64  
 4   genre     8001 non-null   int64  
 5   vote      7228 non-null   float64
 6   country   8001 non-null   int64  
 7   language  8001 non-null   int64  
 8   cast      8001 non-null   int64  
 9   director  8001 non-null   int64  
 10  composer  8001 non-null   int64  
 11  writer    8001 non-null   int64  
 12  runtime   7073 non-null   float64
 13  rating    7228 non-null   float64
dtypes: float64(4), int64(10)
memory usage: 875.2 KB


### Imputer Data

In [14]:
from sklearn.impute import KNNImputer

knni = KNNImputer(n_neighbors=2)

In [15]:
null_train_cols = []
for col in train.columns:
    if train[col].isnull().sum().any():
        null_train_cols.append(col)
        
null_test_cols = []
for col in test.columns:
    if test[col].isnull().sum().any():
        null_test_cols.append(col)
        
len(null_train_cols), len(null_test_cols)

(4, 3)

In [16]:
null_train_cols

['year', 'vote', 'runtime', 'rating']

- Impute Train Data

In [17]:
imput_train = pd.DataFrame(knni.fit_transform(train[null_train_cols]))

iterate = 0
for col in null_train_cols:
    train[col] = imput_train[iterate]
    iterate +=1
    
del imput_train
train.isnull().sum()

userID      0
title       0
year        0
kind        0
genre       0
vote        0
country     0
language    0
cast        0
director    0
composer    0
writer      0
runtime     0
rating      0
dtype: int64

- Impute Test Data

In [18]:
imput_test = pd.DataFrame(knni.fit_transform(test[null_test_cols]))

iterate = 0
for col in null_test_cols:
    test[col] = imput_test[iterate]
    iterate +=1
    
test.isnull().sum()

userID      0
title       0
year        0
kind        0
genre       0
vote        0
country     0
language    0
cast        0
director    0
composer    0
writer      0
runtime     0
dtype: int64

# Modeling

### Split Data

In [19]:
del train['userID']
del test['userID']

X = train.copy()
y = X.pop('rating')

X.shape

(8001, 12)

### Try FLAML

In [20]:
!pip install flaml

Collecting flaml
  Downloading FLAML-1.0.0-py3-none-any.whl (157 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.2/157.2 KB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xgboost<=1.3.3,>=0.90
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.5/157.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost, flaml
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.5.2
    Uninstalling xgboost-1.5.2:
      Successfully uninstalled xgboost-1.5.2
Successfully installed flaml-1.0.0 xgboost-1.3.3
[0m

In [21]:
from flaml import AutoML
automl = AutoML()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Train Model

In [22]:
automl.fit(X, y, task="regression", metric='rmse', time_budget=1200)

[flaml.automl: 04-20 11:06:27] {2105} INFO - task = regression
[flaml.automl: 04-20 11:06:27] {2107} INFO - Data split method: uniform
[flaml.automl: 04-20 11:06:27] {2111} INFO - Evaluation method: cv
[flaml.automl: 04-20 11:06:27] {2188} INFO - Minimizing error metric: rmse
[flaml.automl: 04-20 11:06:27] {2281} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl: 04-20 11:06:27] {2567} INFO - iteration 0, current learner lgbm
[flaml.automl: 04-20 11:06:28] {2698} INFO - Estimated sufficient time budget=2641s. Estimated necessary time budget=23s.
[flaml.automl: 04-20 11:06:28] {2750} INFO -  at 0.5s,	estimator lgbm's best error=1.2084,	best estimator lgbm's best error=1.2084
[flaml.automl: 04-20 11:06:28] {2567} INFO - iteration 1, current learner lgbm
[flaml.automl: 04-20 11:06:28] {2750} INFO -  at 0.7s,	estimator lgbm's best error=1.2084,	best estimator lgbm's best error=1.2084
[flaml.automl: 04-20 11:06:28] 

In [23]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best log_loss on validation data: {0:.4g}'.format(automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 5145, 'num_leaves': 91, 'min_child_samples': 8, 'learning_rate': 0.017485703999232848, 'log_max_bin': 10, 'colsample_bytree': 0.6627836019944721, 'reg_alpha': 0.004089629988951731, 'reg_lambda': 0.0481664196794399}
Best log_loss on validation data: 0.8062
Training duration of best run: 38.54 s


### Evaluating Model

In [24]:
import math
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [25]:
def evaluate_model(y_test, pred):
    print("*"*12, "Evaluations", "*"*12, '\n')
    
    print("MAE :", mean_absolute_error(y_test, pred), '\n')
    print("MSE :", mean_squared_error(y_test, pred), '\n')
    print("R2_Score :", r2_score(y_test, pred), '\n')
    
    mse_ = np.square(np.subtract(y_test,pred)).mean() 
    print("RMSE :", math.sqrt(mse_))

In [26]:
pred = automl.predict(X_test)
    
evaluate_model(y_test, pred)

************ Evaluations ************ 

MAE : 0.012551768761498673 

MSE : 0.0005417227492051437 

R2_Score : 0.9996708417307789 

RMSE : 0.023274938221295963


# Test

In [27]:
pred = automl.predict(test)
len(pred)

1825

# Submission

In [28]:
sub = pd.read_csv('../input/ctrl-shift-intelligence-2k22/sample_sub.csv')
print(sub['rating'].value_counts(),'\n')
sub.head()

1    1825
Name: rating, dtype: int64 



Unnamed: 0,userID,rating
0,8001,1
1,8002,1
2,8003,1
3,8004,1
4,8005,1


Save Submission File

In [29]:
sub['rating'] = pred

sub.to_csv('submission.csv', index=False)
sub

Unnamed: 0,userID,rating
0,8001,6.249968
1,8002,5.754599
2,8003,6.416793
3,8004,5.328876
4,8005,6.810848
...,...,...
1820,9821,5.965596
1821,9822,7.250384
1822,9823,6.126991
1823,9824,6.506165
