In [2]:
#Loading the required libraries
import os
import numpy as np
import pandas as pd

In [3]:
# Show the current working directory; the CSV files loaded below are given as
# relative paths, so they are resolved against this directory.
os.getcwd()

'C:\\Users\\Bharath Ambati\\Desktop\\AV_Hackathons\\Joke_Rating_Prediction'

In [4]:
#Loading the data
# Read the three CSV files (paths are relative to the working directory).
train, test, jokes = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv', 'jokes.csv')
)

# Report the (rows, columns) shape of each frame.
for label, frame in (("Train Shape", train),
                     ("Test Shape", test),
                     ("Jokes Shape", jokes)):
    print(label, frame.shape)


Train Shape (1092059, 4)
Test Shape (537880, 3)
Jokes Shape (139, 2)


In [5]:
# Preview the first five training rows
train.head(5)

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


In [6]:
#Drop the id column
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so that re-running this cell is a no-op rather than a KeyError.
train = train.drop(columns='id', errors='ignore')

In [7]:
#unique jokes
# Number of distinct joke ids that appear in the training ratings
train['joke_id'].nunique()

139

In [8]:
#Unique users
# Number of distinct user ids that appear in the training ratings
train['user_id'].nunique()

40863

#### Data Analysis

In [9]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# Frequency of every distinct rating value, ordered from the highest rating
# value down to the lowest.
data = train['Rating'].value_counts().sort_index(ascending=False)

# Percentage of all training rows that each bar represents, used as bar labels.
n_ratings = train.shape[0]
bar_labels = ['{:.1f} %'.format(share) for share in data.values / n_ratings * 100]

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=bar_labels,
    textposition='auto',
    textfont={'color': '#000000'},
)

# Create layout
layout = {
    'title': 'Distribution Of {} Joke-ratings'.format(n_ratings),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [31]:
# Count of non-null ratings for each user; counts above 100 are capped at 100.
data = train.groupby('user_id')['Rating'].count().clip(upper=100)

# Create trace
bin_spec = {'start': 0, 'end': 100, 'size': 2}
trace = go.Histogram(x=data.values, name='Ratings', xbins=bin_spec)

# Create layout
layout = go.Layout(
    title='Distribution Of Number of Ratings Per User (Clipped at 100)',
    xaxis={'title': 'Number of Ratings Per User'},
    yaxis={'title': 'Count'},
    bargap=0.2,
)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [14]:
#Collaborative filtering using SURPRISE library
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader, KNNWithMeans
from surprise.model_selection import cross_validate


In [10]:
# Ratings are declared to Surprise as lying on a -10..10 scale.
reader = Reader(rating_scale=(-10, 10))

# Columns are handed over in the order: user id, joke id, rating.
rating_columns = ['user_id', 'joke_id', 'Rating']
data = Dataset.load_from_df(train[rating_columns], reader)

In [48]:
# Inspect the object returned by Dataset.load_from_df
data

<surprise.dataset.DatasetAutoFolds at 0x21323eac7f0>

In [49]:
from surprise import *

In [50]:
# Import SVD explicitly so this cell does not depend on a wildcard import.
from surprise import SVD
from surprise.model_selection import GridSearchCV, KFold

# Fixed seed: without it both the fold split and the SVD factor initialisation
# are random, so the reported scores change from run to run.
SEED = 42

# Every value must be a list; the grid search evaluates each combination.
param_grid = {'n_epochs': [50], 'lr_all': [0.001],
              'reg_all': [0.1, 0.2],
              'random_state': [SEED]}

# Same 10-fold shuffled split as cv=10, but seeded so the folds are repeatable.
cv_folds = KFold(n_splits=10, random_state=SEED, shuffle=True)

# n_jobs=-1 evaluates the folds in parallel; joblib_verbose prints progress.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=cv_folds,
                  n_jobs=-1, joblib_verbose=1000)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 25.9min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 36.6min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 38.9min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 45.4min
[Parallel(n_jobs=-1)]: Done  14 out of  20 | elapsed: 46.7min remaining: 20.0min
[Parallel(n_jobs=-1)]: Done  15 out o

For the grid search I initially tried a wider range of parameters and then narrowed it down to the values shown above. The other values are not included here because every additional parameter combination adds a lot of run time.

#### Training the model on complete data

In [51]:
# We can now use the algorithm that yields the best rmse:
# take the estimator selected for RMSE by the grid search and fit it on a
# trainset built from all of the loaded ratings.
algo = gs.best_estimator['rmse']
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2131bfa1d30>

#### Making Predictions

In [53]:
# Copy of the test frame without its 'id' column; `test` itself is left unchanged.
test1 = test.drop(columns='id')

In [57]:
# Making predictions
# One prediction object per test row, collected in row order.
# NOTE(review): this assumes test1 holds exactly two columns in the order
# (user id, joke id) - confirm against test.csv.
# itertuples keeps each column's own dtype, whereas .values would first coerce
# the whole frame into a single array dtype.
predictions1 = [
    algo.predict(user_id, joke_id)
    for user_id, joke_id in test1.itertuples(index=False, name=None)
]

In [58]:
# Turn the list of prediction records into a DataFrame, one row per prediction.
predictions_df = pd.DataFrame(predictions1)

In [59]:
# Remove the 'r_ui' and 'details' columns. Reassigning (instead of inplace=True)
# and ignoring already-missing columns keeps this cell safe to re-run.
predictions_df = predictions_df.drop(columns=['r_ui', 'details'], errors='ignore')

In [60]:
# Write the predictions to disk; the row index is written as well (index=True
# is the pandas default, spelled out here to make that visible).
predictions_df.to_csv('Predictions5.csv', index=True)