# Import the libraries

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from surprise import KNNWithMeans,SVD,Dataset,Reader,accuracy
from surprise.model_selection import train_test_split,GridSearchCV,cross_validate
from collections import defaultdict


# 1. Read and explore the given dataset. ( Rename column/add headers, plot histograms, find data characteristics)

In [None]:
# Keep only the first three columns (user id, product id, rating).
# header=None: the raw file is assumed to ship without a header row (the column
# names are assigned on the next line), so the first record must be read as
# data rather than consumed as the header -- TODO confirm against the CSV.
ratings = pd.read_csv(
    "ratings_Electronics.csv",
    encoding="ISO-8859-1",
    header=None,
    usecols=[0, 1, 2],
)
ratings.columns = ['userId','productId','ratings']
ratings.head()

In [None]:
ratings.shape

In [None]:
# Cast both identifier columns to str.
for id_column in ("userId", "productId"):
    ratings[id_column] = ratings[id_column].astype(str)

In [None]:
ratings.info()

In [None]:
ratings.describe().T

In [None]:
ratings.isnull().any()

In [None]:
print("total unique users - ",len(ratings["userId"].unique()))

In [None]:
ratings.hist('ratings',bins = 10)

In [None]:
ratings['ratings'].value_counts()

To get a better look at the ratings distribution, let's group the dataset by product ID and by user ID.

In [None]:
product_ratings = ratings.groupby('productId')['ratings'].count()

In [None]:
product_ratings.sort_values(ascending=False)

In [None]:
user_ratings=ratings.groupby('userId')['ratings'].count()

In [None]:
user_ratings.sort_values(ascending=False)

# 2.Take a subset of the dataset to make it less sparse/ denser. ( For example, keep the users only who has given 50 or more number of ratings ) 

Let's keep only the most-rated products and the most active users; this reduces the dimensionality and avoids running into memory errors.

In [None]:
# Product ids that received at least 50 ratings, as a plain list.
product_counts = ratings['productId'].value_counts()
filter_products = product_counts[product_counts >= 50].index.tolist()

In [None]:
# User ids that gave at least 50 ratings, as a plain list.
user_counts = ratings['userId'].value_counts()
filter_users = user_counts[user_counts >= 50].index.tolist()

In [None]:
# Keep a rating only when both its product and its user are in the
# filter_products / filter_users lists.
product_mask = ratings['productId'].isin(filter_products)
user_mask = ratings['userId'].isin(filter_users)
df = ratings[product_mask & user_mask]

In [None]:
df.head()

In [None]:
df.shape

In [None]:
reader = Reader(rating_scale=(1, 5))

In [None]:
data = Dataset.load_from_df(df[['userId','productId', 'ratings']], reader)

# 3. Split the data randomly into train and test dataset. ( For example, split it in 70/30 ratio)

In [None]:
trainset, testset = train_test_split(data, test_size=.30,random_state=0)

In [None]:
type(trainset)

# 4. Build Popularity Recommender model.

Let's create a data frame with the average rating of each product and the total number of ratings it received.

In [None]:
new_df=pd.DataFrame(df.groupby('productId')['ratings'].mean())

In [None]:
# Number of ratings per product, aligned to new_df on the productId index.
ratings_per_product = df.groupby('productId')['ratings'].count()
new_df['Total ratings'] = ratings_per_product

In [None]:
new_df.sort_values(by='Total ratings',ascending=False).head()

The above products are being rated high by multiple users so this could serve as a good recommendation as per popularity based model.

# Making sense of trainset

Points to Note:

* The trainset is no longer a pandas dataframe. Rather, it is a specific datatype defined by the Surprise library
* UserId and ItemId in the pandas dataframe can contain any value (string, integer, etc.). However, the trainset converts these raw ids into numeric indexes called "inner ids"
* Methods are provided to convert a raw id to an inner id and vice versa

In [None]:
user_records = trainset.ur
type(user_records)

In [None]:
user_records[0]

In [None]:
# The ids stored in trainset.ur are inner ids, not raw ids.
# Raw ids can be obtained as follows.

print(trainset.to_raw_uid(0))
# Take the inner item id from inner user 0's own records instead of
# hard-coding one: a literal inner id is only valid for one particular split
# of the data and otherwise makes to_raw_iid fail.
first_inner_iid, _ = trainset.ur[0][0]
print(trainset.to_raw_iid(first_inner_iid))

# 5. Build Collaborative Filtering model.

## ITEM-ITEM COLLABORATIVE FILTERING

## Since our goal is to recommend new products to each user based on his/her habits, we use item-based collaborative filtering. User-based collaborative filtering would be used when we want to predict a user's rating for a particular product.

## implementing the model using KNNwithmeans algorithm

In [None]:
# Item-based KNNWithMeans: user_based=False requests item-item similarities,
# computed with the pearson_baseline measure; k=50 is passed as the
# neighbourhood size.
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)
# Predict every rating in the held-out test split.
test_pred = algo.test(testset)

In [None]:
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
print("Item-based Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

# 6.Evaluate both the models. ( Once the model is trained on the training data, it can be used to compute the error (like RMSE) on predictions made on the test data.) You can also use a different method to evaluate the models.

As seen above, the RMSE on the training data is 0.019 whereas on the test data it is 1.13, which implies that the model fits the training data far better than unseen data and has not generalized well.

In [None]:
print(test_pred[0:5])

# SVD

In [None]:
svd_model = SVD(n_factors=50,reg_all=0.01)
svd_model.fit(trainset)

In [None]:
# Score the SVD model itself: predict the held-out ratings with svd_model
# instead of re-reporting the KNN predictions stored in test_pred.
test_pred_svd = svd_model.test(testset)
accuracy.rmse(test_pred_svd)

Using SVD also we got rmse as 1.13,now lets try Parameter tuning of SVD Recommendation system

In [None]:
param_grid = {'n_factors' : [5,10,15], "reg_all":[0.01,0.02]}
svd_tuned = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3,refit = True)
# Tune and refit on the training ratings only. Fitting on the full `data`
# would let the refitted model see the ratings in `testset`, so any RMSE
# later measured on `testset` would be optimistically biased.
# trainset.build_testset() yields (raw user id, raw item id, rating) tuples,
# which are turned back into a Dataset that GridSearchCV can cross-validate.
train_df = pd.DataFrame(
    trainset.build_testset(),
    columns=['userId', 'productId', 'ratings'],
)
train_data = Dataset.load_from_df(train_df, reader)
svd_tuned.fit(train_data)
svd_tuned.best_params

In [None]:
test_predict_svd_tuned= svd_tuned.test(testset)
test_predict_svd_tuned[0:5]

In [None]:
accuracy.rmse(test_predict_svd_tuned)

Since the RMSE has dropped noticeably for SVD after parameter tuning, we can use this model to generate recommendations for users.

# 7. Get top - K ( K = 5) recommendations. Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.

In [None]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each entry must
            unpack into five fields: (user id, item id, true rating,
            estimated rating, details).
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] holding at most n entries,
        ordered from the highest to the lowest estimated rating. Users with
        fewer than n predictions get all of their predictions.
    '''

    # First map the predictions to each user. Only the estimate is kept; the
    # true rating and the details field are not needed for ranking.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    # Re-assigning existing keys while iterating is safe: the set of keys
    # does not change.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

## Printing the top 5 recommendations for all the users

In [None]:
# NOTE(review): these predictions come from `testset`, i.e. items each user
# has already rated. To recommend unseen items, predictions over
# trainset.build_anti_testset() would presumably be needed -- confirm intent.
top_n = get_top_n(test_predict_svd_tuned,5)
# Use a dedicated loop name so the module-level `user_ratings` (the per-user
# rating counts computed earlier in the notebook) is not overwritten.
for uid, recommendations in top_n.items():
    print(uid, [iid for (iid, _) in recommendations])

# 8. Summarise your insights.

* We analyzed the dataset considering only three required columns userId, productId, and ratings.
* We understood how the data is distributed across the ratings value range.
* To avoid running into memory issues, we filtered a subset of the original dataset (users who have given at least 50 ratings and products rated at least 50 times).
* We implemented popularity-based recommendations by listing the highly rated products with the highest number of ratings.
* We understood that popularity-based recommendations are the same for every user, regardless of the user's preferences.
* We implemented an item-based collaborative filtering model to provide product recommendations for any user.
* We used the KNNWithMeans model and got a train RMSE of 0.019 and a test RMSE of 1.13, which clearly indicates that the model does not generalize. So we tried SVD, and the RMSE was 0.85 (a lower RMSE means more accurate predictions).
* With the help of the parameter-tuned SVD model and iterative functions, we were able to draw the top 5 recommendations for every individual user.