In [132]:
#### importing libraries
import sqlite3
import MySQLdb
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from scipy.stats.stats import pearsonr

In [133]:
##### exploring trainig data.
df=pd.read_csv('train.csv').drop(['Unnamed: 0'],axis=1)
pd.pivot_table(df,'preference',rows=['book_id'],cols=['user_id'])

user_id,101,102,103,104,105,106,107,108
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
50001,2,0,0,1,0,0,1,3
50002,0,2,0,0,2,0,0,0
50003,1,0,3,0,0,0,0,0
50004,0,3,0,4,1,2,0,0
50005,0,0,0,0,0,0,4,4
50006,3,0,0,0,4,0,0,0
50007,0,4,0,0,0,0,0,0
50008,0,0,1,0,0,4,0,0
50009,0,0,4,0,0,0,2,0
50010,0,0,0,0,0,0,0,0


# Evaluation

In [134]:
#### we will be calculating recall for evaluating recommender system
def eval_recall(pred,r,y):
    count=0
    recall=0
    for j in range(n):
        for i in range(m):
            if r[i,j]==1:
                count+=1.0
                if(pred[i,j]==y[i,j]):
                    recall+=1.0
    return recall/count             

# From here we will be deploying Content Based Recommender System

In [135]:
##### Since classifier can understand reference given to books, which needs to be converted
##### into continous values like rating for each book.We will 
##### In CB engine user will have similar features corresponding to that of book's feature i.e 

df.loc[(df.preference!=0),'preference']=5-df['preference']
#### n number of users
#### m number of books
#### y(i,j) is rating of book i by user j
#### shape of y is (n,m)
y=pd.pivot_table(df,'preference',rows=['book_id'],cols=['user_id']).values
m,n=np.shape(y)

#### calculating average rating of each book given by user
df.loc[(df.preference==0),'preference']=np.nan
avg_rating = pd.pivot_table(df,'preference',rows=['book_id'],cols=['user_id']).mean(axis=1)

#### r(i,j) is 1 if user j has provided rating of book i
#### shape of r is (n,m)
r=np.zeros((m,n))
for i in range(m):
    for j in range(n):
        if y[i,j]>=1: r[i,j]=1


#### now initialise theta a users feature matrix
#### p number distinct of genres of books.
#### theta(j) shape(p,1) is a weight vector of user j
#### shape of theta (p,n)

p = len(df.genre_name.unique())
theta=np.zeros((p+1,n)) # initialise theta with all its element 0
#theta = np.zeros((p+1,n))

#### b_count(j) contains the number of books reviewed by user j
b_count=[0]*n
for i in range(n):
    for j in range(m):
        if r[j,i]==1: b_count[i]+=1
            
#### initialise x a items feature matrix
#### will create dummies of genre_name columns for each book
df_books = pd.read_csv('Books.csv')
dummies=pd.get_dummies(df_books.genre_name)
x=np.ones((15,1))
x=np.concatenate((x,dummies.values),axis=1)

In [136]:
##### define function needed for optimisation
def h(theta_one,x_one):
    return theta_one.dot(x_one)

def costF(theta_one,x_one,rating):
    return h(theta_one,x_one)-rating

def grad(theta_one,x_one,rating):
    return costF(theta_one,x_one,rating)*x_one


In [137]:
##### optimizing algorithm
def optimize(theta,x,alpha,y):
    for j in range(n):
        for i in range(m):
            if r[i,j]==1:
                theta[:,j]=theta[:,j] - alpha*grad(theta[:,j],x[i,:],y[i,j])
            else: continue
    return theta

def train(itr,alpha,theta,x,y):
    for i in range(itr):
        theta = optimize(theta,x,alpha,y)
    return theta

def predict(theta,x):
    pred=[]
    for j in range(n):
        prd=map(round,x.dot(theta[:,j]))
        pred.append(prd)
    return np.array(pred).T

In [138]:
##### recommending book to user
def recommend(user,theta,x):### gives recommendation with decreasing order of preference
    recmd={}
    book_id = pd.read_csv('Books.csv').book_id.unique().tolist()
    if user==None:
        avg_rating = pd.pivot_table(df,'preference',rows=['book_id'],cols=['user_id']).mean(axis=1)
        for ids,rating in zip(book_id,avg_rating):
            recmd[ids]=rating
        return sorted(recmd,key=recmd.__getitem__,reverse=True)
    else:
        pred=predict(theta,x)
        pred=pred[user%100-1,:]
        for ids,rating in zip(book_id,pred):
            recmd[ids]=rating
        return sorted(recmd,key=recmd.__getitem__,reverse=True)

In [139]:
#### training the algo
#### with iterations=1000, alpha=0.001
theta = train(10000,0.001,theta,x,y)

In [140]:
def recommend_genre(id_,theta=theta,x=x):### recommends genre
    book_ids = recommend(id_,theta,x)
    book_ids=map(lambda x: x%1000-1,book_ids)
    df = pd.read_csv('Books.csv')
    genre = df.genre_name.values
    return genre[book_ids]

def recommend_authors(id_,theta=theta,x=x):## recommends authors
    book_ids = recommend(id_,theta,x)
    book_ids=map(lambda x: x%1000-1,book_ids)
    df = pd.read_csv('Books.csv')
    authors = df.author.values
    return authors[book_ids]

In [141]:
recommend_authors(105)

array(['Soumika Shetty', 'J K Rowling', 'William Shakespeare',
       'George Ilian', 'Dale Carnegie', 'Shamik Dasgupta',
       'Ashwini Sanghi', 'Stephen King'], dtype=object)

In [142]:
recommend_genre(105)

array(['crime_mystery', 'action_adventure', 'arts_film', 'biography',
       'business_economics', 'comics', 'crime_mystery', 'horror'], dtype=object)