# Recommender Systems on Purchased Data
## Item-Item Based Filtering
###### One of the most popular and simplest approaches to collaborative filtering (CF) is item-item collaborative filtering. To build recommendations for a user, we first compute an item-item similarity matrix by finding, for each item, its k nearest neighbour items using cosine, Pearson or Jaccard similarity. We then predict a score for each user-item pair from that item-item similarity matrix.

In [2]:
import pandas as pd
import numpy as np
import random
from scipy import sparse
import itertools
from sklearn.metrics.pairwise import cosine_similarity

#### Random Table

In [3]:
# Load the purchase log and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path - consider reading it
# from a configurable data directory instead.
csv_path = "C:/Users/alorenzodebrionne/Documents/Python/purchased_data.csv"
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,userID,ItemID,nb_purchased
0,100002,51,1
1,100002,52,3
2,100002,53,1


In [4]:
# Report how many distinct users and distinct items the raw table contains
for column in ("userID", "ItemID"):
    print(f"{column}:", df[column].nunique())

userID: 1892
ItemID: 17626


#### To ensure statistical significance, users with 40 or fewer items and items bought by 175 or fewer users are excluded.

In [5]:
# Minimum-support thresholds. Both comparisons below are strict (">"): an item
# is kept only if it appears in more than MIN_USERS_PER_ITEM rows of df, and a
# user only if they appear in more than MIN_ITEMS_PER_USER rows.
MIN_USERS_PER_ITEM = 175
MIN_ITEMS_PER_USER = 40

# Count the rows per item (most frequent first) and keep the well-supported items
item_count = df.groupby(['ItemID']).size().reset_index(name='counts').sort_values(['counts'], ascending=False)
list_items = item_count[item_count['counts'] > MIN_USERS_PER_ITEM]

# Count the rows per user (most active first) and keep the well-supported users.
# NOTE(review): these counts are computed before list_items is applied to df, so
# a retained user may end up with far fewer rows once rare items are removed -
# confirm this is intended.
user_count = df.groupby(['userID']).size().reset_index(name='counts').sort_values(['counts'], ascending=False)
list_users = user_count[user_count['counts'] > MIN_ITEMS_PER_USER]

In [6]:
# Reduce each filtered count table to just its ID column
list_items = list_items.loc[:, 'ItemID']
list_users = list_users.loc[:, 'userID']

In [7]:
# Keep only the rows whose item AND user both passed the support filter
keep_rows = df['ItemID'].isin(list_items) & df['userID'].isin(list_users)
df = df[keep_rows]
print("UserID:", df['userID'].nunique())
print("ItemID:", df['ItemID'].nunique())

UserID: 1716
ItemID: 53


### Rating Matrix

###### We convert the table to a 2D user-item matrix. The matrix is sparse because not every item is bought by every user.

In [8]:
# Build the user x item matrix of purchase counts. Passing values= pivots only
# the nb_purchased column instead of pivoting every remaining column and then
# selecting one of them afterwards.
df_pivot = df.pivot(index='userID', columns='ItemID', values='nb_purchased')
userID = pd.DataFrame(df_pivot.index)  # remember which userID sits at each row position

# User/item pairs that never occur are NaN after the pivot; treat them as 0
df_pivot = df_pivot.fillna(0)

# Drop the userID index so rows are addressed by position (0..n-1), like userID
df_pivot = df_pivot.reset_index(drop=True)

##### Magnitude Calculation

In [9]:
# Scale every user row to unit Euclidean length:
# magnitude is the square root of the row-wise sum of squares.
magnitude = np.sqrt((df_pivot ** 2).sum(axis=1))
df_pivot = df_pivot.div(magnitude, axis=0)

## Create the train set and test set
### Process for Data Masking
##### Randomly set a fraction of the non-zero entries to 0 in the training set; the test set is an unmodified copy of the original DataFrame

In [10]:
# Split the interaction matrix into a masked training set and an untouched test set
def make_train(df, pct_test = 0.25, seed = 123):
    """Mask a random share of the non-zero cells of ``df`` to build a training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix whose non-zero cells are the observed interactions. It is not
        modified.
    pct_test : float, default 0.25
        Fraction (between 0 and 1) of the non-zero cells to hide. The number of
        hidden cells is rounded up.
    seed : int, default 123
        Seed of the private random generator used to pick the hidden cells.

    Returns
    -------
    training_set : pandas.DataFrame
        Copy of ``df`` (same index and columns) with the chosen cells set to 0.
    test_set : pandas.DataFrame
        Unmodified copy of ``df``.
    altered_rows : list of int
        Sorted, de-duplicated row positions that had at least one cell hidden.

    Raises
    ------
    ValueError
        If ``pct_test`` is outside ``[0, 1]``.
    """
    if not 0 <= pct_test <= 1:
        raise ValueError("pct_test must be between 0 and 1")

    test_set = df.copy()

    # np.array copies the values, so zeroing cells below never touches df
    masked = np.array(df)

    # Row / column positions of every cell where an interaction exists
    rows, cols = masked.nonzero()

    # Round the number of hidden cells up to the next integer
    num_samples = int(np.ceil(pct_test * len(rows)))

    # A private generator keeps the split reproducible without reseeding the
    # module-level `random` state that other code may rely on.
    rng = random.Random(seed)
    picked = rng.sample(range(len(rows)), num_samples)  # without replacement

    user_inds = rows[picked]  # row positions of the hidden cells
    item_inds = cols[picked]  # column positions of the hidden cells
    masked[user_inds, item_inds] = 0

    # Keep the original labels so training and test sets stay aligned
    training_set = pd.DataFrame(masked, index=df.index, columns=df.columns)

    return training_set, test_set, sorted({int(u) for u in user_inds})
# Hide 25% of the observed interactions; item_users_altered lists the row
# positions of the users that had at least one interaction hidden.
training_set, test_set, item_users_altered = make_train(
    df_pivot,
    pct_test=0.25,
)

## Item-Item Based Filtering

In [11]:
# Item-item cosine similarity
def calculate_similarity(data_items):
    """Return the pairwise cosine similarity between the columns of ``data_items``.

    The frame is converted to a sparse matrix and transposed so that each
    column becomes one row vector handed to ``cosine_similarity``. The result
    is a square DataFrame labelled on both axes with ``data_items.columns``.
    """
    column_vectors = sparse.csr_matrix(data_items).transpose()
    labels = data_items.columns
    return pd.DataFrame(data=cosine_similarity(column_vectors), index=labels, columns=labels)


# Item-item similarity matrix computed from the training set only
cosine_sim = calculate_similarity(training_set)

# data_neighbours: one row per item; column r holds the label of that item's
# r-th most similar item (similarity sorted in descending order).
ncol = training_set.shape[1]
neighbour_ranks = range(1, ncol + 1)
data_neighbours = pd.DataFrame(index=cosine_sim.columns, columns=neighbour_ranks)
for position in range(cosine_sim.shape[1]):
    ranked = cosine_sim.iloc[:, position].sort_values(ascending=False)
    data_neighbours.iloc[position, :ncol] = ranked.index[:ncol]

### Recommendation per User - Top-N

In [12]:
#Empty Datasets to Append the user's recommendation
# Two independent empty frames: the accumulated recommendations and the
# per-user top-N scratch table
df_rec, df_top_N = pd.DataFrame([]), pd.DataFrame([])

In [14]:
# Number of recommendations kept for each user
TOP_N = 3

# DataFrame.shape is (rows, columns): df_pivot has one row per user and one
# column per item, so the user count comes first.
n_users, n_items = df_pivot.shape

# Per-user result frames are collected here and concatenated once after the
# loop (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
user_recommendations = []

for i in range(n_users):
    user_row = training_set.iloc[i]
    productsofusers = user_row[user_row > 0].index.values  # items user i has in the training set

    # Construct the neighbourhood from the items most similar to the ones the user already has
    most_similar_to_likes = data_neighbours.loc[productsofusers]
    similar_list = most_similar_to_likes.values.tolist()
    similar_list = list(set([item for sublist in similar_list for item in sublist]))
    neighbourhood = cosine_sim[similar_list].loc[similar_list]

    # The user's training values restricted to the neighbourhood items
    user_vector = user_row.loc[similar_list]

    # Similarity-weighted sum of the user's values, normalised by each item's total similarity
    score = neighbourhood.dot(user_vector).div(neighbourhood.sum(axis=1))

    # Never recommend products the user already has
    score = score.drop(productsofusers)

    # Table with the TOP_N best-scoring items for this user
    df_top_N = score.nlargest(TOP_N).to_frame(name='score')
    df_top_N['index_u'] = i  # row position of the user in training_set / test_set
    df_top_N['userID'] = userID['userID'].iloc[i]  # original ID of the user at position i
    user_recommendations.append(df_top_N)

# Rebuilding df_rec from scratch keeps this cell idempotent when it is re-run
if user_recommendations:
    df_rec = pd.concat(user_recommendations)
else:
    df_rec = pd.DataFrame(columns=['score', 'index_u', 'userID'])

## Evaluation
#### Create the Confusion Matrix 

In [16]:
# Confusion-matrix counters, accumulated over all users that had an item hidden
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0

for user in item_users_altered:
    observed = test_set.iloc[user, :]     # the user's row in the unmasked data
    visible = training_set.iloc[user, :]  # the same row as seen during training

    # Items that are 0 in the training row: either hidden or never present
    candidates = visible.index[visible == 0]

    # Hidden items: 0 in training but positive in the original data
    must_be_recommend = visible.index[(visible == 0) & (observed > 0)].values
    # Items recommended to this user (df_rec is indexed by item label)
    is_recommend = df_rec.loc[df_rec['index_u'] == user].index.values
    # Candidate items that were not recommended
    not_recommend = candidates[~candidates.isin(is_recommend)].values

    # Recommended items: a hit when the item was hidden, a false alarm otherwise
    recommended_hits = np.isin(is_recommend, must_be_recommend)
    true_positive += int(recommended_hits.sum())
    false_positive += int((~recommended_hits).sum())

    # Non-recommended items: a miss when the item was hidden, a correct rejection otherwise
    missed = np.isin(not_recommend, must_be_recommend)
    false_negative += int(missed.sum())
    true_negative += int((~missed).sum())

print('TP= ', true_positive, 'FP= ', false_positive,'FN= ', false_negative,'TN= ', true_negative)



TP=  1089 FP=  3042 FN=  2750 TN=  56199
