<a href="https://colab.research.google.com/github/abhishek2602/Projects/blob/master/RecommendationEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import math
from math import sqrt
import codecs

In [0]:
# Getting the movie Dataset
doc = codecs.open('u.item', 'rU', 'latin-1') # open for reading the 'universal' type set
moviedb = pd.read_csv(doc, sep = '|', names = range(0, 24))
moviedb.drop((moviedb.iloc[:,2:]), inplace = True, axis = 1) # Dropping columns that will not be used for this excercise
moviedb.columns = ['movieid', 'movie_title']
moviedb.head()

Unnamed: 0,movieid,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [0]:
# Reading the dataset
df = pd.read_table('u.data', names = ('userid', 'itemid', 'rating', 'timestamp'))
df.head()

Unnamed: 0,userid,itemid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [0]:
# Preprocessing
df.sort_values(by = ['userid'], ascending = True, inplace = True)
df.drop(columns = ['timestamp'], inplace = True) # Dropping unwanted columns
df.head()

Unnamed: 0,userid,itemid,rating
66567,1,55,5
62820,1,203,4
10207,1,183,5
9971,1,150,5
22496,1,68,4


In [0]:
# Function to convert dataframe to nested dictionary
def recur_dictify(frame):
    if len(frame.columns) == 1:
        if frame.values.size == 1:
            return frame.values[0][0]
        return frame.values.squeeze()
    grouped = frame.groupby(frame.columns[0])
    d = {k: recur_dictify(g.iloc[:,1:]) for k, g in grouped}
    return d

In [0]:
df2 = recur_dictify(df)

**Pearson Correlation Score**

Implementation for the Pearson correlation score first finds the items rated by both users. It then calculates the sums and the sum of the squares of the ratings for the both users and calculates the sum of the products of their ratings. Finally, it uses these results to calculate the Pearson correlation coefficient.Unlike the distance metric, this formula is not intuitive, but it does tell you how much the variables change together divided by the product of how much they alter individually.

Generally, this pearson_correlation function returns a value between -1 to 1 . A value 1 means both users are having the same taste in all most all cases.

In [0]:
#defining pearson similiarity to get simillarity between 2 users
def pearson_correlation(person1,person2):
    # To get both rated items
    both_rated = {}
    for item in df2[person1]:
        if item in df2[person2]:
            both_rated[item] = 1 # gives the list of movies both have rated 
 
    number_of_ratings = len(both_rated)#gives the total no of the above 
    
    # Checking for number of ratings in common
    if number_of_ratings == 0:
        return 0
    
     # Add up all the preferences of each user
    person1_preferences_sum = sum([df2[person1][item] for item in both_rated])
    person2_preferences_sum = sum([df2[person2][item] for item in both_rated])
    
    # Sum up the squares of preferences of each user
    person1_square_preferences_sum = sum([pow(df2[person1][item],2) for item in both_rated])
    person2_square_preferences_sum = sum([pow(df2[person2][item],2) for item in both_rated])
    
    # Sum up the product value of both preferences for each item
    product_sum_of_both_users = sum([df2[person1][item] * df2[person2][item] for item in both_rated])
    
    # Calculate the pearson score
    numerator_value = product_sum_of_both_users - (person1_preferences_sum*person2_preferences_sum/number_of_ratings)
    denominator_value = sqrt((person1_square_preferences_sum - pow(person1_preferences_sum,2)/number_of_ratings) * (person2_square_preferences_sum -pow(person2_preferences_sum,2)/number_of_ratings))
    if denominator_value == 0:
        return (0)
    else:
        r = numerator_value/denominator_value
        return (r) 
    #returns a value between -1 to 1 . A value 1 means both users are having the same taste in all most all cases.




In [0]:
def user_recommendations(person,no_of_movies):
 
    # Gets recommendations for a person by using a weighted average of every other user's rankings
    totals = {}
    simSums = {}
    rankings_list =[]
    for other in df2:
        # don't compare me to myself
        if other == person:
            continue
        sim = pearson_correlation(person,other)
 
        # ignore scores of zero or lower
        if sim == 0: 
            continue
        for item in df2[other]:
 
            # only score movies i haven't seen yet
            if item not in df2[person] or df2[person][item] == 0:
 
            # Similrity * score
                totals.setdefault(item,0)
                totals[item] += df2[other][item]* sim
                # sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+= sim
 
        # Create the normalized list
 
    rankings = [(total/simSums[item],item) for item,total in totals.items()]
    rankings.sort()
    rankings.reverse()
    # returns the recommended items
    recommendataions_list = [recommend_item for score,recommend_item in rankings]
    best_movies=recommendataions_list[0:no_of_movies]
    print("\n")
    for movie in best_movies:
        likes= (moviedb.loc[moviedb['movieid'] == movie,'movie_title'].tolist())
        
        print ( ' '.join(likes)) #removing the quotes and brackets
    print("\n")
    return("\n These are the movies user %s would like..." %person)

In [0]:
user = input ("Please enter the user id: ")
noofmovies = input("How many movies do you want to recommend? ")
print (user_recommendations(int (user),int (noofmovies)))

Please enter the user id: 101
How many movies do you want to recommend? 5


Leading Man, The (1996)
Truman Show, The (1998)
Four Days in September (1997)
Year of the Horse (1997)
In the Bleak Midwinter (1995)



 These are the movies user 101 would like...




Now for a new user we can take his liking which will then recommend

In [0]:
#Recommending movies for a new user 
def userlist():
    df2=recur_dictify(df)
    allratings = {} #empty dict for appending all the rating of that user
    name = input("Please enter your name: ")
    noofmovies = input ("How many movies do you want to recommend? ")
    for item in moviedb['movieid']:
        #getting the movie for movie id for the user to see
        moviename= moviedb.loc[moviedb['movieid'] == item,'movie_title'].tolist()
        rating = input("Please enter your rating 0-5 for %s: \n to skip this rating press s \n to exit press e " % moviename)
        if rating == "e":
            break
        elif rating == "s":
            continue
        else :
            allratings.update({item:int(float(rating))})
    #appending the ratings to that user
    df2.update({name:allratings})
    return (df2,name,noofmovies)

In [0]:
df2, name2, nof = userlist()
print(user_recommendations(name2, int(nof)))

Please enter your name: Abhishek
How many movies do you want to recommend? 10
Please enter your rating 0-5 for ['Toy Story (1995)']: 
 to skip this rating press s 
 to exit press e 5
Please enter your rating 0-5 for ['GoldenEye (1995)']: 
 to skip this rating press s 
 to exit press e 4
Please enter your rating 0-5 for ['Four Rooms (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Get Shorty (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Copycat (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Twelve Monkeys (1995)']: 
 to skip this rating press s 
 to exit press e 5
Please enter your rating 0-5 for ['Babe (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Dead Man Wa