# Content Based Recommendation System

### 1. Using -Users- data as content and finding similar user's
### 2. Using -Users- movie based data as content and finding similar movie's

In [1]:
import pandas as pd
import numpy as np
import math
import os

In [2]:
def edit_userData(users):
        gender = {'M': 0,'F': 1}
        jobs = {'student':0,'other':1,'educator':2,
                'administrator':3,'engineer':4,'programmer':5,
                'librarian':6,'writer':7,'executive':8,'scientist':9,
                'artist':10,'technician':11,'marketing':12,'entertainment':13,
                'healthcare':14,'retired':15,'lawyer':16,'salesman':17,
                'none':18,'homemaker':19,'doctor':20}
        users.Sex = [gender[item] for item in users.Sex]
        users.Occupation = [jobs[item] for item in users.Occupation]
        return users

def describeDataframes(root):
    u_cols = ['User ID', 'Age', 'Sex', 'Occupation', 'Zip Code']
    r_cols = ['User ID', 'Movie ID', 'Rating', 'Unix Timestamp']
    i_cols = ['Movie ID', 'Movie Title' ,'Release Date','Video Release Date', 'IMDB URL', 'Unknown', 'Action', 'Adventure',
        'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

    userPath = os.path.join(root,'u.user')
    ratingPath = os.path.join(root,'u.data')
    itemsPath = os.path.join(root,'u.item')

    users = pd.read_csv(userPath, sep='|', names=u_cols, encoding='latin-1')
    ratings = pd.read_csv(ratingPath, sep='\t', names=r_cols, encoding='latin-1')
    items = pd.read_csv(itemsPath, sep='|', names=i_cols, encoding='latin-1')

    users = edit_userData(users)
    return users,ratings,items

In [3]:
user_df,rating_df,movie_df = describeDataframes('root')

In [4]:
user_df.head()

Unnamed: 0,User ID,Age,Sex,Occupation,Zip Code
0,1,24,0,11,85711
1,2,53,1,1,94043
2,3,23,0,7,32067
3,4,24,0,11,43537
4,5,33,1,1,15213


In [5]:
rating_df.head()

Unnamed: 0,User ID,Movie ID,Rating,Unix Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
movie_df.head()

Unnamed: 0,Movie ID,Movie Title,Release Date,Video Release Date,IMDB URL,Unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
user_count = len(user_df)
movie_count = len(movie_df)
movie_names = movie_df['Movie Title'].tolist()

In [8]:
print("User Count -----------",user_count)
print("Movie Count -----------",movie_count)

User Count ----------- 943
Movie Count ----------- 1682


In [9]:
# User Watch List ---> For every user indexed in the list
# is a dictionary from Movie ID -> Rating for the user at that index

def initList():
    wList = []
    for runner in range(user_count):
        wList.append({})
    return wList

def make_avgMovieRating():
    avgMovieRating = np.zeros((movie_count))
    for i in range(movie_count):
        tempDF = rating_df.loc[rating_df['Movie ID'] == i+1]
        avgMovieRating[i] += tempDF.Rating.mean()
    return avgMovieRating
    
def make_userWatchList():
    watchList = initList()
    for line in rating_df.itertuples():
        current_user = line[1]-1
        movie_watched = line[2]-1
        rating = line[3]
        watchList[current_user][movie_watched] = rating
    return watchList

In [10]:
# userWatchList[0]  --> A dictionary of movies watched by User ID '1'
# The Dictionary maps from Movie ID to the rating user has given.

userWatchList = make_userWatchList()
avgMovieRating = make_avgMovieRating()

In [11]:
def get_ageValue(age_value):
    for value in range(10,90,10):
        if(age_value<=value):
            age_value=value%10
            return age_value

def make_userAttributes():
    userAttrs = np.zeros((user_count,30))
    for line in user_df.itertuples():
        age_value = get_ageValue(line[2])
        gender_value = line[3]
        occupation_value = line[4]
        userAttrs[line[0]][occupation_value] = 1
        userAttrs[line[0]][20+age_value] = 1
        userAttrs[line[0]][-1] = gender_value
    return userAttrs

def make_movieAttributes():
    temp_df = movie_df.drop(["Movie ID","Movie Title","Release Date","Video Release Date","IMDB URL","Unknown"], axis =1)
    return temp_df.to_numpy()

In [12]:
userAttributes = make_userAttributes()
movieAttributes = make_movieAttributes()

In [13]:
def cosineDistance(arr1,arr2):
    numerator = np.dot(arr1,arr2)
    denominator = math.sqrt(sum(arr1**2))+math.sqrt(sum(arr2**2))
    return numerator/denominator

In [14]:
def make_userMovieGenre(userID):
    userMovieGenre = np.zeros((18,))
    mainUserMovies = list(userWatchList[userID-1].keys())
    movieCounter = 0
    for movieIndex in mainUserMovies:
        movieCounter+=1
        userMovieGenre += movieAttributes[movieIndex]
    return userMovieGenre/movieCounter

def getMovieSimilarities(userID):
    similarities = {}
    userMovieGenre = make_userMovieGenre(userID)
    for i in range(movie_count):
        similarities[i] = cosineDistance(userMovieGenre,movieAttributes[i])
    return similarities

def getUserSimilarities(userID):
    similarities = {}
    mainUser = userAttributes[userID-1]
    for i in range(user_count):
        similarities[i] = cosineDistance(mainUser,userAttributes[i])
    similarities[userID-1] = 0
    return similarities

In [15]:
def make_userValues(similarities):
    ratingXsimilarity = np.zeros((movie_count,))
    similarityCounterArray = np.zeros((movie_count,))
    for currentUser in range(user_count):
        currentSimilarity = similarities[currentUser]
        moviesWatched = userWatchList[currentUser]
        for movieID,rating in moviesWatched.items():
            ratingXsimilarity[movieID] += rating*currentSimilarity
            similarityCounterArray[movieID] += 1
    return ratingXsimilarity, similarityCounterArray

def make_moviePredictions(similarities):
    for i in range(movie_count):
        similarities[i] = similarities[i]*avgMovieRating[i]
    predictions = sorted(similarities, key=similarities.get, reverse=True)
    return predictions

In [16]:
def make_userPredictions(similarities):
    predictions = {}
    ratingXsimilarity, similarityCounterArray = make_userValues(similarities)
    for ID in range(movie_count):
        predictions[ID] = ratingXsimilarity[ID]/similarityCounterArray[ID]
    predictions = sorted(predictions, key=predictions.get, reverse=True)
    return predictions

def predict1(userID):
    similarities = getUserSimilarities(userID)
    predictions = make_userPredictions(similarities)
    return predictions


def predict2(userID):
    similarities = getMovieSimilarities(userID)
    predictions = make_moviePredictions(similarities)
    return predictions

### type = "user"
###### Using user data such as: [Age, Sex, Occupation] and then using that to find similarity and recommending movies based on the most similar user's

### type = "item"
###### Using user data such as: [Averaged movie genre's of a user] and then using that to find similar movies

In [17]:
def predict(userID, top=5, filter_type="user"):
    mainUserMovies = list(userWatchList[userID-1].keys())
    if filter_type=="user":
        predictions = predict1(userID)
    elif filter_type == "item":
        predictions = predict2(userID)
    final = []
    for prediction in predictions:
        if prediction not in mainUserMovies:
            final.append(movie_names[prediction])
    return final[:top]

In [18]:
predict(1, filter_type="user")

['He Walked by Night (1948)',
 'Saint of Fort Washington, The (1993)',
 'Great Day in Harlem, A (1994)',
 'They Made Me a Criminal (1939)',
 'Marlene Dietrich: Shadow and Light (1996) ']

In [19]:
predict(1, filter_type="item")

['Wings of Desire (1987)',
 'Titanic (1997)',
 'As Good As It Gets (1997)',
 'Brassed Off (1996)',
 'Stand by Me (1986)']