# Imports and making data

In [1]:
import pandas as pd
import numpy as np
import operator
import math

### JesterSmall.csv is made using the orignal Jester Dataset. Formatted it into a [UserID, JokeID, Rating] format for easy processing

In [2]:
mainFrame = pd.read_csv('JesterSmall.csv', sep='\t')

In [3]:
mainFrame.columns = ["userID","jokeID","rating"]

In [4]:
'''
The jokes have been rated on a scale from -10 to 10
'''

mainFrame.head()

Unnamed: 0,userID,jokeID,rating
0,0,0,-7.82
1,0,1,8.79
2,0,2,-9.66
3,0,3,-8.16
4,0,4,-7.52


In [5]:
mainFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141547 entries, 0 to 141546
Data columns (total 3 columns):
userID    141547 non-null int64
jokeID    141547 non-null int64
rating    141547 non-null float64
dtypes: float64(1), int64(2)
memory usage: 3.2 MB


In [6]:
userCount = len(mainFrame.userID.unique())
jokeCount = len(mainFrame.jokeID.unique())

In [7]:
'''
A small size Jester dataset onsisiting reviews from 2000 user's on 100 jokes.
'''

print(userCount,jokeCount)

2000 100


In [8]:
def cosineDistance(arr1,arr2):
    numerator = np.dot(arr1,arr2)
    denominator = math.sqrt(sum(arr1**2))+math.sqrt(sum(arr2**2))
    return numerator/denominator

# User - Item Filtering

### Using cosine distance to find similarity between a certain -UserID- and other users on the basis of how their item ratings

### Then using similarity value trying to predict a value of how would the -UserID- like a certain Joke.

In [9]:
# ... Helper Function ... #

def initList(uC,jC):
    arr = []
    for i in range(uC):
        arr.append([])
    ratings = np.zeros((uC,jC))
    return arr,ratings

In [10]:
'''
userJokes -> userJokes[i] will store the list joke ID's rated by the user with ID = i
userRatings -> userRatings[i] will be a numpy array having values of the ratings given by user with ID = i
'''

userJokes, userRatings = initList(userCount,jokeCount)
for row in mainFrame.itertuples():
    userJokes[row[1]].append(row[2])
    userRatings[row[1]][row[2]] = row[3]

In [11]:
'''
avgRatings -> avgRatings[i] will store the average rating of a user.
'''
avgRatings = np.zeros(userCount)
for i in range(userCount):
    avgRatings[i] = np.average(userRatings[i])

In [12]:
'''
getTargetRatings() -> This function takes the target user ID. Splits its rated jokes into two sets:
                    (Split = 10%)
                        Train - For finding similarities.
                        Test - To Test the method
                    It will return a numpy array of ratings of the ID's in 
                    the train set along with train and test list's

--------

getOtherRatings() -> This function will reduce down ratings of all the other users and retrive the ratings of only
                    the jokes that need to be compared with to find similarity.
'''
def getTargetRatings(targetID):
    targetUserJokes = userJokes[targetID]
    seperator = int(len(targetUserJokes)*0.1)
    testingJokes = targetUserJokes[:seperator]
    trainingJokes = targetUserJokes[seperator:]
    targetRatings = np.zeros(len(trainingJokes))
    for ID in range(len(trainingJokes)):
        targetRatings[ID] = userRatings[targetID][trainingJokes[ID]]
    return targetRatings,trainingJokes,testingJokes

def getOtherRatings(targetUserJokes,ratings):
    other = np.zeros(len(targetUserJokes))
    for ID in range(len(targetUserJokes)):
        other[ID] = ratings[targetUserJokes[ID]]
    return other

In [13]:
'''
findSimilarities() -> This function will loop over all the user's and find user's who have rated the same jokes
                        with ID's in the train list and find a Cosine Similarity Value between those two.
                    Returns a dictionary of UserID:Similarity type.
'''

def findSimilarities(targetID,target,trainingJokes):
    targetSet = set(trainingJokes)
    targetCount = len(targetSet)
    similarities = {}
    for i in range(userCount):
        if (i==targetID):
            continue
        else:
            otherSet = set(userJokes[i])
            count = len(otherSet.intersection(targetSet))
            if(count==targetCount):
                other = getOtherRatings(trainingJokes,userRatings[i])
                similarities[i] = cosineDistance(target,other)
    return similarities

In [14]:
'''
makeThePrediction() -> This function is used to test the method and calulated an expected rating
                        on the basis of most similar user's and the rating that those certain 
                        user's gave to a joke.
'''

def makeThePredcition(similar,ID, targetID):
    numerator = 0
    totalSim = 0
    for user in similar:
        normalizedRating = userRatings[user[0]][ID]
        numerator = numerator + (user[1]*normalizedRating)
        totalSim = totalSim + user[1]
    pred = numerator/totalSim
    return pred

In [15]:
'''
predict() -> It is used to make a prediction and simultaneouly check on how it is performing.
'''

def predict(targetID,bestof):
    targetRatings,trainingJokes,testingJokes = getTargetRatings(targetID)
    similarities = findSimilarities(targetID,targetRatings,trainingJokes)
    otherUsers = sorted(similarities.items(), key=operator.itemgetter(1), reverse=True)
    loss = 0
    for testID in testingJokes:
        mostSimilar = []
        for otherUser in otherUsers:
            if (testID in userJokes[otherUser[0]]):
                mostSimilar.append(otherUser)
            if len(mostSimilar)==bestof:
                break
        pred = makeThePredcition(mostSimilar,testID, targetID)
        actual = userRatings[targetID][testID]
        error = (pred-actual)**2
        loss += error
        print("Actual =",actual,"----- Predicted =",round(pred,2),"----- Error =",error)
    print()
    mse = loss/len(testingJokes)
    print("Total RMSE =",mse**0.5, "------- Total MSError",mse)

In [16]:
predict(0,5)

Actual = -7.82 ----- Predicted = -0.0 ----- Error = 61.1456541578109
Actual = 8.79 ----- Predicted = -3.09 ----- Error = 141.06754252188455
Actual = -9.66 ----- Predicted = -4.98 ----- Error = 21.87619845216674
Actual = -8.16 ----- Predicted = -8.09 ----- Error = 0.005533356692395098
Actual = -7.52 ----- Predicted = -2.61 ----- Error = 24.153064508190106
Actual = -8.5 ----- Predicted = 1.2 ----- Error = 94.14797114473382
Actual = -9.85 ----- Predicted = -6.76 ----- Error = 9.542777370877875

Total RMSE = 7.0906250128739545 ------- Total MSError 50.27696307319377


In [17]:
predict(13,5)

Actual = 9.22 ----- Predicted = 6.36 ----- Error = 8.18692430854605
Actual = 9.27 ----- Predicted = 6.78 ----- Error = 6.2200354279425305
Actual = 9.22 ----- Predicted = 7.99 ----- Error = 1.5015298026605937
Actual = 8.3 ----- Predicted = 8.28 ----- Error = 0.0004999305040423537
Actual = 7.43 ----- Predicted = 1.85 ----- Error = 31.150596041070365
Actual = 0.44 ----- Predicted = 8.43 ----- Error = 63.86173333960625
Actual = 3.5 ----- Predicted = -2.45 ----- Error = 35.39873020233972
Actual = 8.16 ----- Predicted = -1.43 ----- Error = 92.02870407097939
Actual = 5.97 ----- Predicted = 7.4 ----- Error = 2.056707105098695
Actual = 8.98 ----- Predicted = 8.16 ----- Error = 0.6797024627818821

Total RMSE = 4.910042389751126 ------- Total MSError 24.10851626915295


In [18]:
predict(164,5)

Actual = 6.36 ----- Predicted = 6.05 ----- Error = 0.09866459873908044
Actual = 9.27 ----- Predicted = 6.43 ----- Error = 8.087668747833982
Actual = 7.04 ----- Predicted = 5.65 ----- Error = 1.9199258950720386
Actual = 4.85 ----- Predicted = 1.83 ----- Error = 9.121255801351762
Actual = 9.27 ----- Predicted = 8.78 ----- Error = 0.23764141926304844
Actual = 5.24 ----- Predicted = -1.12 ----- Error = 40.487565424225316

Total RMSE = 3.1610315269566995 ------- Total MSError 9.992120314414205


In [19]:
predict(832,5)

Actual = 2.09 ----- Predicted = -1.33 ----- Error = 11.667395096738563
Actual = -6.17 ----- Predicted = -1.78 ----- Error = 19.26115295305719
Actual = -5.73 ----- Predicted = -2.28 ----- Error = 11.86863567173033

Total RMSE = 3.7769998553316575 ------- Total MSError 14.26572790717536


In [20]:
predict(1432,5)

Actual = -8.06 ----- Predicted = -0.67 ----- Error = 54.65360517532493
Actual = 0.97 ----- Predicted = 4.06 ----- Error = 9.562529994557906
Actual = 1.84 ----- Predicted = -0.83 ----- Error = 7.1454580030115915
Actual = 3.11 ----- Predicted = 3.54 ----- Error = 0.1830896909461598
Actual = -0.49 ----- Predicted = 6.2 ----- Error = 44.73601857805843
Actual = 1.31 ----- Predicted = -0.74 ----- Error = 4.197205935174834

Total RMSE = 4.4810323843409465 ------- Total MSError 20.07965122951231


# Item - Item Filtering

### Using cosine distance finding similar jokes on the basis of the ratings they have recieved which could be suggested to user's

In [21]:
'''
jokeRatings -> jokeRatings[i] will store the value of the ratings all the users gave to a joke with ID = i

jokeAverage -> jokeAverage[i] = averageOf(jokeRatings[i])
'''


jokeRatings = np.zeros((jokeCount,userCount))
jokeAverage = np.zeros((jokeCount))

In [22]:
for row in mainFrame.itertuples():
    jokeRatings[row[2]][row[1]] = row[3]

for i in range(jokeCount):
    jokeAverage[i] = np.average(jokeRatings[i])

In [23]:
'''
predictItem()-> Predicts an item based on the most similar rated jokes. And prints the most similar joke ID's
'''

def predictItem(jokeID, bestof):
    targetRatings = jokeRatings[jokeID]
    similarities = {}
    for i in range(jokeCount):
        if(i==jokeID):
            continue
        else:
            similarities[i] = cosineDistance(targetRatings,jokeRatings[i])
    similarJokes = sorted(similarities.items(), key=operator.itemgetter(1), reverse=True)
    print("Following Joke ID's are similar to Joke:",jokeID,"--- Average Joke Score:",jokeAverage[jokeID])
    for i in range(bestof):
        print("ID:",similarJokes[i][0],"----- Score:",round(jokeAverage[similarJokes[i][0]],3))

In [24]:
predictItem(0,5)

Following Joke ID's are similar to Joke: 0 --- Average Joke Score: 0.8035549999999999
ID: 1 ----- Score: 0.533
ID: 9 ----- Score: 1.168
ID: 2 ----- Score: 0.41
ID: 37 ----- Score: 1.273
ID: 55 ----- Score: 2.039


In [25]:
predictItem(13,5)

Following Joke ID's are similar to Joke: 13 --- Average Joke Score: 1.391625
ID: 25 ----- Score: 1.57
ID: 11 ----- Score: 1.448
ID: 52 ----- Score: 3.101
ID: 28 ----- Score: 3.151
ID: 41 ----- Score: 1.904


In [26]:
predictItem(52,5)

Following Joke ID's are similar to Joke: 52 --- Average Joke Score: 3.10071
ID: 49 ----- Score: 3.786
ID: 35 ----- Score: 3.398
ID: 31 ----- Score: 3.02
ID: 34 ----- Score: 3.151
ID: 68 ----- Score: 2.701


In [27]:
predictItem(82,5)

Following Joke ID's are similar to Joke: 82 --- Average Joke Score: 0.7829299999999999
ID: 71 ----- Score: 0.915
ID: 92 ----- Score: 0.979
ID: 88 ----- Score: 1.428
ID: 75 ----- Score: 0.788
ID: 79 ----- Score: 0.391


In [28]:
predictItem(99,5)

Following Joke ID's are similar to Joke: 99 --- Average Joke Score: 0.54311
ID: 51 ----- Score: 0.293
ID: 50 ----- Score: -0.193
ID: 97 ----- Score: 0.153
ID: 84 ----- Score: 0.372
ID: 79 ----- Score: 0.391
