# Recommendation Engines Calculations - Content Based

## *Author: Ashley Qing Loh*

## 1. Collected Data

In [37]:
import pandas as pd
import numpy as np

In [38]:
# Load the exercise workbook; header=1 takes the column labels from the
# second row of the sheet.
raw_data = pd.read_excel('exerciseCB.xlsx', header=1)

# Keep only the non-calculated inputs by leaving out the last 10 columns and
# the last 9 rows. Slicing into a fresh copy (instead of dropping in place)
# leaves raw_data untouched so the trimmed result can be compared against it.
data = raw_data.iloc[:-9, :-10].copy()

data

Unnamed: 0.1,Unnamed: 0,Sports,Books,Leadership,Philosophy,Society,Fiction,Security,Love,VideoGames,Superheroes,Unnamed: 11,Unnamed: 12,Unnamed: 13,User 1,User 2,User 3,User 4
0,question1,1,0,1,0,1,1,0,0,0,1,,,,1.0,-1.0,,
1,question2,0,1,1,1,0,0,0,1,0,0,,,,-1.0,1.0,,
2,question3,0,0,0,1,1,1,0,0,0,0,,,,,,,
3,question4,0,0,1,1,0,0,1,1,0,0,,,,,1.0,,
4,question5,0,1,0,0,0,0,0,0,1,1,,,,,,1.0,
5,question6,1,0,0,1,0,0,0,0,0,0,,,,1.0,,,
6,question7,0,0,0,0,0,0,0,1,0,1,,,,,,-1.0,
7,question8,0,0,1,1,0,0,1,0,0,1,,,,,,1.0,
8,question9,0,0,0,0,0,1,0,0,1,0,,,,,,,
9,question10,0,1,0,0,1,0,1,0,0,0,,,,,,,


## 2. Building a Content-based Filtering Engine

In [39]:
# Split the cleaned sheet into two tables, both indexed by the question label
# held in the 'Unnamed: 0' column.

# Positions 0-10: the question label plus the ten topic columns.
Questions = data.iloc[:, :11].set_index('Unnamed: 0')

# From the first 18 columns, discard positions 1-13 so that only the question
# label and the user rating columns are left.
non_rating_columns = data.columns[list(range(1, 14))]
User_Rating = (
    data.iloc[:, :18]
    .drop(non_rating_columns, axis=1)
    .set_index('Unnamed: 0')
)
User_Rating

Unnamed: 0_level_0,User 1,User 2,User 3,User 4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
question1,1.0,-1.0,,
question2,-1.0,1.0,,
question3,,,,
question4,,1.0,,
question5,,,1.0,
question6,1.0,,,
question7,,,-1.0,
question8,,,1.0,
question9,,,,
question10,,,,


### 2.1 Simply Unary

In [40]:
# Calculate user profiles using Simply Unary
#
# profile[user, topic] = sum over questions of rating[question, user] * Questions[question, topic]
#
# A missing rating contributes nothing to the sum, so missing values are
# treated as 0 and the whole table is computed as a single matrix product.
# The result has one row per column of User_Rating (however many users there
# are), one column per topic, and a float dtype.
Simply_Unary_UP = User_Rating.fillna(0).T.dot(Questions.fillna(0))

Simply_Unary_UP

Unnamed: 0,Sports,Books,Leadership,Philosophy,Society,Fiction,Security,Love,VideoGames,Superheroes
User 1,3,-2,-1,0,0,2,-1,-1,1,0
User 2,-2,2,2,3,-1,-2,0,3,0,-1
User 3,-2,1,1,0,0,-3,-1,-2,0,1
User 4,0,0,0,0,0,0,0,0,0,0


In [41]:
# Calculate user predictions using Simply Unary
#
# The prediction for a (question, user) pair is the cosine similarity between
# the question's topic vector and the user's profile vector.

# Numeric inputs with missing entries as 0 so they add nothing to the sums
# below. The profiles are cast to float in case the profile table is stored
# with object dtype.
question_vectors = Questions.fillna(0)
user_profiles = Simply_Unary_UP.astype(float).fillna(0)

# Table 1: dot product of every question vector with every user profile
# (rows = questions, columns = users)
Simply_Unary_T1 = question_vectors.dot(user_profiles.T)

# Euclidean length of each question vector
sqrt_by_q = np.sqrt((question_vectors ** 2).sum(axis=1))
# Euclidean length of each user profile, kept as a one-row frame with one
# column per user
sqrt_by_u = np.sqrt((user_profiles ** 2).sum(axis=1)).to_frame().transpose()

# Table 2: product of the two lengths for every (question, user) pair
Simply_Unary_T2 = pd.DataFrame(
    np.outer(sqrt_by_q, sqrt_by_u.iloc[0]),
    index=sqrt_by_q.index,
    columns=sqrt_by_u.columns,
)

# Divide table 1 by table 2 to get predictions. A user whose profile is all
# zeros gives 0/0 = NaN, which is reported as a neutral 0.
Simply_Unary_PRED = (Simply_Unary_T1/Simply_Unary_T2).fillna(0)
Simply_Unary_PRED

Unnamed: 0_level_0,User 1,User 2,User 3,User 4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
question1,0.39036,-0.298142,-0.29277,0.0
question2,-0.436436,0.833333,0.0,0.0
question3,0.251976,0.0,-0.377964,0.0
question4,-0.327327,0.666667,-0.218218,0.0
question5,-0.125988,0.096225,0.251976,0.0
question6,0.46291,0.117851,-0.308607,0.0
question7,-0.154303,0.235702,-0.154303,0.0
question8,-0.218218,0.333333,0.109109,0.0
question9,0.46291,-0.235702,-0.46291,0.0
question10,-0.377964,0.096225,0.0,0.0


In [42]:
# Count total likes, dislikes and neutral predictions for each user

# Each comparison gives a boolean frame, so summing a column counts the
# questions whose prediction is positive / negative / exactly zero.
Likes = (Simply_Unary_PRED > 0).sum()
Dislikes = (Simply_Unary_PRED < 0).sum()
Neutral = (Simply_Unary_PRED == 0).sum()

# Build the summary straight from the three count Series (one row per
# category, one column per user) so the counts keep an integer dtype.
Simply_Unary_Count = pd.DataFrame(
    {'Likes': Likes, 'Dislikes': Dislikes, 'Neutral': Neutral}
).T

Simply_Unary_Count

Unnamed: 0,User 1,User 2,User 3,User 4
Likes,7,15,5,0
Dislikes,11,4,10,0
Neutral,2,1,5,20


In [43]:
# Top 5 question recommendations for each user
#
# Only questions the user has not rated are candidates, ranked by predicted
# score. keep="all" retains every question tied with the 5th-highest score,
# so a user's list can be longer than 5.
top_unrated = {}
for position, user in enumerate(['User 1', 'User 2', 'User 3', 'User 4']):
    unrated_questions = User_Rating[User_Rating[user].isnull()].index
    candidates = Simply_Unary_PRED[Simply_Unary_PRED.index.isin(unrated_questions)]
    top_unrated[user] = candidates.iloc[:, position].nlargest(5, keep="all").index

User_1 = top_unrated['User 1']
User_2 = top_unrated['User 2']
User_3 = top_unrated['User 3']
User_4 = top_unrated['User 4']

# User 4 is computed above but is not part of the displayed table.
mydict = {'User 1':User_1, 'User 2':User_2, 'User 3':User_3}

# Wrapping each index in a Series lets lists of different lengths share one
# frame; shorter lists are padded with NaN.
Top_Recommendations = pd.DataFrame(
    {key: pd.Series(value) for key, value in mydict.items()}
)
Top_Recommendations

Unnamed: 0,User 1,User 2,User 3
0,question12,question13,question14
1,question9,question14,question19
2,question3,question20,question11
3,question18,question18,question2
4,question11,question8,question10
5,question15,question15,question17
6,,,question18
7,,,question20


### 2.2 Unit Weight

In [44]:
# Unit-weight question vectors: divide every row of Questions by that row's
# total, so a question's weights sum to 1 (when the total is non-zero).
row_totals = Questions.sum(axis=1)
Questions_UW = Questions.divide(row_totals, axis='index')
Questions_UW

Unnamed: 0_level_0,Sports,Books,Leadership,Philosophy,Society,Fiction,Security,Love,VideoGames,Superheroes
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
question1,0.2,0.0,0.2,0.0,0.2,0.2,0.0,0.0,0.0,0.2
question2,0.0,0.25,0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0
question3,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.0
question4,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.0,0.0
question5,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333
question6,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
question7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5
question8,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.0,0.0,0.25
question9,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0
question10,0.0,0.333333,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0


In [45]:
# Calculate user profiles using Unit Weight
#
# profile[user, topic] = sum over questions of rating[question, user] * Questions_UW[question, topic]
#
# A missing rating (or a missing weight) contributes nothing to the sum, so
# missing values are treated as 0 and the whole table is computed as a single
# matrix product. The result has one row per column of User_Rating (however
# many users there are), one column per topic, and a float dtype.
Unit_Weight_UP = User_Rating.fillna(0).T.dot(Questions_UW.fillna(0))

Unit_Weight_UP

Unnamed: 0,Sports,Books,Leadership,Philosophy,Society,Fiction,Security,Love,VideoGames,Superheroes
User 1,1.03333,-0.45,-0.25,0.25,0.0,0.533333,-0.2,-0.25,0.333333,0.0
User 2,-0.533333,0.5,0.55,0.75,-0.2,-0.533333,-0.0833333,0.75,0.0,-0.2
User 3,-0.666667,0.333333,0.25,0.0,0.0,-0.916667,-0.333333,-0.75,0.0,0.0833333
User 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Calculate user predictions using Unit Weight
#
# The prediction for a (question, user) pair is the cosine similarity between
# the question's unit-weight vector and the user's profile vector.

# Numeric inputs with missing entries as 0 so they add nothing to the sums
# below. The profiles are cast to float in case the profile table is stored
# with object dtype.
question_vectors = Questions_UW.fillna(0)
user_profiles = Unit_Weight_UP.astype(float).fillna(0)

# Table 1: dot product of every question vector with every user profile
# (rows = questions, columns = users)
Unit_Weight_T1 = question_vectors.dot(user_profiles.T)

# Euclidean length of each question vector
sqrt_by_q = np.sqrt((question_vectors ** 2).sum(axis=1))
# Euclidean length of each user profile, kept as a one-row frame with one
# column per user
sqrt_by_u = np.sqrt((user_profiles ** 2).sum(axis=1)).to_frame().transpose()

# Table 2: product of the two lengths for every (question, user) pair
Unit_Weight_T2 = pd.DataFrame(
    np.outer(sqrt_by_q, sqrt_by_u.iloc[0]),
    index=sqrt_by_q.index,
    columns=sqrt_by_u.columns,
)

# Divide table 1 by table 2 to get predictions. A user whose profile is all
# zeros gives 0/0 = NaN, which is reported as a neutral 0.
Unit_Weight_PRED = (Unit_Weight_T1/Unit_Weight_T2).fillna(0)
Unit_Weight_PRED

Unnamed: 0_level_0,User 1,User 2,User 3,User 4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
question1,0.427934,-0.268373,-0.382235,0.0
question2,-0.254363,0.834683,-0.05698,0.0
question3,0.328679,0.006299,-0.361873,0.0
question4,-0.163519,0.643743,-0.284901,0.0
question5,-0.048952,0.113389,0.164488,0.0
question6,0.659494,0.100297,-0.322329,0.0
question7,-0.128473,0.254601,-0.322329,0.0
question8,-0.072675,0.332782,0.0,0.0
question9,0.445373,-0.246885,-0.443203,0.0
question10,-0.272734,0.081892,0.0,0.0


In [47]:
# Count total likes, dislikes and neutral predictions for each user

# Each comparison gives a boolean frame, so summing a column counts the
# questions whose prediction is positive / negative / exactly zero.
Likes = (Unit_Weight_PRED > 0).sum()
Dislikes = (Unit_Weight_PRED < 0).sum()
Neutral = (Unit_Weight_PRED == 0).sum()

# Build the summary straight from the three count Series (one row per
# category, one column per user) so the counts keep an integer dtype.
Unit_Weight_Count = pd.DataFrame(
    {'Likes': Likes, 'Dislikes': Dislikes, 'Neutral': Neutral}
).T

Unit_Weight_Count

Unnamed: 0,User 1,User 2,User 3,User 4
Likes,10,16,4,0
Dislikes,10,4,13,0
Neutral,0,0,3,20


In [48]:
# Top 5 question recommendations for each user
#
# Only questions the user has not rated are candidates, ranked by predicted
# score. keep="all" retains every question tied with the 5th-highest score,
# so a user's list can be longer than 5.
top_unrated = {}
for position, user in enumerate(['User 1', 'User 2', 'User 3', 'User 4']):
    unrated_questions = User_Rating[User_Rating[user].isnull()].index
    candidates = Unit_Weight_PRED[Unit_Weight_PRED.index.isin(unrated_questions)]
    top_unrated[user] = candidates.iloc[:, position].nlargest(5, keep="all").index

User_1 = top_unrated['User 1']
User_2 = top_unrated['User 2']
User_3 = top_unrated['User 3']
User_4 = top_unrated['User 4']

# User 4 is computed above but is not part of the displayed table.
mydict = {'User 1':User_1, 'User 2':User_2, 'User 3':User_3}

# Wrapping each index in a Series lets lists of different lengths share one
# frame; shorter lists are padded with NaN.
Top_Recommendations = pd.DataFrame(
    {key: pd.Series(value) for key, value in mydict.items()}
)
Top_Recommendations

Unnamed: 0,User 1,User 2,User 3
0,question12,question13,question14
1,question9,question14,question19
2,question3,question20,question11
3,question18,question18,question10
4,question15,question8,question18


### 2.3 IDF

In [49]:
# Calculate DF and IDF for each topic

# DF: number of questions in which the topic has a positive weight
DF = (Questions_UW > 0).sum()
# IDF: log10 of (non-missing question entries for the topic / DF)
IDF = np.log10(Questions_UW.count() / DF)
IDF

Sports         0.698970
Books          0.522879
Leadership     0.301030
Philosophy     0.259637
Society        0.522879
Fiction        0.522879
Security       0.455932
Love           0.522879
VideoGames     0.455932
Superheroes    0.602060
dtype: float64

In [50]:
# Calculate user profiles using IDF
#
# profile[user, topic] = IDF[topic] * sum over questions of rating[question, user] * Questions_UW[question, topic]
#
# A missing rating (or a missing weight) contributes nothing to the sum, so
# missing values are treated as 0 and the rating-weighted sums are computed
# as a single matrix product; each topic column is then scaled by its IDF.
# The result has one row per column of User_Rating (however many users there
# are), one column per topic, and a float dtype.
IDF_UP = User_Rating.fillna(0).T.dot(Questions_UW.fillna(0)).mul(IDF, axis=1)

IDF_UP

Unnamed: 0,Sports,Books,Leadership,Philosophy,Society,Fiction,Security,Love,VideoGames,Superheroes
User 1,0.722269,-0.235295,-0.0752575,0.0649093,0.0,0.278869,-0.0911864,-0.13072,0.151977,0.0
User 2,-0.372784,0.261439,0.165566,0.194728,-0.104576,-0.278869,-0.0379943,0.392159,0.0,-0.120412
User 3,-0.46598,0.174293,0.0752575,0.0,0.0,-0.479306,-0.151977,-0.392159,0.0,0.0501717
User 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Calculate user predictions using IDF
#
# The prediction for a (question, user) pair is the cosine similarity between
# the question's unit-weight vector and the user's IDF-scaled profile vector.

# Numeric inputs with missing entries as 0 so they add nothing to the sums
# below. The profiles are cast to float in case the profile table is stored
# with object dtype.
question_vectors = Questions_UW.fillna(0)
user_profiles = IDF_UP.astype(float).fillna(0)

# Table 1: dot product of every question vector with every user profile
# (rows = questions, columns = users)
IDF_T1 = question_vectors.dot(user_profiles.T)

# Euclidean length of each question vector
sqrt_by_q = np.sqrt((question_vectors ** 2).sum(axis=1))
# Euclidean length of each user profile, kept as a one-row frame with one
# column per user
sqrt_by_u = np.sqrt((user_profiles ** 2).sum(axis=1)).to_frame().transpose()

# Table 2: product of the two lengths for every (question, user) pair
IDF_T2 = pd.DataFrame(
    np.outer(sqrt_by_q, sqrt_by_u.iloc[0]),
    index=sqrt_by_q.index,
    columns=sqrt_by_u.columns,
)

# Divide table 1 by table 2 to get predictions. A user whose profile is all
# zeros gives 0/0 = NaN, which is reported as a neutral 0.
IDF_PRED = (IDF_T1/IDF_T2).fillna(0)
IDF_PRED

Unnamed: 0_level_0,User 1,User 2,User 3,User 4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
question1,0.490309,-0.436363,-0.450526,0.0
question2,-0.222832,0.695633,-0.087616,0.0
question3,0.235027,-0.149509,-0.340032,0.0
question4,-0.13751,0.490191,-0.28807,0.0
question5,-0.056961,0.111728,0.159241,0.0
question6,0.659111,-0.172767,-0.404874,0.0
question7,-0.109453,0.263674,-0.297141,0.0
question8,-0.060115,0.138516,-0.016311,0.0
question9,0.360751,-0.270584,-0.416452,0.0
question10,-0.223202,0.094173,0.015831,0.0


In [52]:
# Count total likes, dislikes and neutral predictions for each user

# Each comparison gives a boolean frame, so summing a column counts the
# questions whose prediction is positive / negative / exactly zero.
Likes = (IDF_PRED > 0).sum()
Dislikes = (IDF_PRED < 0).sum()
Neutral = (IDF_PRED == 0).sum()

# Build the summary straight from the three count Series (one row per
# category, one column per user) so the counts keep an integer dtype.
IDF_Count = pd.DataFrame(
    {'Likes': Likes, 'Dislikes': Dislikes, 'Neutral': Neutral}
).T

IDF_Count

Unnamed: 0,User 1,User 2,User 3,User 4
Likes,10,14,5,0
Dislikes,10,6,14,0
Neutral,0,0,1,20


In [53]:
# Top 5 question recommendations for each user
#
# Only questions the user has not rated are candidates, ranked by predicted
# score. keep="all" retains every question tied with the 5th-highest score,
# so a user's list can be longer than 5.
top_unrated = {}
for position, user in enumerate(['User 1', 'User 2', 'User 3', 'User 4']):
    unrated_questions = User_Rating[User_Rating[user].isnull()].index
    candidates = IDF_PRED[IDF_PRED.index.isin(unrated_questions)]
    top_unrated[user] = candidates.iloc[:, position].nlargest(5, keep="all").index

User_1 = top_unrated['User 1']
User_2 = top_unrated['User 2']
User_3 = top_unrated['User 3']
User_4 = top_unrated['User 4']

# User 4 is computed above but is not part of the displayed table.
mydict = {'User 1':User_1, 'User 2':User_2, 'User 3':User_3}

# Wrapping each index in a Series lets lists of different lengths share one
# frame; shorter lists are padded with NaN.
Top_Recommendations = pd.DataFrame(
    {key: pd.Series(value) for key, value in mydict.items()}
)
Top_Recommendations

Unnamed: 0,User 1,User 2,User 3
0,question12,question13,question14
1,question9,question14,question19
2,question3,question7,question11
3,question18,question20,question10
4,question15,question18,question18
