## Standard Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Importing Data

In [2]:
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv("./data/u.user",sep='|',names = u_cols,encoding='latin-1')

In [3]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
i_cols = ['movie_id','title','release date','video release date','IMDb URL','unknown','Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('./data/u.item', sep='|',names=i_cols, encoding='latin-1')

Dropping all unnecessary columns

In [5]:
movies = movies[['movie_id','title']]

In [6]:
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./data/u.data', sep='\t', names=r_cols, encoding='latin-1')

In [8]:
ratings = ratings.drop('timestamp',axis = 1)

In [9]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


## Defining the Problem

We are going to treat this as a regression problem<br>
Given a ***Movie Ratings*** we are going to predict similar ***User Ids*** and recommend movies highly rated by these users

## Splitting Training and Testing Data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X = ratings.copy()
Y = ratings['user_id']

In [12]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.25,stratify = Y)

In [13]:
from sklearn.metrics import mean_squared_error

## Defining Loss Function

In [14]:
def rmse(Y_true,Y_pred):
    return np.sqrt(mean_squared_error(Y_true,Y_pred))

## Defining Testing Function

In [15]:
def score(model):
    id_pairs = zip(X_test['user_id'],X_test['movie_id'])
    Y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
    Y_true = np.array(X_test['rating'])
    return rmse(Y_true,Y_pred)

## Defining Rating Matrix

A row of rating matrix holds rating given by a user to all the available movies<br>NaN means the user has not given any rating

In [16]:
r_matrix = X_train.pivot_table(values='rating',index = 'user_id',columns='movie_id')

In [17]:
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1665,1666,1669,1671,1672,1674,1677,1678,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,5.0,4.0,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


## Simple Mean

We'll start with a simple method, we'll input the mean rating provided by all the users to a particular movie as a feature to predict User Ids

In [18]:
def simple_mean(user_id,movie_id):
    if movie_id in r_matrix:
        return r_matrix[movie_id].mean()
    else:
        return 3

In [19]:
score(simple_mean)

1.0242109259508403

We get the following RMSE as the output

## Weighted Mean

Instead of just using the mean we'll use a weighted mean<br>
The weights will be the cosine similarities of the given user with other users

In [20]:
r_matrix_dummmy = r_matrix.copy().fillna(0)

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(r_matrix_dummmy,r_matrix_dummmy)

In [22]:
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)
cosine_sim.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.133087,0.027135,0.005704,0.310514,0.308916,0.340438,0.204723,0.016423,0.271532,...,0.330925,0.109434,0.218651,0.107064,0.13252,0.105178,0.236144,0.059996,0.151718,0.283841
2,0.133087,1.0,0.076378,0.226998,0.032288,0.25925,0.10616,0.092236,0.122885,0.134034,...,0.11882,0.199812,0.304854,0.368688,0.263314,0.222997,0.126015,0.108571,0.146481,0.063155
3,0.027135,0.076378,1.0,0.281951,0.0,0.070221,0.034959,0.066946,0.085768,0.036957,...,0.014063,0.018416,0.129184,0.06686,0.066671,0.019108,0.087982,0.076008,0.045641,0.0
4,0.005704,0.226998,0.281951,1.0,0.028112,0.071351,0.097864,0.145493,0.062248,0.041724,...,0.023816,0.046781,0.126115,0.203807,0.112354,0.0,0.086146,0.205949,0.16386,0.060328
5,0.310514,0.032288,0.0,0.028112,1.0,0.148999,0.260888,0.206361,0.006475,0.171423,...,0.229757,0.019463,0.05421,0.086208,0.114285,0.092893,0.131869,0.055227,0.185711,0.218312


In [23]:
def weighted_mean(user_id,movie_id):
    if movie_id in r_matrix:
        sim_scores = cosine_sim[user_id]
        m_ratings = r_matrix[movie_id]
        idx = m_ratings[m_ratings.isnull()].index
        m_ratings = m_ratings.dropna()
        sim_scores = sim_scores.drop(idx)
        return np.dot(sim_scores,m_ratings)/sim_scores.sum()
    else:
        return 3

In [24]:
score(weighted_mean)

1.018498422170648

We can see that with this approach we performed slightly better than the simple mean

## Demograph as Metrics

In [25]:
merge_df = pd.merge(ratings,users)

In [26]:
merge_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,196,242,3,49,M,writer,55105
1,196,393,4,49,M,writer,55105
2,196,381,4,49,M,writer,55105
3,196,251,3,49,M,writer,55105
4,196,655,5,49,M,writer,55105


In [27]:
X = merge_df.copy()
Y = merge_df['user_id']

In [28]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.25,stratify = Y)

In [29]:
merge_df = X_train

In [30]:
merge_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
81552,758,448,4,27,M,student,53706
98658,932,208,5,58,M,educator,6437
41705,385,657,4,36,M,writer,10003
72544,660,82,2,26,M,student,77380
85892,792,1197,4,40,M,programmer,12205


## Gender

We'll be using means of each gender respectively to distinguish between choices made by a male and a female

In [31]:
gender_mean = merge_df[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()  

In [32]:
gender_mean.head()

movie_id  sex
1         F      3.692308
          M      3.880952
2         F      3.250000
          M      3.186813
3         F      2.700000
Name: rating, dtype: float64

In [33]:
users = users.set_index('user_id')

In [58]:
def gender_mean_model(user_id,movie_id):
    if movie_id in r_matrix and movie_id in gender_mean:
        gender = users.loc[user_id]['sex']
        if gender in gender_mean[movie_id]:
            return gender_mean[movie_id][gender]
        else:
            return 3
    else:
        return 3

In [60]:
score(gender_mean_model)

1.0331217310043792

We can see that we perform poorly meaning gender has little to do with a user's choice regarding movies

## Gender and Occupation

In [62]:
gen_occ_mean = merge_df[['sex', 'rating', 'movie_id', 'occupation']].pivot_table(values='rating',index='movie_id',columns=['occupation', 'sex'],aggfunc='mean')

In [63]:
gen_occ_mean.head()

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,3.866667,4.1,4.333333,3.857143,3.5,3.1,3.875,4.0,3.96875,4.5,...,,4.0,3.5,4.0,3.909091,3.740741,4.0,3.933333,4.142857,3.166667
2,3.0,4.5,,3.0,,3.0,3.0,,3.0,,...,,,,3.0,2.8,3.241379,,2.666667,4.5,2.666667
3,3.0,2.5,,,,4.0,2.0,,3.5,,...,,1.0,,,1.666667,3.409091,,4.25,,1.5
4,3.0,3.285714,,4.666667,3.0,3.25,3.444444,4.0,3.5,,...,4.0,3.5,,3.5,3.428571,3.75,,3.25,4.333333,3.333333
5,,2.5,,,,4.0,1.0,,3.0,,...,,,,,4.5,2.909091,,3.0,4.0,2.0


In [64]:
def gen_oc_model(user_id,movie_id):
    if movie_id in gen_occ_mean.index:
        user = users.loc[user_id]
        gender = user['sex']
        occ = user['occupation']
        if occ in gen_occ_mean.loc[movie_id] and gender in gen_occ_mean.loc[movie_id][occ]:
            rating = gen_occ_mean.loc[movie_id][occ][gender]
            if np.isnan(rating):
                return 3
            return rating
    return 3

In [65]:
score(gen_oc_model)

1.1484833292604344

We perform fairly better but not better than the weighted mean