In [7]:
import pandas as pd
import numpy as np
from pyparsing import col
from requests import head
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from torch import rand
import seaborn as sns
import matplotlib.pyplot as plt

# All MovieLens-100k files are read from a single directory so the location
# only has to be changed in one place. Forward slashes are used because they
# work on both Windows and POSIX, unlike the '\\' separators used previously.
DATA_DIR = 'data/ml-100k'

# Ratings table (tab-separated, no header row in the file).
df = pd.read_csv(DATA_DIR + '/u.data', sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']

# user id, age, gender, occupation, zip code
users = pd.read_csv(DATA_DIR + '/u.user', sep='|', header=None)
users.columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

genre = pd.read_csv(DATA_DIR + '/u.genre', sep='|', header=None)
genre.columns = ['genre', 'genre_id']

job = pd.read_csv(DATA_DIR + '/u.occupation', sep='|', header=None)
job.columns = ['occupation']
# Turn the positional row index into an integer occupation code named 'job'
# so occupations can be used as a numeric column after merging.
job = job.reset_index().rename(columns={'index': 'job'})

# Before running this on a freshly downloaded copy of u.item, replace every
# '||' with '|' in the file (ctrl+f); otherwise the column assignment below
# does not work.
# NOTE(review): the list below has 23 names, so it only matches the file after
# that manual edit (assigning it to a frame with a different column count
# raises). The third name is kept as 'video_release_date' for compatibility
# with later cells -- TODO confirm which date field that column really holds.
items = pd.read_csv(DATA_DIR + '/u.item', sep='|', header=None)
items.columns = [
    'movie_id', 'movie_title', 'video_release_date', 'imdb_url',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
]

# Attach user attributes, the integer occupation code and the movie metadata
# to every rating. Left joins keep every rating row even when a lookup table
# has no matching key.
data = (
    df.merge(users, how='left', on='user_id')
      .merge(job, how='left', on='occupation')
      .merge(items, how='left', left_on='item_id', right_on='movie_id')
)

# A few rows end up without a Western value. According to the original
# inspection they all belong to a single movie (the one titled 'unknown'),
# so those rows are dropped rather than filled with 0. Once the NaNs are gone
# the column can be cast to a plain integer.
data = data[data.Western.notna()].astype({'Western': 'int'})

# Numeric gender encoding for modelling: 'M' -> 0, any other value -> 1.
data['gender_id'] = np.where(data.gender == 'M', 0, 1)

# Target ('rating') followed by every predictor used by the regression.
features = ['rating', 'age', 'gender_id', 'job', 'unknown', 'Action', 'Adventure', 'Animation','Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western']

data_x = data[features].drop(columns=['rating'])
# Double brackets keep the target as a one-column DataFrame, so coef_ and the
# predictions below are 2-D.
data_y = data[['rating']]

# train_test_split returns its splits in the order
# (X_train, X_test, y_train, y_test). The names are unpacked in that same
# order so that *_x always holds features and *_y always holds the target.
# Previously the middle two names were swapped, which made `train_y` hold the
# test features and `test_x` hold the training target.
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.33, random_state=123)

# Fit on the training split only.
model = LinearRegression().fit(train_x, train_y)

# R^2 on each split, computed once and reused by the prints below.
train_r_sq = model.score(train_x, train_y)
test_r_sq = model.score(test_x, test_y)

print(model.intercept_, model.coef_, test_r_sq)

print('\n ------------- Model Intercept ------------- \n', sep='')
print(' ------------- ', model.intercept_, ' -------------', sep='')

print('\n ------------- Model Coefficients ------------- \n', sep='')
print(' ------------- ', model.coef_, ' -------------', sep='')

print('\n ------------- Model Score ------------- \n', sep='')
print(' ------------- ', round(test_r_sq, 3), ' -------------', sep='')

print('\n \n \n \n \n \n')

r_sq = round(train_r_sq, 3)
print('coefficient of determination training set:', r_sq)

# This is the held-out score, so it is labelled as the test set (the label
# previously said "training set" for both numbers).
r_sq2 = round(test_r_sq, 3)
print('coefficient of determination test set:', r_sq2)

# Predictions for the held-out rows. `pred` gets a fresh 0..n-1 index, so the
# test frame's index is reset before joining to line the rows up by position.
pred = pd.DataFrame(model.predict(test_x), columns=['pred_rating'])
test_data = test_x.join(test_y).reset_index().join(pred)

# Apply the theme before drawing: it only affects axes created afterwards, so
# calling it after each plot (as before) never styled the plot it followed.
sns.set(style='whitegrid')

# Give every chart its own Axes. Without an explicit `ax`, all three seaborn
# calls in this cell draw onto the same current Axes and overlay each other.
fig, (ax_scatter, ax_by_rating, ax_by_gender) = plt.subplots(1, 3, figsize=(18, 5))

sns.scatterplot(data=test_data, x='rating', y='pred_rating', hue='pred_rating', ax=ax_scatter)
ax_scatter.set_title('Predicted vs. actual rating')

sns.boxplot(data=test_data, x='rating', y='pred_rating', ax=ax_by_rating)
ax_by_rating.set_title('Predicted rating by actual rating')

# gender_id is the numeric encoding built earlier in this cell (0 = 'M', 1 = otherwise).
sns.boxplot(data=test_data, x='gender_id', y='pred_rating', ax=ax_by_gender)
ax_by_gender.set_title('Predicted rating by gender_id')

fig.tight_layout()
plt.show()

## User-User Collaborative Filtering Using Pearson Correlation 

In [8]:
# Preview the first rows of the merged frame built in the previous cell
# (ratings joined with user, occupation-code and movie columns).
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code,job,movie_id,...,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western,gender_id
0,196,242,3,881250949,49,M,writer,55105,20,242,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,0,6,302,...,1,0,0,1,0,0,1,0,0,1
2,22,377,1,878887116,25,M,writer,40206,20,377,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,28,M,technician,80525,19,51,...,0,0,0,0,1,0,0,1,1,0
4,166,346,1,886397596,47,M,educator,55113,3,346,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# User x movie rating matrix: one row per user_id, one column per movie_id,
# cell value = rating. Pairs with no rating in `data` become NaN.
data_matrix = data.pivot(index="user_id", columns="movie_id", values="rating")
# TODO: Remove users with fewer than N reviews
# TODO: Remove movies with fewer than M reviews

# Subtract each user's own mean rating from that user's row. mean() skips
# NaNs by default, so the mean is taken over rated movies only, and unrated
# cells stay NaN after the subtraction.
user_mean_rating = data_matrix.mean(axis=1)
mean_centered_data_matrix = data_matrix.sub(user_mean_rating, axis=0)

# NOTE(review): most cells of this matrix are NaN (see the preview below).
# Correlation routines that reject missing values fail on it with
# "array must not contain infs or NaNs"; restrict each user pair to co-rated
# movies or use a NaN-aware method (e.g. pandas' pairwise DataFrame.corr)
# -- TODO confirm against the similarity code that follows.
mean_centered_data_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.391144,-0.608856,0.391144,-0.608856,-0.608856,1.391144,0.391144,-2.608856,1.391144,-0.608856,...,,,,,,,,,,
2,0.290323,,,,,,,,,-1.709677,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.132184,0.132184,,,,,,,,,...,,,,,,,,,,


ValueError: array must not contain infs or NaNs