In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Load the data - MovieLens 100K

data = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
data.drop('timestamp', axis=1, inplace=True)

user_count = data['user_id'].nunique()
item_count = data['item_id'].nunique()
print('user_count: ', user_count)
print('item_count: ', item_count)

train_data = pd.read_csv(f'ml-100k/u1.base', sep="\t", header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
test_data = pd.read_csv(f'ml-100k/u1.test', sep="\t", header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
train_data.drop('timestamp', axis=1, inplace=True)
test_data.drop('timestamp', axis=1, inplace=True)

user_count:  943
item_count:  1682


In [16]:
# We will create feature vectors for each user and item from u.user and u.item respectively

# User features -> User ID|Age|Gender|Occupation|Zip-code -> We will use Age, Gender and Occupation, One Hot Encoded
# Age buckets -> 0-18, 18-24, 25-34, 35-44, 45-49, 50-55, 56+

# Read u.user into a dataframe
user_data = pd.DataFrame(columns=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
user_file = open('ml-100k/u.user', 'r')
for line in user_file:
    line = line.rstrip()
    line = line.split('|')
    user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)


  user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)
  user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)
  user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)
  user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)
  user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)
  user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)
  user_data = user_data.append(pd.DataFrame([line], columns=['user_id', 'age', 'gender', 'occupation', 'zip_code']), ignore_index=True)
  user_data = user_data.append(pd.DataFrame([lin

In [17]:
# One Hot encoding user_data
user_data['age'] = user_data['age'].astype(int)
# Age buckets -> 0-18, 18-24, 25-34, 35-44, 45-49, 50-55, 56+
user_data['age_0-18'] = np.where(user_data['age'] <= 18, 1, 0)
user_data['age_18-24'] = np.where((user_data['age'] > 18) & (user_data['age'] <= 24), 1, 0)
user_data['age_25-34'] = np.where((user_data['age'] > 24) & (user_data['age'] <= 34), 1, 0)
user_data['age_35-44'] = np.where((user_data['age'] > 34) & (user_data['age'] <= 44), 1, 0)
user_data['age_45-49'] = np.where((user_data['age'] > 44) & (user_data['age'] <= 49), 1, 0)
user_data['age_50-55'] = np.where((user_data['age'] > 49) & (user_data['age'] <= 55), 1, 0)
user_data['age_56+'] = np.where(user_data['age'] > 55, 1, 0)
user_data.drop('age', axis=1, inplace=True)

# Gender 
user_data['gender_M'] = np.where(user_data['gender']=='M', 1, 0)
user_data['gender_F'] = np.where(user_data['gender']=='F', 1, 0)
user_data.drop('gender', axis=1, inplace=True)

# Occupation -> One Hot Encoding using get dummies
user_data = pd.concat([user_data, pd.get_dummies(user_data['occupation'], prefix='occupation')], axis=1)
user_data.drop('occupation', axis=1, inplace=True)

# Zip code -> We will drop this column
user_data.drop('zip_code', axis=1, inplace=True)

user_data.head()

Unnamed: 0,user_id,age_0-18,age_18-24,age_25-34,age_35-44,age_45-49,age_50-55,age_56+,gender_M,gender_F,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,3,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,4,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,5,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [24]:
# Convert user_data to a dictionary
user_data_dict = {}
for index, row in user_data.iterrows():
    user_data_dict[int(row['user_id'])] = row[1:].to_numpy()


In [25]:
# Item features -> Item ID|Movie Title|Release Date|Video Release Date|IMDb URL|unknown|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western
# We will use the genres, One Hot Encoded

# Read u.item into a dataframe
item_data = pd.DataFrame(columns=['item_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
item_file = open('ml-100k/u.item', 'r')
for line in item_file:
    line = line.rstrip()
    line = line.split('|')
    item_data = item_data.append(pd.DataFrame([line], columns=['item_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']), ignore_index=True)

item_data.drop(['movie_title', 'release_date', 'video_release_date', 'imdb_url'], axis=1, inplace=True)
item_data = item_data.astype(int)
item_data.head()

  item_data = item_data.append(pd.DataFrame([line], columns=['item_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']), ignore_index=True)
  item_data = item_data.append(pd.DataFrame([line], columns=['item_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']), ignore_index=True)
  item_data = item_data.append(pd.DataFrame([line], columns=['item_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-N

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 2892: invalid continuation byte