In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import isnan, when, count, col


%config InlineBackend.figure_format = 'retina'

In [51]:
# Import Custom Modules
# NOTE(review): the wildcard import hides which helper names this notebook
# relies on. getReMax (used when building the movie 'year' feature) is not
# defined or imported anywhere else in the visible cells, so it presumably
# comes from src.helpers — confirm, and prefer importing names explicitly.
from src.helpers import *

In [18]:
# Read the training ratings and discard the timestamp column on load
ratings_df = (
    pd.read_csv('data/training.csv')
    .drop(columns=['timestamp'])
)

# Show the table dimensions, then preview the first rows
print(ratings_df.shape)
ratings_df.head()

(800000, 3)


Unnamed: 0,user,movie,rating
0,6040,858,4
1,6040,593,5
2,6040,2384,4
3,6040,1961,4
4,6040,2019,5


In [22]:
# Count missing values per column of the ratings table
ratings_df.isna().sum()

user      0
movie     0
rating    0
dtype: int64

In [23]:
# Load Movies DAT
# The file has no header row, so column names are supplied explicitly.
# A multi-character separator such as '::' is parsed as a regular expression,
# which is why the python parsing engine is requested.
movies_df = pd.read_csv(
    'data/movies.dat',
    sep="::",
    header=None,
    names=['movie_id', 'movie_title', 'genre'],
    engine='python',
)

# Compare the distinct movies present in the ratings with the loaded movie table
print(f"Original: {ratings_df['movie'].unique().shape} Load: {movies_df.shape}")
movies_df.head(2)

Original: (3662,) Load: (3883, 3)


Unnamed: 0,movie_id,movie_title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [43]:
# pd.isnull(movies_df).sum()

# movies_df.info()

In [50]:
# Load Users DAT
# The file has no header row, so column names are supplied explicitly, and the
# '::' separator requires the python parsing engine.
# zipcode is pinned to str: ZIP codes are identifiers, not numbers. Left to
# dtype inference, a file without any hyphenated ZIP would be parsed as
# integers, dropping leading zeros (e.g. "02460" -> 2460) and breaking the
# string-based clean-up applied to this column later in the notebook.
users_df = pd.read_csv('data/users.dat',
                        names=['user_id', 'gender', 'age', 'occ', 'zipcode'],
                        dtype={'zipcode': str},
                        engine='python',
                        delimiter="::",
                        header=None)

# Compare the distinct users present in the ratings with the loaded user table
print(f"Original: {ratings_df['user'].unique().shape} Load: {users_df.shape}")
users_df.head(2)

Original: (5399,) Load: (6040, 5)


Unnamed: 0,user_id,gender,age,occ,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072


## Create Movie Features

In [57]:
# Create Year Feature
# Each title is passed through the getReMax helper (defined outside this
# notebook) and the result is converted to int, e.g. "Toy Story (1995)" -> 1995.
movies_df['year'] = movies_df['movie_title'].map(lambda title: int(getReMax(title)))

In [59]:
# Remove Genre Pipe
# Vectorised literal replacement of the '|' genre separators with spaces.
# regex=False is essential: interpreted as a regular expression, '|' is an
# empty alternation that matches at every position in the string.
# Unlike a per-row Python lambda, missing genres stay NaN instead of raising.
movies_df['genre'] = movies_df['genre'].str.replace("|", " ", regex=False)

Unnamed: 0,movie_id,movie_title,genre,year
0,1,Toy Story (1995),Animation Children's Comedy,1995
1,2,Jumanji (1995),Adventure Children's Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy Romance,1995
3,4,Waiting to Exhale (1995),Comedy Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,2000
3879,3949,Requiem for a Dream (2000),Drama,2000
3880,3950,Tigerland (2000),Drama,2000
3881,3951,Two Family House (2000),Drama,2000


## Create User Features

In [40]:
# Create Clean Zipcode
# Keep only the text before the first '-' of each zipcode; values without a
# hyphen are kept unchanged.
users_df['zipclean'] = users_df['zipcode'].map(lambda zipcode: zipcode.split('-', 1)[0])

In [48]:
# Create Occupation Name Feature
# Occupation labels listed in code order: the position of each label in the
# list is its integer occupation code (0 through 20), so entries must not be
# reordered and new labels may only be appended.
occ_map = dict(enumerate([
    "other",
    "academic/educator",
    "artist",
    "clerical/admin",
    "college/grad student",
    "customer service",
    "doctor/health care",
    "executive/managerial",
    "farmer",
    "homemaker",
    "K-12 student",
    "lawyer",
    "programmer",
    "retired",
    "sales/marketing",
    "scientist",
    "self-employed",
    "technician/engineer",
    "tradesman/craftsman",
    "unemployed",
    "writer",
]))

# Look up each user's numeric occupation code in occ_map and store the
# resulting label in a new 'occ_name' column. With a dict mapper, Series.map
# yields NaN for any code that is not a key of occ_map.
users_df['occ_name'] = users_df['occ'].map(occ_map)

Unnamed: 0,user_id,gender,age,occ,zipcode,zipclean,occ_name
0,1,F,1,10,48067,48067,K-12 student
1,2,M,56,16,70072,70072,self-employed
2,3,M,25,15,55117,55117,scientist
3,4,M,45,7,02460,02460,executive/managerial
4,5,M,25,20,55455,55455,writer
...,...,...,...,...,...,...,...
6035,6036,F,25,15,32603,32603,scientist
6036,6037,F,45,1,76006,76006,academic/educator
6037,6038,F,56,1,14706,14706,academic/educator
6038,6039,F,45,0,01060,01060,other
