In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each cell runs,
# so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
# Import Custom Modules
from src.helpers import *

In [4]:
# Read the training ratings and discard the timestamp column
ratings_df = (
    pd.read_csv('data/training.csv')
      .drop('timestamp', axis=1)
)

# Report (rows, columns), then preview the first five rows
print(ratings_df.shape)
ratings_df.head(5)

(800000, 3)


Unnamed: 0,user,movie,rating
0,6040,858,4
1,6040,593,5
2,6040,2384,4
3,6040,1961,4
4,6040,2019,5


In [5]:
# Count missing values in each ratings column
ratings_df.isna().sum()

user      0
movie     0
rating    0
dtype: int64

In [6]:
# Read the movie file: '::'-separated fields, no header row, so column names are supplied here.
# The multi-character separator is why the python parsing engine is requested.
movies_df = pd.read_csv(
    'data/movies.dat',
    sep="::",
    header=None,
    names=['movie_id', 'movie_title', 'genre'],
    engine='python',
)

# Compare the number of distinct movies in the ratings with the size of the loaded table
print(f"Original: {ratings_df['movie'].unique().shape} Load: {movies_df.shape}")
movies_df.head(2)

Original: (3662,) Load: (3883, 3)


Unnamed: 0,movie_id,movie_title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [7]:
# Optional sanity checks on movies_df (currently disabled; uncomment to run):
# pd.isnull(movies_df).sum()

# movies_df.info()

In [8]:
# Load Users DAT
# The file is '::'-separated with no header row, so column names are supplied here.
# Zip codes are identifiers, not numbers: read them as text explicitly rather than relying on
# type inference, so leading zeros are preserved and the later '-' split on this column
# always receives strings.
users_df = pd.read_csv('data/users.dat',
                        names=['user_id', 'gender', 'age', 'occ', 'zipcode'],
                        dtype={'zipcode': str},
                        engine='python',
                        delimiter="::",
                        header=None) 

# Compare the number of distinct users in the ratings with the size of the loaded table
print(f"Original: {ratings_df['user'].unique().shape} Load: {users_df.shape}")
users_df.head(2)

Original: (5399,) Load: (6040, 5)


Unnamed: 0,user_id,gender,age,occ,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072


## Create Movie Features

In [9]:
# Build the integer `year` column from each title.
# getReMax comes from src.helpers; presumably it extracts the release year from the title text -- confirm.
movies_df['year'] = movies_df['movie_title'].map(lambda title: int(getReMax(title)))

In [10]:
# Remove Genre Pipe
# Swap each '|' for a space using the vectorised string accessor.
# regex=False makes '|' a literal character (as a regex it would mean alternation),
# and missing genre values pass through as NaN instead of raising.
movies_df['genre'] = movies_df['genre'].str.replace("|", " ", regex=False)

## Create User Features

In [11]:
# Create Clean Zipcode
# Keep only the text before the first '-' in each zip code.
# str.partition returns three columns (before, separator, after); column 0 is the part we want.
# Missing zip codes stay NaN rather than raising.
users_df['zipclean'] = users_df['zipcode'].str.partition('-')[0]

In [12]:
# Create Occupation Name Feature
# Occupation labels listed in code order; enumerate() assigns them the integer codes 0 through 20.
occ_map = dict(enumerate((
    "other",
    "academic/educator",
    "artist",
    "clerical/admin",
    "college/grad student",
    "customer service",
    "doctor/health care",
    "executive/managerial",
    "farmer",
    "homemaker",
    "K-12 student",
    "lawyer",
    "programmer",
    "retired",
    "sales/marketing",
    "scientist",
    "self-employed",
    "technician/engineer",
    "tradesman/craftsman",
    "unemployed",
    "writer",
)))

# Look up each user's numeric `occ` code in occ_map and store the resulting label as `occ_name`
users_df['occ_name'] = users_df['occ'].map(occ_map)

## Export Data as CSV for Future Use

In [None]:
# Export Ratings DF
# to_csv does not create missing directories, so make sure the output folder exists first
# (a no-op when it is already there).
Path('cleaned_data').mkdir(parents=True, exist_ok=True)
ratings_df.to_csv('cleaned_data/ratings.csv', index=False)

In [None]:
# Write the movies table, including the engineered columns, to CSV without the row index
movies_df.to_csv(path_or_buf='cleaned_data/movies.csv', index=False)

In [None]:
# Write the users table, including the engineered columns, to CSV without the row index
users_df.to_csv(path_or_buf='cleaned_data/users.csv', index=False)