# Data Engineering

In [70]:
import pandas as pd
import sys
import os
sys.path.append("../")
from definitions import ROOT_DIR
import numpy as np

In [71]:
# Directory holding the interim CSVs that this notebook reads and writes.
# Components are passed separately so os.path.join inserts the
# platform's own separator instead of relying on a hard-coded '/'.
data_folder = os.path.join(ROOT_DIR, 'data', 'interim')

In [72]:
# Read the three interim CSVs from `data_folder`, in the same order as the
# target names on the left-hand side.
data, films, user = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ('data.csv', 'films.csv', 'user.csv')
)

## Preparing data

### Processing core dataset

In [73]:
# Peek at the first two rows of `data` as loaded.
data.head(n=2)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742


In [74]:
# Add an `*_emb_id` column next to each id column, equal to the id minus one.
# Presumably the raw ids are 1-based and these serve as 0-based embedding
# indices — TODO confirm against the consumer of data_processed.csv.
for id_col, emb_col in (('user_id', 'user_emb_id'), ('item_id', 'item_emb_id')):
    data[emb_col] = data[id_col].sub(1)

In [75]:
# Check that the two `*_emb_id` columns were added.
data.head(n=2)

Unnamed: 0,user_id,item_id,rating,timestamp,user_emb_id,item_emb_id
0,196,242,3,881250949,195,241
1,186,302,3,891717742,185,301


In [76]:
# Persist the processed frame next to the interim inputs; index=False keeps
# the row index out of the written file.
data_processed_path = os.path.join(data_folder, 'data_processed.csv')
data.to_csv(data_processed_path, index=False)

### Processing films

In [77]:
# Peek at the first two rows of `films` to see the metadata and genre flag columns.
films.head(n=2)

Unnamed: 0,movie_id,movie title,release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [78]:
# Collapse the one-hot genre flag columns into a single '|'-separated
# `category` string per film and keep only the leading metadata columns.
N_META_COLS = 4  # movie_id, movie title, release_date, IMDb_URL precede the genre flags

# Guard so that re-running this cell is a no-op: once the flag columns have
# been dropped, slicing from position N_META_COLS would see only `category`
# and silently overwrite it with empty strings.
if 'category' not in films.columns:
    genre_flags = films.iloc[:, N_META_COLS:]
    genre_names = genre_flags.columns  # hoisted: no need to re-slice the frame for every row
    # For each film, join the names of the genre columns whose flag equals 1.
    category = genre_flags.apply(lambda row: '|'.join(genre_names[row == 1]), axis=1)
    # .copy() so the column assignment below targets an independent frame
    # rather than a slice of the original (avoids SettingWithCopyWarning).
    films = films.iloc[:, :N_META_COLS].copy()
    films['category'] = category

In [79]:
# Check the collapsed `category` column on the first five films.
films.head(n=5)

Unnamed: 0,movie_id,movie title,release_date,IMDb_URL,category
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,Animation|Children's|Comedy
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,Action|Adventure|Thriller
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,Thriller
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,Action|Comedy|Drama
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),Crime|Drama|Thriller


In [80]:
# Persist the processed films frame; index=False keeps the row index out of
# the written file.
films_processed_path = os.path.join(data_folder, 'films_processed.csv')
films.to_csv(films_processed_path, index=False)

### Processing users

In [26]:
# Bucket `age` into labelled ranges. With right=False every bin is the
# half-open interval [lower, upper), so an age equal to an edge falls into the
# bin that starts at that edge (e.g. 18 -> '18-24').
# NOTE(review): the first bin is [0, 18), so its '0-18' label really covers
# ages 0-17 — confirm with downstream consumers before renaming it.
bins = [0, 18, 25, 35, 45, 50, 56, np.inf]
labels = ['0-18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+']

user['age_range'] = pd.cut(x=user['age'], bins=bins, labels=labels, right=False)

# A one-hot expansion of `age_range` (pd.get_dummies + pd.concat, followed by
# dropping 'age', 'age_range' and 'state') was drafted here but is disabled;
# the single categorical column is what gets saved.

In [27]:
# Check the new `age_range` column on the first five users.
user.head(n=5)

Unnamed: 0,user_id,age,gender,occupation,zip_code,age_range
0,1,24,M,technician,85711,18-24
1,2,53,F,other,94043,50-55
2,3,23,M,writer,32067,18-24
3,4,24,M,technician,43537,18-24
4,5,33,F,other,15213,25-34


In [29]:

# Occupation names in code order: each name's position in this list is the
# integer code assigned to it.
OCCUPATION_NAMES = [
    "administrator", "artist", "doctor", "educator",
    "engineer", "entertainment", "executive",
    "healthcare", "homemaker", "lawyer", "librarian", "marketing",
    "none", "other", "programmer", "retired", "salesman",
    "scientist", "student", "technician", "writer",
]

# Lookup from occupation name to its integer code (0..20).
OCCUPATIONS = {name: code for code, name in enumerate(OCCUPATION_NAMES)}

# NOTE: despite its name, `occ_desc` holds the integer code, not a description.
# Plain dict indexing is used on purpose: an occupation missing from the
# lookup raises KeyError instead of producing a missing value.
user['occ_desc'] = user['occupation'].apply(OCCUPATIONS.__getitem__)

In [30]:
# Check the new `occ_desc` code column on the first two users.
user.head(n=2)

Unnamed: 0,user_id,age,gender,occupation,zip_code,age_range,occ_desc
0,1,24,M,technician,85711,18-24,19
1,2,53,F,other,94043,50-55,13


In [32]:
# Persist the processed user frame; index=False keeps the row index out of
# the written file.
user_processed_path = os.path.join(data_folder, 'user_processed.csv')
user.to_csv(user_processed_path, index=False)