In [49]:
import os
import pandas as pd
import numpy as np
from time import time
from datetime import datetime, timedelta
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pickle

# Dataset split prefixes. Not referenced in the cells visible here.
# NOTE(review): presumably the predefined MovieLens train/test split names — confirm or remove.
sets = ["u1", "u2", "u3", "u4", "u5", "ua", "ub"]
# Directory the raw u.data / u.item / u.user files are read from.
RAW_PATH = "../data/raw/"
# Directory the processed pickles and CSVs are written to.
INTERIM_PATH = "../data/interim/"
# Random seed passed to train_test_split so the split is reproducible.
SEED = 42

import warnings
# Silences every warning for the rest of the session, including pandas
# chained-assignment and scikit-learn data-conversion warnings.
warnings.filterwarnings("ignore")

# Read Data

In [50]:
# Column names are supplied explicitly for each raw file.
data_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
item_cols = [
    # movie metadata
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
    # one column per genre
    'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror',
    'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western',
]
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Ratings file is tab-separated.
data = pd.read_csv(
    os.path.join(RAW_PATH, "u.data"),
    sep="\t",
    names=data_cols,
    encoding='latin-1',
)

# Movie and user files are pipe-separated.
items = pd.read_csv(
    os.path.join(RAW_PATH, "u.item"),
    sep="|",
    names=item_cols,
    encoding='latin-1',
)
users = pd.read_csv(
    os.path.join(RAW_PATH, "u.user"),
    sep="|",
    names=user_cols,
    encoding='latin-1',
)

In [51]:
# Build one row per rating: movie metadata joined to ratings on movie_id,
# then user demographics attached on user_id (the only shared columns).
df = (
    items
    .merge(data, how="inner", on="movie_id")
    .merge(users, how="inner", on="user_id")
)

# Create Wide DataFrame

In [52]:
# Cross-product feature transformation: combine gender and occupation into a
# single categorical value and one-hot encode it for the wide part of the model.

# .copy() makes df_wide an independent frame, so adding a column below does not
# write to a slice of df (chained assignment / SettingWithCopyWarning).
df_wide = df[['gender', 'occupation']].copy()
df_wide['gender_occupation'] = df_wide['gender'] + "_" + df_wide['occupation']

encoder = OneHotEncoder(handle_unknown='ignore')
one_hot_matrix = encoder.fit_transform(df_wide[['gender_occupation']]).toarray()

# Reuse df_wide's index so the join below aligns row-for-row even if df ever
# carries a non-default index.
one_hot_encoded_df = pd.DataFrame(
    one_hot_matrix,
    columns=encoder.get_feature_names_out(),
    index=df_wide.index,
)

# Keep only the one-hot columns; the source columns are no longer needed.
df_wide = (
    df_wide
    .join(one_hot_encoded_df)
    .drop(columns=['gender', 'occupation', 'gender_occupation'])
)

In [53]:
df_wide.head()

Unnamed: 0,gender_occupation_F_administrator,gender_occupation_F_artist,gender_occupation_F_educator,gender_occupation_F_engineer,gender_occupation_F_entertainment,gender_occupation_F_executive,gender_occupation_F_healthcare,gender_occupation_F_homemaker,gender_occupation_F_lawyer,gender_occupation_F_librarian,...,gender_occupation_M_marketing,gender_occupation_M_none,gender_occupation_M_other,gender_occupation_M_programmer,gender_occupation_M_retired,gender_occupation_M_salesman,gender_occupation_M_scientist,gender_occupation_M_student,gender_occupation_M_technician,gender_occupation_M_writer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Create Deep DataFrame

In [54]:
# Genre columns carried over from u.item (presumably 0/1 indicator flags).
genre_cols = ['unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy',
              'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror',
              'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']

# .copy() makes df_deep an independent frame, so the column assignments below
# do not write to a slice of df (chained assignment / SettingWithCopyWarning).
df_deep = df[['age'] + genre_cols + ['gender', 'occupation']].copy()

# Combine sparse categorical features into one single genre feature.
# idxmax returns the first column holding the row maximum, so a row with several
# equal maxima is assigned the earliest such column in genre_cols order.
df_deep['genre'] = df_deep[genre_cols].idxmax(axis=1)
df_deep = df_deep.drop(columns=genre_cols)

# NOTE(review): the encoders and scaler below are fit on the full dataset,
# before the train/test split performed in a later cell.

# Encode categorical features as integer codes.
for feature in ['gender', 'occupation', 'genre']:
    encoder = LabelEncoder()
    # LabelEncoder expects 1-D input, so pass the Series rather than a
    # single-column DataFrame.
    transformed_feature = encoder.fit_transform(df_deep[feature])
    df_deep[feature] = transformed_feature

# Min-max scaling for numerical features.
for feature in ['age']:
    scaler = MinMaxScaler()
    # The scaler works on 2-D input and returns shape (n, 1); flatten it for
    # the column assignment.
    transformed_feature = scaler.fit_transform(df_deep[[feature]]).ravel()
    df_deep[feature] = transformed_feature

In [55]:
df_deep.head()

Unnamed: 0,age,gender,occupation,genre
0,0.80303,1,15,2
1,0.80303,1,15,0
2,0.80303,1,15,5
3,0.80303,1,15,7
4,0.80303,1,15,3


In [56]:
# Assemble features (wide one-hot columns followed by the deep columns) and the
# rating target, then hold out 20% of the rows for testing.
X = pd.concat([df_wide, df_deep], axis="columns")
y = df[['rating']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

def prepare_independent_variables(X, wide_columns=None):
    """Split X dataframe into its separate input components for the neural network.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the wide columns plus the 'gender',
        'occupation', 'genre' and 'age' columns.
    wide_columns : sequence of column labels, optional
        Columns that make up the wide input. When omitted, falls back to the
        columns of the notebook-level ``df_wide`` frame (the original behaviour).

    Returns
    -------
    tuple of numpy.ndarray
        ``(wide_inputs, gender, occupation, genre, age)``. The wide array has
        one column per wide feature; each of the other four has shape (n, 1).
    """
    if wide_columns is None:
        # Looked up at call time so the notebook global is only required when
        # the caller does not pass the columns explicitly.
        wide_columns = df_wide.columns

    wide_inputs = X[list(wide_columns)].values
    # Double brackets keep each single feature 2-D: shape (n, 1).
    categorical_input_1 = X[['gender']].values
    categorical_input_2 = X[['occupation']].values
    categorical_input_3 = X[['genre']].values
    numerical_input = X[['age']].values
    return (wide_inputs, categorical_input_1, categorical_input_2,
            categorical_input_3, numerical_input)

# Break the training features into the five arrays the network consumes.
(
    wide_inputs_train,
    categorical_input_1_train,
    categorical_input_2_train,
    categorical_input_3_train,
    numerical_input_train,
) = prepare_independent_variables(X_train)

# Same decomposition for the held-out features.
(
    wide_inputs_test,
    categorical_input_1_test,
    categorical_input_2_test,
    categorical_input_3_test,
    numerical_input_test,
) = prepare_independent_variables(X_test)

# Targets as plain NumPy arrays.
y_train, y_test = y_train.values, y_test.values

In [57]:
# Create the output directory if needed so the writes below do not fail on a
# fresh checkout.
os.makedirs(INTERIM_PATH, exist_ok=True)

# File name -> object to pickle. Feature files hold the five input arrays in
# the order returned by prepare_independent_variables.
pickle_outputs = {
    'wide_and_deep_X_train.pkl': [wide_inputs_train, categorical_input_1_train,
                                  categorical_input_2_train, categorical_input_3_train,
                                  numerical_input_train],
    'wide_and_deep_X_test.pkl': [wide_inputs_test, categorical_input_1_test,
                                 categorical_input_2_test, categorical_input_3_test,
                                 numerical_input_test],
    'wide_and_deep_Y_train.pkl': y_train,
    'wide_and_deep_Y_test.pkl': y_test,
}
for file_name, payload in pickle_outputs.items():
    with open(os.path.join(INTERIM_PATH, file_name), 'wb') as f:
        pickle.dump(payload, f)

# Encoded frames are also saved as CSV; the index is written as the first column.
df_wide.to_csv(os.path.join(INTERIM_PATH, 'wide_and_deep_df_wide.csv'))
df_deep.to_csv(os.path.join(INTERIM_PATH, 'wide_and_deep_df_deep.csv'))

# Ratings without the timestamp column, saved under the name 'svd_dataset.csv'.
data.drop(["timestamp"], axis=1).to_csv(os.path.join(INTERIM_PATH, 'svd_dataset.csv'))