In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Users table: pipe-separated file, column names supplied here
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
    parse_dates=True,
)

# Ratings table: tab-separated file, one row per (user, movie) rating
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Movies table: five descriptive columns followed by 19 genre indicator columns
m_cols = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=m_cols,
    usecols=range(24),
    encoding='latin-1',
)

# With no explicit key, merge joins on the columns the two frames share:
# movie_id for movies/ratings, then user_id for the result and users.
movie_ratings = movies.merge(ratings)
df = movie_ratings.merge(users)

print(df.shape)
print(df.columns)

(100000, 31)
Index(['movie_id', 'movie_title', 'release_date', 'video_release_date',
       'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western', 'user_id', 'rating', 'unix_timestamp', 'age', 'sex',
       'occupation', 'zip_code'],
      dtype='object')


In [3]:
# Drop the columns that are not used as features:
# video_release_date, imdb_url and movie_title
for unused_column in ("video_release_date", "imdb_url", 'movie_title'):
    df.drop(columns=unused_column, inplace=True)

In [4]:
# Remove every row whose release_date is missing (NaN)
missing_release = df['release_date'].isnull()
df.drop(index=df.index[missing_release], inplace=True)

In [5]:
# Bin age into ten-year intervals; the bin edges are 0, 10, ..., 80
age_bin_edges = np.arange(0, 90, 10)
df['age'] = pd.cut(df['age'], bins=age_bin_edges)

In [6]:
# Reduce each zip code to its first character
df['zip_code'] = df['zip_code'].str.get(0)

In [7]:
# format release_date
import calendar
import datetime

def date_to_unix(data):
    """Convert a ``DD-Mon-YYYY`` date string (e.g. ``"01-Jan-1995"``) to Unix seconds.

    The date is taken as midnight UTC of the given day.

    Parameters
    ----------
    data : str
        Day of month, abbreviated month name and four-digit year, separated
        by hyphens. Leading and trailing whitespace is ignored.

    Returns
    -------
    int
        Seconds since 1970-01-01 00:00:00 UTC (negative for earlier dates).

    Raises
    ------
    ValueError
        If ``data`` does not match the ``%d-%b-%Y`` format.
    """
    # A single strptime call parses and validates day, month and year together,
    # so a malformed string fails with a descriptive ValueError.
    parsed = datetime.datetime.strptime(data.strip(), "%d-%b-%Y")
    # timegm interprets the time tuple as UTC, so the result does not depend
    # on the machine's local time zone.
    utc_time = calendar.timegm(parsed.utctimetuple())
    return utc_time

# Replace each release_date string with the value returned by date_to_unix
release_unix = df['release_date'].apply(date_to_unix)
df['release_date'] = release_unix

In [8]:
# TODO: get mean_rating (no code in this cell yet)

In [9]:
## Set up df_training, df_testing and group_shapes
## df_testing holds only the rating column; df_training and group_shapes
## start empty and are filled in by the cells below.

df_training, group_shapes = pd.DataFrame(), list()
df_testing = df[['rating']]

In [10]:
# First, one-hot encode user_id, movie_id, sex, age, occupation and zip_code.
# All indicator frames are built up front and concatenated in one call, so the
# wide feature frame is copied once rather than once per encoded column.
ohe_columns = ['user_id', 'movie_id', 'sex', 'age', 'occupation', 'zip_code']
ohe_frames = [pd.get_dummies(df[column], prefix=column) for column in ohe_columns]

# ignore_index=True along axis=1 relabels the resulting columns as 0..N-1
df_training = pd.concat([df_training, *ohe_frames], axis=1, ignore_index=True)

# Record how many indicator columns each encoded column produced, in order
group_shapes.extend(len(frame.columns) for frame in ohe_frames)


In [11]:
# The genre columns are already 0/1 indicators, so they are appended as they are
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
genre_frame = df[genre_columns]
df_training = pd.concat([df_training, genre_frame], axis=1, ignore_index=True)
# The genres count as one more group in group_shapes
group_shapes.append(len(genre_columns))


In [12]:
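# Then append the non-ohe columns (everything except the ohe, genre and rating columns,
# i.e. release_date and unix_timestamp).
# NOTE: disabled for now -- the algorithm performs very badly when these columns are included.
# Likely cause (unverified): both are raw Unix-second values, far larger in magnitude than the
# 0/1 indicator features, and they are not scaled. TODO: confirm, then normalise and re-enable.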
# non_ohe_columns = df.loc[:, ~df.columns.isin(ohe_columns + genre_columns + ['rating'])].columns.tolist()
# df_training = pd.concat([df_training, df[non_ohe_columns]], axis=1, ignore_index=True)
# group_shapes.extend([1 for column in non_ohe_columns])


In [13]:
# Show the shape of both frames, then the per-group column counts
for summary in (df_training.shape, df_testing.shape, group_shapes):
    print(summary)

(99991, 2693)
(99991, 1)
[943, 1681, 2, 8, 21, 19, 19]


In [14]:
# Write group_shapes to the file "group_shapes"
np.savetxt("group_shapes", group_shapes, fmt='% s', delimiter=", ")

In [15]:
# Write df_training to 'x_data' and df_testing to 'y_data' as CSV,
# without a header row and without the index column
for frame, out_path in ((df_training, 'x_data'), (df_testing, 'y_data')):
    frame.to_csv(out_path, header=False, index=False)