In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import datetime
import json 
from collections import OrderedDict

In [2]:
# get mean rating for the user x
def get_mean_rating(x, mean_rating):
    """Return the mean rating stored for user ``x`` as a plain ``float``.

    Parameters
    ----------
    x : user id to look up.
    mean_rating : DataFrame with a ``user_id`` column and a ``mean_rating`` column.

    Raises
    ------
    IndexError
        If no row of ``mean_rating`` has ``user_id == x``.
    """
    is_user = mean_rating['user_id'] == x
    matches = mean_rating.loc[is_user, 'mean_rating']
    # only the first matching row is used
    return float(matches.to_numpy()[0])

In [3]:
# format release date
def date_obj_to_datetime(data):
    """Parse a ``DD-Mon-YYYY`` release-date string into a ``datetime``.

    Parameters
    ----------
    data : str
        Date text such as ``"01-Jan-1995"``: day of month, abbreviated month
        name and four-digit year, separated by dashes. Surrounding whitespace
        is ignored.

    Returns
    -------
    datetime.datetime
        Midnight of the given day (no timezone attached).

    Raises
    ------
    ValueError
        If ``data`` does not match the ``DD-Mon-YYYY`` layout.
    """
    # "%b" matches the abbreviated month name under the current locale.
    return datetime.datetime.strptime(data.strip(), "%d-%b-%Y")

In [4]:
# format timestamp
def unix_to_datetime(data):
    """Truncate a Unix timestamp to midnight of its calendar day in UTC.

    The timestamp is interpreted in UTC rather than in the local timezone of
    the machine running the notebook, so the same input always maps to the
    same day regardless of where the preprocessing is executed.

    Parameters
    ----------
    data : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    datetime.datetime
        Naive datetime carrying only year, month and day (time is 00:00:00).
    """
    moment = datetime.datetime.fromtimestamp(data, tz=datetime.timezone.utc)
    # drop the time-of-day and tzinfo so that ratings made on the same day compare equal
    return datetime.datetime(moment.year, moment.month, moment.day)

In [5]:
# import user, data, item and genre data
path = 'dataset/original/'

# users: pipe-separated, column names supplied here
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(path + 'ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')

# ratings: tab-separated, column names supplied here
d_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
data = pd.read_csv(path + 'ml-100k/u.data', sep='\t', names=d_cols, encoding='latin-1')

# movies: pipe-separated; only the first 24 fields are read (ids, titles, dates, url, genre flags)
m_cols = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movies = pd.read_csv(path + 'ml-100k/u.item', sep='|', names=m_cols, usecols=range(24), encoding='latin-1')

# NOTE(review): this file is read with pandas' default separator and header
# handling, so its first line is consumed as the header and is not counted by
# len(genre) below -- confirm that this is the intended genre count.
genre = pd.read_csv(path + 'ml-100k/u.genre')

# merge user, data and item (inner joins on the single column each pair shares)
movie_ratings = pd.merge(movies, data, on='movie_id')
df = pd.merge(movie_ratings, users, on='user_id')

# for storing the column length for each unique features:
# every feature starts at width 1; "genre" is set from the genre table here and
# the one-hot encoded features are overwritten in the encoding cells.
final_columns = [
    "movie_id", "user_id", "timestamp", "release_date", "unknown", "genre",
    "sex", "cum_mean_rate", "mean_rate", "zip_code", "age", "occupation",
]
col_len = OrderedDict((col, 1) for col in final_columns)
col_len["genre"] = len(genre)

In [6]:
# dropping the rows that have no release date
# (date_obj_to_datetime cannot parse a missing value)
df = df.dropna(subset=['release_date'])

In [7]:
# calculating cumulative mean rating based on timestamp
# order every user's ratings chronologically and add a helper column of ones
data_t = (
    data
    .sort_values(['user_id', 'timestamp'])
    .reset_index(drop=True)
    .assign(one=1)
)
# 'cumsum' = how many ratings the user has given up to and including this row
data_t['cumsum'] = data_t.groupby('user_id')['one'].cumsum()
# running total of the user's ratings (current row included) over that count
running_total = data_t.groupby('user_id')['rating'].cumsum()
data_t['cum_mean_rate'] = running_total / data_t['cumsum']

In [8]:
# calculating mean rating for each user and adding to the original dataframe
# Result: one row per user with columns 'user_id' and 'mean_rating'.
# The column is named explicitly instead of renaming the implicit label 0 that
# reset_index() assigns to an unnamed Series.
mean_rating = (
    data_t
    .groupby('user_id')['rating']
    .mean()
    .rename('mean_rating')
    .reset_index()
)

In [9]:
# attach each user's overall mean rating to every one of their rating rows;
# a single index-based lookup replaces filtering the whole mean_rating table
# once per row
user_mean = mean_rating.set_index('user_id')['mean_rating']
data_t['mean_rate'] = data_t['user_id'].map(user_mean)

In [10]:
# merging to the original dataframe
# a rating row is identified by these four columns in both frames
merge_keys = ['timestamp', 'user_id', 'movie_id', 'rating']
df = df.merge(data_t, how='left', on=merge_keys)

In [11]:
# format release_date
# turn every release-date string into a datetime via date_obj_to_datetime
release_dates = df['release_date']
df['release_date'] = release_dates.apply(date_obj_to_datetime)

In [12]:
# format timestamp
# reduce every Unix timestamp to the day it falls on via unix_to_datetime
rating_times = df['timestamp']
df['timestamp'] = rating_times.apply(unix_to_datetime)

In [13]:
# format age
# bin edges 0, 10, ..., 80 and one "lo-hi" label for each pair of adjacent edges
age_bins = np.arange(0, 90, 10)
age_labels = [
    str(lower) + '-' + str(upper)
    for lower, upper in zip(age_bins[:-1], age_bins[1:])
]

In [14]:
# convert ages to group of age ranges
# each age is replaced by the label of the bin it falls into;
# ages outside the bin edges become missing values
age_groups = pd.cut(df['age'], bins=age_bins, labels=age_labels)
df['age'] = age_groups

In [15]:
# format zip code
# keep only the first character of each zip code
df['zip_code'] = df['zip_code'].str.get(0)

In [16]:
# Handling categorical column
# encode sex as 1 for 'M' and 0 otherwise
# NOTE: re-running this cell once 'sex' is already 0/1 would set every value to 0
df['sex'] = (df['sex'] == 'M').astype(int)

# one-hot encode the remaining categorical features
categorical_columns = ["zip_code", "age", "occupation"]
dummy_frames = []
for col in categorical_columns:
    df_dummies = pd.get_dummies(df[col], prefix=col)
    # record the number of indicator columns actually produced, so the stored
    # width always agrees with the exported frame (for a categorical column
    # this can exceed the number of distinct values present in the data)
    col_len[col] = df_dummies.shape[1]
    dummy_frames.append(df_dummies)

# append all indicator blocks in one pass, then drop the source columns
df = pd.concat([df] + dummy_frames, axis=1).drop(columns=categorical_columns)

In [17]:
# Preprocessing some numerical columns
# these columns are one-hot encoded as well; their indicator blocks go in
# front of the existing columns
numerical_columns = ["release_date", "timestamp", "user_id", "movie_id"]
leading_blocks = []
for col in numerical_columns:
    col_len[col] = df[col].nunique()
    df_dummies = pd.get_dummies(df[col], prefix=col)
    # a later column's block is placed before the earlier ones
    leading_blocks.insert(0, df_dummies)
df = pd.concat(leading_blocks + [df], axis=1).drop(columns=numerical_columns)

In [18]:
# dropping all unnecessary columns
# movie metadata that is not used as a feature, plus the helper columns that
# were only needed for the cumulative-mean computation
unused_columns = ["video_release_date", "imdb_url", "movie_title", "one", "cumsum"]
df = df.drop(columns=unused_columns)

In [19]:
# directory prefix (relative path, trailing slash included) that the export
# cells prepend to the output file names
export_path = 'dataset/preprocessed/'

In [44]:
# export csv file
# write the preprocessed frame without its row index
csv_file = export_path + 'ml-100k-preprocessed.csv'
df.to_csv(csv_file, index=False)

In [45]:
# saving info about data
# store the per-feature column widths next to the exported csv
info_file = export_path + "ml-100k-preprocessed.json"
with open(info_file, "w") as outfile:
    outfile.write(json.dumps(col_len))