### MovieLens Dataset

In [None]:
!rm ml-100k.zip
!rm -rf ml-100k
!wget -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

In [None]:
import pandas as pd 
# Paths of the three tables read in the cells below (user table, item table, ratings).
user_path, item_path, user_item = (
    './ml-100k/u.user',
    './ml-100k/u.item',
    './ml-100k/u.data',
)

### Observe user data 
* 913 users within this dataset 
* Each row contains user id, age, gender, occupation and zip code, separated by '|'

In [None]:
# Column names are supplied explicitly; with `names=` pandas treats the first line of
# the file as data rather than as a header.
user_columns = ['uid', 'age', 'gender', 'occupation', 'zipcode']
user_df = pd.read_csv(user_path, sep='|', names=user_columns)
user_df.head()

In [None]:
# Print the dtype and non-null count of every column in the user table.
user_df.info()

### Observe item datasets 
* 1681 films 
* Each row holds an item id, title, release date, video release date, IMDb URL, and 19 columns indicating the genres the film belongs to
* genres include - 'unknown','Action', 'Adventure', 'Animation', 'Childrens' , 'Comedy' , 'Crime','Documentary','Drama' ,'Fantasy' , 'Film-Noir' , 'Horror' , 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'

In [None]:
# Names of the genre columns; they are appended to the item column names when the
# item table is read and reused for every per-genre aggregation.
genres = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]


# Five descriptive columns followed by one column per genre; the file is read as ISO-8859-1.
item_columns = ['iid', 'title', 'release_date', 'video_release_date', 'imdb url'] + genres
item_df = pd.read_csv(item_path, sep='|', names=item_columns, encoding="ISO-8859-1")



In [None]:
# Print the dtype and non-null count of every column in the item table.
item_df.info()

### Observe rating dataset
* 90640 entries 
* Each row contains user id, item id, rating and the timestamp at which the rating was given

In [None]:
# u.data is a tab-separated list of `user id | item id | rating | timestamp`
# (MovieLens 100K README), so the user id is the FIRST column. Naming the columns in
# file order is what makes the later merges on 'iid' (item table) and 'uid' (user
# table) join each rating to the right movie and the right user.
ui_interaction = pd.read_csv(user_item, names=['uid', 'iid', 'rating', 'timestamp'], sep='\t')
ui_interaction

In [None]:
# Per-user chronological hold-out: percentile-rank each user's ratings by timestamp and
# flag the latest 10% (rank > 0.9) as test rows.
# Note: the next cell reassigns train_data / test_data using a global timestamp cut-off.
ranks = ui_interaction.groupby('uid')['timestamp'].rank(pct=True, method='first')
holdout_flag = (ranks > 0.9).rename('holdout')
data = ui_interaction.join(holdout_flag)
is_holdout = data['holdout']
test_data = data.loc[is_holdout].drop(columns='holdout')
train_data = data.loc[~is_holdout].drop(columns='holdout')

In [None]:
# Global chronological split on the rating timestamp (seconds since the Unix epoch).
# This replaces whatever train_data / test_data were assigned before this cell.
# 888700934 -> 1998-02-28 21:22:14 UTC: ratings up to and including it are training data.
TRAIN_END_TS = 888700934
# 893286638 -> 1998-04-22 23:10:38 UTC: inclusive upper bound of the test window.
TEST_END_TS = 893286638

timestamps = ui_interaction['timestamp']
train_data = ui_interaction[timestamps <= TRAIN_END_TS]
test_data = ui_interaction[(timestamps > TRAIN_END_TS) & (timestamps <= TEST_END_TS)]

# Ratings strictly greater than this value pass the `rating > rating_threshold` filters
# used by the aggregation cells.
rating_threshold = 3

### Propensity Score For Bias Correction 

In [None]:


# Attach movie metadata (on 'iid') and user demographics (on 'uid') to every training
# rating. Both merges use pandas' default inner join, so ratings without a match are dropped.
user_item_df = (
    train_data
    .merge(item_df, on=['iid'])
    .merge(user_df, on=['uid'])
)
user_item_df.head()


In [None]:
# Same enrichment for the test ratings: movie metadata on 'iid', then user
# demographics on 'uid' (default inner joins).
test_user_item_df = (
    test_data
    .merge(item_df, on=['iid'])
    .merge(user_df, on=['uid'])
)
test_user_item_df.head()

In [None]:
# Print the dtype and non-null count of every column in the merged training table.
user_item_df.info()

### Observe - Gender vs. Genre 

In [None]:
import numpy 
import matplotlib.pyplot as plt
import seaborn as sns

def plot_heat_map(df, figsize=(10,7)):
    """Draw a heatmap of ``df`` after scaling every row to sum to 1.

    Parameters
    ----------
    df : DataFrame
        Numeric table; each row is divided by its own row total before plotting.
    figsize : tuple
        Size passed to ``plt.subplots`` for the new figure.
    """
    row_totals = df.sum(axis=1)
    row_shares = df.div(row_totals, axis=0)
    _, ax = plt.subplots(figsize=figsize)
    sns.heatmap(row_shares, ax=ax)

# Sum the genre flags of every rating above `rating_threshold`, per gender, and plot
# each gender's genre mix (plot_heat_map normalises each row).
high_rated = user_item_df['rating'] > rating_threshold
gender = user_item_df.loc[high_rated, ['gender'] + genres].groupby(['gender']).sum()
plot_heat_map(gender, figsize=(10, 2))
    

### Observe - Occupation vs. Genre 

In [None]:
# Same aggregation as for gender, grouped by occupation instead.
high_rated = user_item_df['rating'] > rating_threshold
occupation = user_item_df.loc[high_rated, ['occupation'] + genres].groupby(['occupation']).sum()
plot_heat_map(occupation, figsize=(10, 7))

### Observe - Age vs. Genre 

In [None]:
# Bucket ages into decades (integer division by 10, e.g. 23 -> 2) on both splits.
user_item_df['age_segment'] = user_item_df['age'].floordiv(10)
test_user_item_df['age_segment'] = test_user_item_df['age'].floordiv(10)

# Genre mix of the ratings above `rating_threshold`, per age bucket.
high_rated = user_item_df['rating'] > rating_threshold
age = user_item_df.loc[high_rated, ['age_segment'] + genres].groupby(['age_segment']).sum()
plot_heat_map(age, figsize=(10, 5))

In [None]:
# First three characters of the zip code, via the vectorised string accessor instead
# of a Python-level lambda per row.
user_item_df['loc'] = user_item_df['zipcode'].str[:3]
test_user_item_df['loc'] = test_user_item_df['zipcode'].str[:3]

# Genre mix of the ratings above `rating_threshold`, per zip prefix.
high_rated = user_item_df['rating'] > rating_threshold
loc = user_item_df.loc[high_rated, ['loc'] + genres].groupby(['loc']).sum()
plot_heat_map(loc, figsize=(10, 10))

In [None]:
from datetime import datetime

def to_year(x):
    """Return the year of a ``DD-Mon-YYYY`` date string, or ``None`` if it cannot be parsed.

    Parameters
    ----------
    x : str or any
        Date such as ``"01-Jan-1995"``. Non-string values (for example the ``NaN``
        pandas uses for a missing value) are treated as unparseable.

    Returns
    -------
    int or None
        The four-digit year, or ``None`` when ``x`` is not a string in that format.
    """
    try:
        return datetime.strptime(x, "%d-%b-%Y").year
    except (TypeError, ValueError):
        # TypeError: x is not a string; ValueError: x does not match the format.
        # Anything else (e.g. KeyboardInterrupt) is deliberately no longer swallowed.
        return None
# Release year of each rated movie on both splits (missing where the date cannot be parsed).
user_item_df['year'] = user_item_df['release_date'].apply(to_year)
test_user_item_df['year'] = test_user_item_df['release_date'].apply(to_year)



In [None]:
# Age bucket and release year of every training rating above `rating_threshold`.
user_item_df.loc[user_item_df['rating'] > rating_threshold, ['age_segment', 'year']]

In [None]:
# Count ratings above `rating_threshold` per (age bucket, release year) and lay the
# counts out as an age-bucket x year table.
# `.assign(count=1)` adds the counter on a new frame; assigning a column onto the
# filtered slice directly is what triggers pandas' SettingWithCopyWarning.
year = (
    user_item_df.loc[user_item_df['rating'] > rating_threshold, ['age_segment', 'year']]
    .assign(count=1)
    .groupby(['age_segment', 'year'])
    .sum()
    .reset_index()
    .pivot(index='age_segment', columns='year', values=['count'])
    .fillna(0)  # (bucket, year) pairs with no ratings
)

year

In [None]:
# Heatmap of release-year shares per age bucket (each row is normalised by plot_heat_map).
plot_heat_map(year, figsize=(10,10))

### Observe user profile 

In [None]:
# Per-user genre profile: for each uid, the sum of the genre flags over that user's
# training ratings above `rating_threshold`.
high_rated = user_item_df['rating'] > rating_threshold
user_genre_profile = (
    user_item_df.loc[high_rated, ['uid'] + genres]
    .groupby(['uid'])
    .sum()
    .reset_index()
)

# Attach the profile to both splits (default inner join on 'uid').
# NOTE(review): an inner join drops rows of users that have no profile; if those rows
# were meant to be kept and zero-filled, a left join would be needed - confirm.
total_features = user_item_df.merge(user_genre_profile, on=['uid'])
test_total_features = test_user_item_df.merge(user_genre_profile, on=['uid'])

In [None]:
# Display the merged training feature table.
total_features

In [None]:
# Replace every missing value in the test feature table with 0, then display it.
test_total_features = test_total_features.fillna(0)
test_total_features

In [None]:
# One-hot encode the categorical columns of both splits.
# Each split is encoded on its own, so the two frames only get dummy columns for the
# categories that actually occur in them.
categorical_cols = ['gender', 'occupation', 'loc']
total_features = pd.get_dummies(total_features, columns=categorical_cols)
test_total_features = pd.get_dummies(test_total_features, columns=categorical_cols)

In [None]:
!pip install xgboost 

In [None]:
import xgboost
import shap

total_features=(total_features - total_features.mean()) / total_features.std()
y = total_features['rating']
X = total_features.drop(['rating', 'iid', 'uid', 'title', 'release_date', 'imdb url', 'zipcode'], axis=1)

In [None]:



model = xgboost.XGBRegressor().fit(X, y)


In [None]:
from sklearn.metrics import mean_squared_error
import math
test_total_features = (test_total_features - total_features.mean()) / total_features.std()
test_y = test_total_features['rating']
test_X = test_total_features.drop(['rating', 'iid', 'uid', 'title', 'release_date', 'imdb url', 'zipcode'], axis=1)

y_pred = model.predict(test_X)


math.sqrt(mean_squared_error(y_pred, test_y))/test_y.mean()

In [None]:
# Build a SHAP explainer for the fitted model and evaluate it on the training matrix X.
explainer = shap.Explainer(model)
shap_values = explainer(X)

In [None]:
# Beeswarm plot of the SHAP values computed for X.
shap.plots.beeswarm(shap_values)


In [None]:
# Bar plot of the SHAP values computed for X.
shap.plots.bar(shap_values)

In [None]:
# First three characters of the zip code, via the vectorised string accessor.
# NOTE(review): `user_demo_df` below does not select 'loc', so this column does not
# reach the user profile - confirm whether it was meant to be included.
user_df['loc'] = user_df['zipcode'].str[:3]
user_demo_df = user_df[['uid', 'age', 'gender', 'occupation']]

In [None]:
# Join demographics with the per-user genre profile. This is pandas' default inner
# join, so only users present in user_genre_profile are kept.
user_raw = user_demo_df.merge(user_genre_profile, on=['uid'])


In [None]:
# One-hot encode the two categorical demographic columns.
oh_user = pd.get_dummies(user_raw, columns=['gender', 'occupation'])

In [None]:
# Replace any missing value in the one-hot user table with 0.
oh_user = oh_user.fillna(0)

In [None]:
from sklearn.decomposition import PCA
import numpy
import matplotlib.pyplot as plot
import pandas 

# Standardise every column of the user table, then drop the identifier so that it
# does not enter the PCA.
df_normalized = (oh_user - oh_user.mean()) / oh_user.std()
df_normalized = df_normalized.drop(['uid'], axis=1)

# Fix the seed: depending on the input shape, scikit-learn's default 'auto' solver can
# select the randomized SVD, whose result otherwise varies slightly from run to run.
pca = PCA(n_components=5, random_state=0)
principal_components = pca.fit_transform(df_normalized)

# Share of the total variance captured by each of the five components.
print(pca.explained_variance_ratio_)
plot.plot(pca.explained_variance_ratio_)
plot.ylabel('Explained Variance')
plot.xlabel('Components')
plot.show()


In [None]:
# Wrap the PCA scores in a DataFrame with one named column per component (1 to 5).
component_names = [f'principal component {i}' for i in range(1, 6)]
principalDf = pd.DataFrame(data=principal_components, columns=component_names)


In [None]:
# Scatter of the users on the first two principal components.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(
    principalDf['principal component 1'],
    principalDf['principal component 2'],
    s=50,
)
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.grid()

In [None]:
# Place each user's id next to the five component scores; axis=1 concatenates the
# columns side by side, matching rows by index.
user_principal = pd.concat([oh_user[['uid']], principalDf], axis=1)
user_principal

In [None]:
# Written with pandas' default index=True, so the CSV starts with an unnamed index column.
user_principal.to_csv('user_principal.csv')

In [None]:
# Persist the merged training table, the item table and the one-hot user table as pickles.
# Note: the file named "user_df.p" holds `oh_user`, not the raw `user_df`.
user_item_df.to_pickle("user_item_df.p")
item_df.to_pickle("item_df.p")
oh_user.to_pickle("user_df.p")

In [None]:
# Write both rating splits next to the raw data, without the DataFrame index.
split_exports = (
    (train_data, './ml-100k/train.csv'),
    (test_data, './ml-100k/test.csv'),
)
for split_frame, csv_path in split_exports:
    split_frame.to_csv(csv_path, index=None)

In [None]:
# Persist rating_threshold with IPython's %store so another notebook can restore it via `%store -r`.
%store rating_threshold