In [1]:
import pyarrow # Please note that you need to install `pyarrow` (`pip install pyarrow`) to use Parquet IO functionalities.

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap 

# Data Overview

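> Prepare the training and testing data separately (one run of this notebook per split), and follow exactly the same steps for both. Note that PCA, t-SNE and UMAP below are re-fit on whichever split is loaded, so the reduced columns of the two splits do not share a coordinate system.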

In [2]:
# Split selector: True processes the training dataset, False the testing dataset.
process_train_data = False
file_name = "test" if not process_train_data else "train"

In [3]:
# Read the compressed CSV of the selected split, then preview its first rows.
df = pd.read_csv(
    filepath_or_buffer=f"../data/1-recommendations_merged_100000_samples-{file_name}.csv.gz",
)
df.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,title,date_release,...,tag_Well-Written,tag_Werewolves,tag_Western,tag_Wholesome,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_eSports
0,359320,0,0,2019-07-01,True,68.1,13070481,8088480,Elite Dangerous,2015-04-02,...,0,0,0,0,0,0,0,0,0,0
1,471710,0,0,2021-11-27,True,5.0,12556062,38525173,Rec Room,2021-09-02,...,0,0,0,0,0,0,0,0,0,0
2,200210,0,0,2016-08-01,True,3.0,5376357,12799785,Realm of the Mad God Exalt,2012-02-20,...,0,0,0,0,0,0,0,0,0,0
3,307640,10,3,2017-10-22,False,4.6,7432800,35803840,Stick RPG 2: Director's Cut,2014-06-16,...,0,0,0,0,0,0,0,0,0,0
4,234190,21,0,2014-02-28,True,25.4,13804196,37668065,Receiver,2013-04-29,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Print the frame summary; verbose=True requests the full per-column listing.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 462 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    app_id                                 int64  
 1    helpful                                int64  
 2    funny                                  int64  
 3    date                                   object 
 4    is_recommended                         bool   
 5    hours                                  float64
 6    user_id                                int64  
 7    review_id                              int64  
 8    title                                  object 
 9    date_release                           object 
 10   win                                    bool   
 11   mac                                    bool   
 12   linux                                  bool   
 13   rating                                 object 
 14   positive_ratio                      

# Drop Unnecessary Columns

In [5]:
# Drop the columns this notebook treats as unnecessary.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns already removed.
# NOTE(review): errors="ignore" also hides a column that is missing from the input file.
df = df.drop(
    columns=["app_id", "review_id", "user_id", "title", "description"],
    errors="ignore",
)

# Common Data Type Conversion

In [6]:
# Datetime format: parse the date columns and store them as integer epoch timestamps.
# "int64" is spelled out because plain "int" resolves to a 32-bit integer on some
# platforms (e.g. Windows with NumPy < 2), and pandas refuses to cast datetime64
# data to anything but int64.
df['date'] = pd.to_datetime(df['date']).astype("int64")
df['date_release'] = pd.to_datetime(df['date_release']).astype("int64")
# NOTE(review): leftover experiment, kept for reference:
# df['date'] - df['date_release']

In [7]:
# Bool type: find every boolean column ...
bool_columns = df.select_dtypes(include="bool").columns
# ... and re-encode True/False as 1/0.
df[bool_columns] = df.loc[:, bool_columns].astype(int)

In [8]:
# Print the frame summary again to check the resulting dtypes (full per-column listing).
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 457 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    helpful                                int64  
 1    funny                                  int64  
 2    date                                   int64  
 3    is_recommended                         int64  
 4    hours                                  float64
 5    date_release                           int64  
 6    win                                    int64  
 7    mac                                    int64  
 8    linux                                  int64  
 9    rating                                 object 
 10   positive_ratio                         int64  
 11   user_reviews                           int64  
 12   price_final                            float64
 13   price_original                         float64
 14   discount                            

# Ordinal Features Processing

In [9]:
# Distinct rating labels present in this split.
df["rating"].unique()

array(['Mostly Positive', 'Very Positive', 'Overwhelmingly Positive',
       'Mixed', 'Positive', 'Mostly Negative', 'Overwhelmingly Negative',
       'Negative'], dtype=object)

In [10]:
# Rating column before encoding.
df["rating"]

0        Mostly Positive
1          Very Positive
2          Very Positive
3          Very Positive
4          Very Positive
              ...       
19995      Very Positive
19996      Very Positive
19997    Mostly Positive
19998    Mostly Positive
19999      Very Positive
Name: rating, Length: 20000, dtype: object

In [11]:
# Rating labels listed from worst to best; a label's position in this list becomes
# its encoded integer value.
rating_order = [
    'Overwhelmingly Negative',
    'Very Negative',
    'Negative',
    'Mostly Negative',
    'Mixed',
    'Mostly Positive',
    'Positive',
    'Very Positive',
    'Overwhelmingly Positive',
]
encoder = OrdinalEncoder(categories=[rating_order])
# The encoder works on a 2-D selection, hence the single-column frame.
df['rating'] = encoder.fit_transform(df.loc[:, ['rating']]).astype(int)

In [12]:
# Rating column after ordinal encoding.
df["rating"]

0        5
1        7
2        7
3        7
4        7
        ..
19995    7
19996    7
19997    5
19998    5
19999    7
Name: rating, Length: 20000, dtype: int64

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,helpful,funny,date,is_recommended,hours,date_release,win,mac,linux,rating,...,tag_Well-Written,tag_Werewolves,tag_Western,tag_Wholesome,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_eSports
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,...,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,4.0139,1.1603,1.582183e+18,0.8499,100.45077,1.494777e+18,0.99935,0.39385,0.28035,6.7593,...,0.00215,0.0015,0.0031,0.0011,0.00175,0.0034,0.0131,0.0004,0.03865,0.0153
std,131.182892,26.173736,7.32711e+16,0.357178,176.703246,1.207029e+17,0.025487,0.488614,0.449181,1.187449,...,0.046319,0.038702,0.055593,0.033149,0.041797,0.058212,0.113706,0.019996,0.192764,0.122746
min,0.0,0.0,1.291766e+18,0.0,0.0,8.676288e+17,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.542758e+18,1.0,7.7,1.425946e+18,1.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.602634e+18,1.0,27.1,1.508976e+18,1.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.638403e+18,1.0,97.525,1.591142e+18,1.0,1.0,1.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,18120.0,2555.0,1.672445e+18,1.0,999.8,1.692835e+18,1.0,1.0,1.0,8.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Imputation

> There are no null values, so there's no need for data imputation.

# Dimension Reduction

In [14]:
# Gather the columns whose name starts with "tag_" into their own frame; these are
# the inputs to the dimension-reduction steps.
tags_columns = list(filter(lambda name: name.startswith("tag_"), df.columns))
df_tags = df.loc[:, tags_columns]
df_tags

Unnamed: 0,tag_1980s,tag_1990's,tag_2.5D,tag_2D,tag_2D Fighter,tag_2D Platformer,tag_360 Video,tag_3D,tag_3D Fighter,tag_3D Platformer,...,tag_Well-Written,tag_Werewolves,tag_Western,tag_Wholesome,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_eSports
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Function to apply PCA and reduce dimensionality
def apply_pca(df, n_components=10, random_state=None):
    """Fit PCA on ``df`` and return the projected components as a DataFrame.

    Also prints the per-component and total explained variance ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; ``df.values`` is handed to PCA.
    n_components : default 10
        Forwarded unchanged to ``PCA``.
    random_state : int or None, default None
        Forwarded to ``PCA``. Pass an integer to make the randomized solvers
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``PCA_1`` .. ``PCA_k`` carrying the same index as ``df``.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    pca_result = pca.fit_transform(df.values)
    # Column names follow the actual output width (n_components need not be an int),
    # and the input index is reused so a later column-wise concat aligns row by row.
    pca_df = pd.DataFrame(
        data=pca_result,
        index=df.index,
        columns=[f'PCA_{i}' for i in range(1, pca_result.shape[1] + 1)],
    )

    print(f"{n_components=} Explained Variance Ratio:")
    print(pca.explained_variance_ratio_)
    print(f"{n_components=} Sum of Explained Variance Ratio:")
    print(sum(pca.explained_variance_ratio_))
    return pca_df

# Function to apply t-SNE and reduce dimensionality
def apply_tsne(df, n_components=2, perplexity=30, learning_rate=200, random_state=None):
    """Embed ``df`` with t-SNE and return the embedding as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; ``df.values`` is handed to t-SNE.
    n_components, perplexity, learning_rate
        Forwarded unchanged to ``TSNE`` (which is run with ``n_jobs=-1``).
    random_state : int or None, default None
        Forwarded to ``TSNE``. Pass an integer for a repeatable embedding;
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``tSNE_1`` .. ``tSNE_k`` carrying the same index as ``df``.
    """
    tsne = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        learning_rate=learning_rate,
        n_jobs=-1,
        random_state=random_state,
    )
    tsne_result = tsne.fit_transform(df.values)
    # Reuse the input index so a later column-wise concat aligns row by row.
    tsne_df = pd.DataFrame(
        data=tsne_result,
        index=df.index,
        columns=[f'tSNE_{i}' for i in range(1, tsne_result.shape[1] + 1)],
    )
    return tsne_df

# Function to apply UMAP and reduce dimensionality
def apply_umap(df, n_components=2, n_neighbors=15, min_dist=0.1, random_state=None):
    """Embed ``df`` with UMAP and return the embedding as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; ``df.values`` is handed to UMAP.
    n_components, n_neighbors, min_dist
        Forwarded unchanged to ``umap.UMAP``.
    random_state : int or None, default None
        Forwarded to ``umap.UMAP``. Pass an integer for a repeatable embedding;
        ``None`` keeps the previous unseeded behaviour.
        NOTE(review): umap-learn documents that a fixed seed trades away
        parallelism - confirm the runtime is acceptable before enabling it.

    Returns
    -------
    pandas.DataFrame
        Columns ``UMAP_1`` .. ``UMAP_k`` carrying the same index as ``df``.
    """
    umap_obj = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
    )
    umap_result = umap_obj.fit_transform(df.values)
    # Reuse the input index so a later column-wise concat aligns row by row.
    umap_df = pd.DataFrame(
        data=umap_result,
        index=df.index,
        columns=[f'UMAP_{i}' for i in range(1, umap_result.shape[1] + 1)],
    )
    return umap_df

# Append reduced dimension columns to original DataFrame
def append_dimensions(df, pca_df, tsne_df, umap_df):
    """Return a new frame: ``df`` followed by the PCA, t-SNE and UMAP columns.

    The four frames are concatenated column-wise, so rows are matched by index label.
    """
    frames = (df, pca_df, tsne_df, umap_df)
    return pd.concat(frames, axis="columns")

In [20]:
# Reduce the tag columns to 10 principal components (also prints explained variance).
pca_df = apply_pca(df=df_tags, n_components=10)
pca_df

n_components=10 Explained Variance Ratio:
[0.14679534 0.05807191 0.04217454 0.03222231 0.02794991 0.02350984
 0.02152643 0.01990591 0.01751607 0.0165942 ]
n_components=10 Sum of Explained Variance Ratio:
0.4062664569875134


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10
0,-0.996341,0.068097,-0.112916,0.002531,-0.005819,-0.014188,0.014074,-0.034879,0.004337,-0.023157
1,-0.996341,0.068097,-0.112916,0.002531,-0.005819,-0.014188,0.014074,-0.034879,0.004337,-0.023157
2,-0.996341,0.068097,-0.112916,0.002531,-0.005819,-0.014188,0.014074,-0.034879,0.004337,-0.023157
3,1.292698,-1.142232,0.992198,0.261531,-0.155303,-0.463202,0.634730,-0.176670,-1.032588,-0.473616
4,1.242478,0.570785,-0.155852,-1.266510,0.745922,1.115369,-0.683855,-0.444470,-0.435452,0.127897
...,...,...,...,...,...,...,...,...,...,...
19995,-0.996341,0.068097,-0.112916,0.002531,-0.005819,-0.014188,0.014074,-0.034879,0.004337,-0.023157
19996,-0.996341,0.068097,-0.112916,0.002531,-0.005819,-0.014188,0.014074,-0.034879,0.004337,-0.023157
19997,0.772011,-1.504048,0.511627,-0.017805,0.274444,-0.686770,-0.319560,1.857165,-0.614755,-0.284398
19998,-0.996341,0.068097,-0.112916,0.002531,-0.005819,-0.014188,0.014074,-0.034879,0.004337,-0.023157


In [17]:
# Apply t-SNE to the tag columns (df_tags); this call does not use the PCA output.
# NOTE(review): the previous comment said "after PCA" - pass pca_df instead if that was the intent.
tsne_df = apply_tsne(df_tags)  
tsne_df

Unnamed: 0,tSNE_1,tSNE_2
0,-31.768894,-1.777161
1,-31.774498,-1.781216
2,-31.815800,-1.816161
3,13.724992,-4.075267
4,49.594894,16.781155
...,...,...
19995,-33.415123,0.196950
19996,-33.415123,0.196950
19997,-25.146723,-55.852413
19998,-33.415123,0.196950


In [18]:
# Apply UMAP to the tag columns (df_tags); this call does not use the PCA output.
# NOTE(review): the previous comment said "after PCA" - pass pca_df instead if that was the intent.
umap_df = apply_umap(df_tags)  
umap_df


failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


Unnamed: 0,UMAP_1,UMAP_2
0,-0.109410,-5.660317
1,-1.018438,-5.351407
2,0.342578,-7.019988
3,5.714807,5.111104
4,6.468112,8.574800
...,...,...
19995,-1.973978,-4.874712
19996,0.054759,-5.362456
19997,8.485496,5.742757
19998,-1.182518,-5.787058


In [19]:
# Append reduced dimension columns to original DataFrame.
# Any PCA_/tSNE_/UMAP_ columns left over from an earlier run of this cell are dropped
# first, so re-running replaces them instead of adding duplicate column names.
reduced_columns = [*pca_df.columns, *tsne_df.columns, *umap_df.columns]
df = append_dimensions(
    df.drop(columns=reduced_columns, errors="ignore"),
    pca_df,
    tsne_df,
    umap_df,
)
df

Unnamed: 0,helpful,funny,date,is_recommended,hours,date_release,win,mac,linux,rating,...,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,tSNE_1,tSNE_2,UMAP_1,UMAP_2
0,0,0,1561939200000000000,1,68.1,1427932800000000000,1,0,0,5,...,-0.005817,-0.014193,0.014085,-0.034865,0.004092,-0.022869,-31.768894,-1.777161,-0.109410,-5.660317
1,0,0,1637971200000000000,1,5.0,1630540800000000000,1,0,0,7,...,-0.005817,-0.014193,0.014085,-0.034865,0.004092,-0.022869,-31.774498,-1.781216,-1.018438,-5.351407
2,0,0,1470009600000000000,1,3.0,1329696000000000000,1,1,0,7,...,-0.005817,-0.014193,0.014085,-0.034865,0.004092,-0.022869,-31.815800,-1.816161,0.342578,-7.019988
3,10,3,1508630400000000000,0,4.6,1402876800000000000,1,1,0,7,...,-0.155092,-0.463906,0.634547,-0.175196,-1.039181,-0.464736,13.724992,-4.075267,5.714807,5.111104
4,21,0,1393545600000000000,1,25.4,1367193600000000000,1,1,1,7,...,0.745866,1.115563,-0.684972,-0.446322,-0.432909,0.134876,49.594894,16.781155,6.468112,8.574800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,1419206400000000000,1,0.1,1585008000000000000,1,0,0,7,...,-0.005817,-0.014193,0.014085,-0.034865,0.004092,-0.022869,-33.415123,0.196950,-1.973978,-4.874712
19996,0,0,1521158400000000000,0,532.8,1525737600000000000,1,1,0,7,...,-0.005817,-0.014193,0.014085,-0.034865,0.004092,-0.022869,-33.415123,0.196950,0.054759,-5.362456
19997,0,0,1561766400000000000,1,3.5,1521244800000000000,1,0,0,5,...,0.274396,-0.686753,-0.318425,1.856322,-0.618559,-0.274566,-25.146723,-55.852413,8.485496,5.742757
19998,31,2,1513900800000000000,1,666.1,1427932800000000000,1,0,0,5,...,-0.005817,-0.014193,0.014085,-0.034865,0.004092,-0.022869,-33.415123,0.196950,-1.182518,-5.787058


In [20]:
# Save as Parquet rather than CSV: Parquet keeps the column data types, which matters
# because this file is shared with a teammate.
# Parquet IO needs `pyarrow` to be installed (`pip install pyarrow`).
df.to_parquet(path=f"../data/3-recommendations_100000_samples-{file_name}_preprocessed.parquet")