In [231]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import utils
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')

from sklearn.model_selection import train_test_split

In [220]:
# Directory holding the csv files: a 'data' folder under the current working directory.
DATA_PATH = os.path.join(os.getcwd(), 'data')


def load_data(file_name):
    """Read a csv file from DATA_PATH into a pandas DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the csv file, relative to DATA_PATH.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the 'purchase_date' and 'release_date'
        columns parsed as dates.
    """
    date_columns = ['purchase_date', 'release_date']
    return pd.read_csv(os.path.join(DATA_PATH, file_name), parse_dates=date_columns)

In [221]:
def extract_dateinfo(df, col_name):
    """Add year / month / day columns derived from a date column.

    Three columns named ``<col_name>_year``, ``<col_name>_month`` and
    ``<col_name>_day`` are added to ``df`` in place, and the same frame is
    returned so the call can be chained or re-assigned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``. It is modified in place.
    col_name : str
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three new columns appended.
    """
    # to_datetime is a no-op for datetime64 columns and lets date strings or
    # object columns through; the vectorised .dt accessor then replaces the
    # per-element lambdas and yields numeric columns even for an empty frame.
    dates = pd.to_datetime(df[col_name]).dt
    for part in ('year', 'month', 'day'):
        df[col_name + '_' + part] = getattr(dates, part)

    return df

In [222]:
# Outlier detection
def detect_outliers(df, n, features):
    """
    Return the index labels of rows that are outliers in more than n features.

    A value is an outlier for a feature when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that feature
    (Tukey's method).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns.
    n : int
        Threshold: a row is reported only when it is an outlier in strictly
        more than n of the given features.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_counts = Counter()

    for col in features:
        values = df[col]

        # nanpercentile ignores missing values; plain np.percentile would
        # return NaN quartiles for a column containing any NaN, which makes
        # every comparison below False and hides that column's outliers.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        # rows falling outside the Tukey fences for this feature
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(values[is_outlier].index)

    return [label for label, hits in outlier_counts.items() if hits > n]

In [223]:
game_info = load_data('train.csv')
test_set = load_data('test.csv')

# Fill missing purchase dates with the most frequent value in the column.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# a selected column is chained assignment and is not guaranteed to modify
# game_info itself.
game_info['purchase_date'] = game_info['purchase_date'].fillna(
    game_info['purchase_date'].mode()[0]
)

# Fill missing positive / negative review counts with zeros.
review_cols = ['total_positive_reviews', 'total_negative_reviews']
game_info[review_cols] = game_info[review_cols].fillna(0.0)

# Convert boolean values to 1.0 (True) and 0.0 (False).
game_info['is_free'] = game_info['is_free'].map({False: 0.0, True: 1.0})

# Drop rows that are Tukey outliers in more than 2 of the 3 numeric features,
# i.e. in all of them, then renumber the index.
outliers_to_drop = detect_outliers(game_info, 2, ['price', 'total_negative_reviews', 'total_positive_reviews'])
game_info = game_info.drop(outliers_to_drop, axis=0).reset_index(drop=True)

# Split the comma-separated strings in the categorical columns into lists.
for list_col in ('genres', 'categories', 'tags'):
    game_info[list_col] = game_info[list_col].str.split(',')

#game_info

#game_info.isnull().sum()

In [224]:
# dataframe with only the categorical (list-valued) columns
cate_df = game_info[['genres', 'categories', 'tags']]


def _multi_hot(list_col):
    """One-hot encode a Series whose cells are lists of labels.

    Each list is spread over one row per label, the labels are dummy-encoded,
    and the dummies are summed back per original row, giving one column per
    distinct label.
    """
    stacked = list_col.apply(pd.Series).stack()
    # groupby(level=0).sum() replaces the sum(level=0) shortcut, which newer
    # pandas releases no longer provide.
    return pd.get_dummies(stacked).groupby(level=0).sum()


# one-hot encoding of the categorical columns
genres_df = _multi_hot(cate_df['genres'])
categories_df = _multi_hot(cate_df['categories'])
tags_df = _multi_hot(cate_df['tags'])

# concatenate the encoded dataframes with the remaining game_info columns
game_df = pd.concat(
    [game_info.drop(columns=['genres', 'categories', 'tags']), genres_df, categories_df, tags_df],
    axis=1,
    sort=False,
)

# Drop duplicate columns (the same label can appear in more than one of the
# encoded frames), keeping the first occurrence. The mask is built from
# game_df itself so the cell does not depend on a name from another session.
game_df = game_df.loc[:, ~game_df.columns.duplicated()]
game_df

Unnamed: 0,id,playtime_forever,is_free,price,purchase_date,release_date,total_positive_reviews,total_negative_reviews,Action,Adventure,...,Voxel,Walking Simulator,War,Wargame,Warhammer 40K,Western,World War I,World War II,Zombies,eSports
0,0,0.000000,0.0,3700.0,2018-07-02,2013-12-10,372.0,96.0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,1,0.016667,1.0,0.0,2016-11-26,2015-08-12,23.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0.000000,0.0,5000.0,2018-07-02,2014-01-28,3018.0,663.0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3,1.533333,0.0,9900.0,2016-11-28,2010-03-31,63078.0,1746.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,22.333333,0.0,4800.0,2018-03-04,2012-07-30,8841.0,523.0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,352,0.783333,0.0,8800.0,2017-11-24,2016-12-20,150.0,91.0,1,0,...,0,0,0,0,0,0,0,0,0,0
347,353,0.000000,0.0,6800.0,2018-08-15,2015-06-23,19008.0,4849.0,1,1,...,0,0,0,0,0,0,0,0,0,0
348,354,0.000000,0.0,8300.0,2018-01-30,2015-11-05,5099.0,1719.0,1,1,...,0,0,0,0,0,0,0,0,0,0
349,355,0.000000,0.0,6800.0,2017-09-23,2016-06-03,718.0,159.0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [227]:
# Expand both date columns into year / month / day features, then discard the
# raw date columns.
for date_col in ('purchase_date', 'release_date'):
    game_df = extract_dateinfo(game_df, date_col)
game_df = game_df.drop(columns=['purchase_date', 'release_date'])
game_df

Unnamed: 0,id,playtime_forever,is_free,price,total_positive_reviews,total_negative_reviews,Action,Adventure,Animation & Modeling,Audio Production,...,World War I,World War II,Zombies,eSports,purchase_date_year,purchase_date_month,purchase_date_day,release_date_year,release_date_month,release_date_day
0,0,0.000000,0.0,3700.0,372.0,96.0,0,1,0,0,...,0,0,0,0,2018,7,2,2013,12,10
1,1,0.016667,1.0,0.0,23.0,0.0,0,0,0,0,...,0,0,0,0,2016,11,26,2015,8,12
2,2,0.000000,0.0,5000.0,3018.0,663.0,0,1,0,0,...,0,0,0,0,2018,7,2,2014,1,28
3,3,1.533333,0.0,9900.0,63078.0,1746.0,1,0,0,0,...,0,0,0,0,2016,11,28,2010,3,31
4,4,22.333333,0.0,4800.0,8841.0,523.0,1,0,0,0,...,0,0,0,0,2018,3,4,2012,7,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,352,0.783333,0.0,8800.0,150.0,91.0,1,0,0,0,...,0,0,0,0,2017,11,24,2016,12,20
347,353,0.000000,0.0,6800.0,19008.0,4849.0,1,1,0,0,...,0,0,0,0,2018,8,15,2015,6,23
348,354,0.000000,0.0,8300.0,5099.0,1719.0,1,1,0,0,...,0,0,0,0,2018,1,30,2015,11,5
349,355,0.000000,0.0,6800.0,718.0,159.0,1,1,0,0,...,0,0,0,0,2017,9,23,2016,6,3


In [228]:
# Target is the playtime column; every other column of game_df is a feature.
y = game_df['playtime_forever']
X = game_df.drop(columns=['playtime_forever'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

for label, part in (
    ('X_train shape: ', X_train),
    ('X_val shape: ', X_val),
    ('y_train shape : ', y_train),
    ('y_val shape : ', y_val),
):
    print(label, part.shape)

X_train shape:  (280, 351)
X_val shape:  (71, 351)
y_train shape :  (280,)
y_val shape :  (71,)


In [233]:
# LogisticRegression is a classifier, so the playtime target is label-encoded
# first: every distinct playtime value in y_train becomes one class.
# NOTE(review): playtime_forever looks like a continuous quantity, so a
# regression model may be a better fit than per-value classes - confirm.
lab_enc = preprocessing.LabelEncoder()
encoded = lab_enc.fit_transform(y_train)

logreg = LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=200)
logreg.fit(X_train, encoded)

# predict() returns encoded class indices, not playtimes. Map them back
# through the encoder so y_pred is in the same units as y_val.
y_pred_encoded = logreg.predict(X_val)
y_pred = lab_enc.inverse_transform(y_pred_encoded)