In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

np.set_printoptions(suppress=True)

In [2]:
# Read csv file into dataframe
train_df = pd.read_csv("train.csv", parse_dates=["purchase_date", "release_date"])
test_df = pd.read_csv("test.csv", parse_dates=["purchase_date", "release_date"], index_col = 0)

## Part 1 Data processing
* Do one-hot encoding for `tags`, `genres` and `categories` of trainning dataset and test dataset respectively. Then union the one-hot encoding of and drop duplicates of columns. 
* Split the date into year and month ignoring day. 
* Add a new feature *interval* = *purchase_date_day* - *release_date_day*. 

In [3]:
# Add a feature of interval using purchase_date - release_date
train_df["interval"] = train_df["purchase_date"].apply(lambda x: x.day) - train_df["release_date"].apply(lambda x: x.day)

def extract_date(df, column):
    df[column + "_year"] = df[column].apply(lambda x: x.year)
    df[column + "_month"] = df[column].apply(lambda x: x.month)

extract_date(train_df, "purchase_date")
extract_date(train_df, "release_date")

train_df = train_df.fillna(0.0)

In [4]:
# One-hot encoding for trainning data
tags_train = train_df["tags"].str.get_dummies(",") # shape = (357, 312)
genres_train = train_df["genres"].str.get_dummies(",") # (357, 20)
categories_train = train_df["categories"].str.get_dummies(",") # (357, 29)

diff = genres_train.columns.difference(tags_train.columns) # drop duplicates of columns
train_one_hot = tags_train.join(genres_train[diff]) # (357, 312)
 
diff = categories_train.columns.difference(train_one_hot.columns)
train_one_hot = train_one_hot.join(categories_train[diff]) # (357, 340)

In [5]:
# One-hot endcoding for test data
tags_test = test_df["tags"].str.get_dummies(",") # (90, 229)
genres_test = test_df["genres"].str.get_dummies(",") # (90, 14)
categories_test = test_df["categories"].str.get_dummies(",") # (90, 28)

diff = genres_test.columns.difference(tags_test.columns)
test_one_hot = tags_test.join(genres_test[diff]) # (90, 229)

diff = categories_test.columns.difference(test_one_hot.columns)
test_one_hot = test_one_hot.join(categories_test[diff]) # (90, 256)

In [6]:
# Union train_one_hot and test_one hot features
# Fill the NaN with 0.0

diff = test_one_hot.columns.difference(train_one_hot.columns) # (5,)
train_one_hot = pd.concat([train_one_hot, pd.DataFrame(columns=list(diff))], axis=1) # (357, 345)
train_one_hot = train_one_hot.fillna(0.0)

diff = train_one_hot.columns.difference(test_one_hot.columns) # (89,)
test_one_hot = pd.concat([test_one_hot, pd.DataFrame(columns=list(diff))], axis=1) # (90, 345)
test_one_hot = test_one_hot.fillna(0.0)

## Part 2 Prepare data for trainning

* `[X_train, y_train]` use to train a basic model
* `[X_train_correct, y_train_corretc]` use to train a correction model

In [7]:
# The full trainning dataset. 
# Use it to train and predict on test dataset at first.
X_train = train_df.join(train_one_hot) \
                  .drop(["genres", "tags", "categories", "purchase_date", "release_date", \
                         "id", "playtime_forever"], axis = 1)
y_train = train_df["playtime_forever"]

In [8]:
# The sub-dataset (playtime_forever > 3) of the oringinal trainning set. 
# Use the sub-dataset to re-train a new model and then do prediction again to correct the outliers.
train_df_correct = train_df[train_df["playtime_forever"] > 3]
X_train_correct = train_df_correct.join(train_one_hot) \
                                  .drop(["genres", "tags", "categories", "purchase_date","release_date", \
                                         "id", "playtime_forever"], axis = 1)
y_train_correct = train_df_correct["playtime_forever"]

## Part 3 Train models

In [9]:
# Train a decision tree regression model using full trainning dateset
regDT = DecisionTreeRegressor(random_state=0)
regDT.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [10]:
# Train a correction decision tree regression model using sub trainning dateset
regDTCorrect = DecisionTreeRegressor(random_state=0)
regDTCorrect.fit(X_train_correct, y_train_correct)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [11]:
# Train a ramdom forest tree regression model using full trainning dataset
regRF = RandomForestRegressor(max_depth=None, random_state=0, n_estimators=100)
regRF.fit(X_train, y_train)  

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

## Part 4 Predict on test dataset

In [12]:
# Read test.csv again
test_df = pd.read_csv("test.csv", parse_dates=["purchase_date", "release_date"], index_col = 0)

In [13]:
# Pre-processing the test data
test_df["interval"] = test_df["purchase_date"].apply(lambda x: x.day) - test_df["release_date"].apply(lambda x: x.day)

extract_date(test_df, "purchase_date")
extract_date(test_df, "release_date")

test_df = test_df.fillna(0.0)

In [14]:
# prepare the test data for prediction
X_test = test_df.join(test_one_hot).drop(["genres", "tags", "categories", "purchase_date", "release_date"], axis = 1)

In [15]:
regDT.predict(X_test)

array([ 0.        ,  0.63333333,  0.63333333,  0.        ,  0.1       ,
        0.63333333,  0.03333333,  0.        ,  0.63333333,  4.21666667,
        0.        ,  0.01666667,  0.        ,  0.1       ,  0.        ,
        0.        ,  0.01666667,  1.26666667,  0.05      ,  0.63333333,
        0.        ,  0.13333333,  0.96666667,  0.05      ,  0.        ,
        0.        ,  0.63333333,  0.13333333,  1.68333333,  1.68333333,
        1.68333333, 49.38333333,  2.3       ,  0.05      ,  4.21666667,
        0.05      ,  0.        ,  0.        ,  0.        ,  2.5       ,
        0.        ,  0.        ,  0.        ,  0.63333333,  0.        ,
        0.        ,  0.96666667,  0.        ,  0.        ,  2.45      ,
        0.        ,  0.        ,  4.33333333,  0.63333333,  0.63333333,
        0.        ,  0.5       ,  0.        ,  0.01666667,  0.        ,
        0.        ,  0.01666667,  0.63333333,  0.        ,  0.63333333,
        0.2       ,  1.68333333,  4.51666667,  6.3       ,  0.63

In [16]:
regDTCorrect.predict(X_test)

array([  3.11666667,   6.36666667,   6.36666667,   6.36666667,
         9.88333333,   3.06666667,   6.36666667,   3.11666667,
         3.06666667,   6.36666667,   3.06666667,   3.06666667,
         6.36666667,   3.16666667,   9.88333333,  28.98333333,
         4.21666667,   4.55      ,   4.55      ,   3.11666667,
         3.11666667,   6.36666667,   6.36666667,   4.55      ,
         3.11666667,   3.11666667,   3.06666667,   4.55      ,
         4.55      ,   6.36666667,  63.8       , 113.8       ,
         4.55      ,   4.55      ,   3.16666667,   3.16666667,
         6.36666667,  10.        ,   4.55      ,   3.06666667,
         3.06666667,   3.11666667,   6.36666667,   4.21666667,
         4.55      ,  20.56666667,  63.8       ,   4.21666667,
         3.11666667,  31.98333333,  28.98333333,   3.11666667,
         3.16666667,   3.06666667,   6.36666667,   4.21666667,
         5.88333333,   6.36666667,  28.98333333,   6.36666667,
         6.36666667,   4.21666667,   4.21666667,  20.61

In [17]:
regRF.predict(X_test)

array([ 1.6075    ,  9.05633333,  0.59283333,  0.44283333,  5.47966667,
        1.02783333,  2.53966667,  0.48116667,  1.49266667,  2.39583333,
        1.871     ,  1.49583333,  9.027     ,  8.98783333,  9.0995    ,
        2.33066667,  1.92133333,  1.83      ,  1.74116667,  9.33666667,
        0.69783333,  6.40866667,  2.84316667,  0.46216667,  0.20533333,
        1.95083333,  0.95466667,  1.00516667,  9.38366667,  2.52683333,
        4.70283333, 27.89166667,  8.37833333,  0.9655    ,  2.47866667,
        1.62483333,  4.17233333,  5.21016667,  2.30433333,  1.9535    ,
        3.42633333,  1.68233333,  4.00233333,  2.394     ,  8.688     ,
        1.75183333, 12.22083333,  0.75883333,  2.80733333, 14.18733333,
        1.83083333,  1.97683333,  1.6875    ,  2.40983333,  1.20833333,
        2.254     ,  2.39383333,  2.70683333,  1.28983333,  0.66016667,
        1.887     ,  8.43683333,  1.03033333, 10.227     ,  7.80866667,
        8.38283333,  0.98333333,  2.39733333, 10.4935    ,  0.45

In [18]:
correct_distance = regDTCorrect.predict(X_test) - regDT.predict(X_test) # calculate the correction distance
filter = np.vectorize(lambda x: 0 if x < 30 else x) # filter the distance that < 30
correct_vector = filter(correct_distance)
correct_vector

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 62, 64,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 62,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, 64,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, 63,  0,  0])

## Part 5 Generate submission csv file

In [19]:
# First submission: desicion tree + correct vector
y_test_predict = regDT.predict(X_test) + correct_vector
result = pd.DataFrame({"id":range(90), "playtime_forever":y_test_predict})
result.to_csv("submission_DT_correct.csv", index=False)

# Before add the correct vector, the DT model achieves 11+ score in public board.
# After using correction, it can achieve 4+ score in public board.
# However, the TA warned that the high performance in public board didn't mean the model also performed well in private board.

In [20]:
# Second submission: random forest
y_test_predict = regRF.predict(X_test)
result = pd.DataFrame({"id":range(90), "playtime_forever":y_test_predict})
result.to_csv("submission_RF.csv", index=False)

# The RF model gaines 15+ score in public board.