In [None]:
import os
import gc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt

# Keep automatic garbage collection switched on; later cells delete large
# objects and call gc.collect() explicitly to release their memory.
gc.enable()

# Seed NumPy's legacy global random generator so that any code drawing from
# np.random produces the same numbers on every fresh run.
np.random.seed(seed=0)

In [None]:
# Read the training sample and keep only the rows whose `maxPlace` is
# greater than 1.
# NOTE(review): presumably rows with maxPlace <= 1 have no meaningful
# placement target -- confirm against the data description.
train = (
    pd.read_csv("../data/small_train_FE.csv")
    .loc[lambda frame: frame['maxPlace'] > 1]
)

In [None]:
# Name of the column the model is trained to predict.
target = "winPlacePerc"

# Start from every column, then strip the identifier / match-type columns
# and the target itself. list.remove raises ValueError if a column is
# missing, which surfaces an unexpected input schema immediately.
features = train.columns.tolist()
for dropped_column in ("Id", "matchId", "groupId", "matchType", target):
    features.remove(dropped_column)

# Target as an independent NumPy array; feature matrix as a DataFrame that
# holds only the remaining columns.
y = train[target].to_numpy(copy=True)
x = train[features]

# The full frame is no longer needed once x and y exist; free it now.
del train
gc.collect()

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state makes the
# partition reproducible.
# NOTE(review): this is a row-level random split. If rows belonging to the
# same match/group share a target value, a group-aware split may be needed
# to avoid leakage between train and validation -- confirm.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)

# The unsplit copies are redundant now; release them.
del x
del y
gc.collect()

In [None]:
# Random forest regressor: 100 trees, up to 4 parallel jobs, and a fixed
# random_state so repeated fits build the same forest.
ran_forest = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
    n_jobs=4,
)

In [None]:
%%time
ran_forest.fit(x_train, y_train)

In [None]:
# Report the mean absolute error on both splits.
# scikit-learn metric functions are declared as (y_true, y_pred); pass the
# ground truth first so the call matches the documented signature. MAE is
# symmetric in its two arguments, so the printed values are unaffected.
train_pred = ran_forest.predict(x_train)
print('Training MAE: ', metrics.mean_absolute_error(y_train, train_pred))

val_pred = ran_forest.predict(x_val)
print('Validation MAE: ', metrics.mean_absolute_error(y_val, val_pred))

In [None]:
# One row per feature, labelled by column name, ordered from the most to the
# least important according to the fitted forest.
feature_importance = pd.DataFrame(
    {'importance': ran_forest.feature_importances_},
    index=x_train.columns,
).sort_values(by='importance', ascending=False)

# Only the importance table is used from here on; drop the fitted model to
# reclaim its memory.
del ran_forest
gc.collect()

In [None]:
# Apply the style BEFORE creating the figure. Style settings are read when
# figures and axes are created, so setting the style afterwards makes the
# first run of this cell render differently from a re-run.
plt.style.use('fivethirtyeight')

# Bar chart of the importances, one bar per feature in sorted order.
x_values = list(range(len(feature_importance.index)))
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x_values, feature_importance['importance'])
ax.set_xticks(x_values)
ax.set_xticklabels(np.array(feature_importance.index), rotation='vertical')
ax.set_ylabel('Importance')
ax.set_xlabel('Feature')
ax.set_title('Feature Importances\n(with feature engineering)')
fig.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the cell also works on a fresh checkout.
output_path = 'viz/FE_small_RF_importances.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path, dpi=fig.dpi)

# Close this specific figure so it is not kept in memory or shown inline.
plt.close(fig)

In [None]:
# Recorded results from a previous run:
# training MAE:   0.0224
# validation MAE: 0.0602