In [1]:
# Statistics
import pandas as pd
import numpy as np
import math as mt

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data Preprocessing - Encoding and Imputation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Data Preprocessing - Feature Engineering
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

# Data Preprocessing - ML Pipelines
from sklearn.compose import ColumnTransformer

# ML - Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# ML - Evaluation
from sklearn.model_selection import cross_val_score

# ML - Tuning
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('data/train.csv')

In [8]:
# Mark the train dataset with kfold = 5
from sklearn import model_selection

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=df)):
    print(fold, train_indicies, valid_indicies)
    #df.loc[valid_indicies, "kfold"] = fold

0 [     1      2      3 ... 299997 299998 299999] [     0      6     11 ... 299973 299983 299992]
1 [     0      1      2 ... 299995 299996 299998] [     4     10     31 ... 299982 299997 299999]
2 [     0      2      3 ... 299997 299998 299999] [     1      9     20 ... 299987 299988 299991]
3 [     0      1      2 ... 299995 299997 299999] [     3      7      8 ... 299994 299996 299998]
4 [     0      1      3 ... 299997 299998 299999] [     2      5     13 ... 299990 299993 299995]


In [7]:
fold

4

In [19]:
# Export train dataset with kfold mark
df.to_csv("data/train_fold.csv", index=False)

In [20]:
# Read train dataset

train_data = pd.read_csv('data/train_fold.csv')
test_data = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [21]:
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
test_data = test_data[useful_features]

In [34]:
X_test

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,4.0,4.0,8.0,...,0.476739,0.376350,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,0.0,1.0,0.0,2.0,1.0,2.0,0.0,4.0,2.0,7.0,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.693940
2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,4.0,3.0,10.0,...,0.697272,0.683600,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,1.0,1.0,0.0,2.0,1.0,3.0,0.0,4.0,0.0,13.0,...,0.719306,0.777890,0.730954,0.644315,1.024017,0.391090,0.988340,0.411828,0.393585,0.461372
4,1.0,1.0,0.0,2.0,1.0,2.0,0.0,4.0,2.0,5.0,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,1.0,0.0,0.0,2.0,1.0,3.0,0.0,4.0,4.0,8.0,...,0.287454,0.543800,0.682378,1.028978,1.022741,0.683903,0.877273,0.532410,0.605397,0.884581
199996,1.0,0.0,0.0,2.0,1.0,1.0,0.0,4.0,2.0,5.0,...,0.794881,0.432778,0.389775,0.359871,0.550013,0.492082,0.202295,0.416875,0.406205,0.758665
199997,0.0,1.0,1.0,2.0,1.0,1.0,0.0,4.0,2.0,8.0,...,0.514487,0.060997,0.171741,0.317185,0.150340,0.122109,0.390524,0.334026,0.378987,0.839416
199998,0.0,0.0,0.0,2.0,1.0,3.0,0.0,3.0,0.0,5.0,...,0.286144,1.061710,0.819811,0.901241,0.555339,0.844315,0.894193,0.794102,0.844279,0.890473


In [43]:
final_predictions = []
for fold in range(5):
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    #print("encoding")
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    #print("training")
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    model = XGBRegressor(random_state=fold, n_jobs=8)
    model.fit(X_train, y_train)
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    
    print(fold, mean_squared_error(y_valid, preds_valid, squared=False))

0 0.7242812912900478
1 0.7232810321072864
2 0.725452249623988
3 0.725286377838993
4 0.7242629367174095


In [44]:
np.column_stack(final_predictions).shape

(200000, 5)

In [45]:
np.column_stack(final_predictions)

array([[8.079589 , 8.039468 , 7.961714 , 7.8637094, 7.976139 ],
       [8.331573 , 8.284987 , 8.272598 , 8.327856 , 8.320301 ],
       [8.423865 , 8.445584 , 8.369508 , 8.327526 , 8.199669 ],
       ...,
       [8.257159 , 8.337429 , 8.588288 , 8.460814 , 8.394254 ],
       [8.299861 , 8.049863 , 8.017136 , 8.097175 , 8.053148 ],
       [7.9633756, 7.8542085, 7.976585 , 7.896254 , 7.967789 ]],
      dtype=float32)

In [48]:
np.mean(np.column_stack(final_predictions), axis=1)

array([7.984124, 8.307463, 8.35323 , ..., 8.407589, 8.103436, 7.931642],
      dtype=float32)

In [51]:
preds = np.mean(np.column_stack(final_predictions), axis=1)

In [52]:
preds = pd.DataFrame({'id': sample_submission.id, 'target': preds})

In [54]:
preds.to_csv('submission.csv', index=False)

In [14]:
np.log1p([2])

array([1.09861229])

In [12]:
mt.log1p(1)

0.6931471805599453

In [15]:
mt.log(2+1)

1.0986122886681098

In [17]:
assert X_train.columns == X_test.columns, 'ValueError: Columns of X_train are not equal to columns of X_test'

NameError: name 'X_train' is not defined