## Notebook Name: Baseline Competition.<BR>
## by: Abhishek Thakur


## Imports

In [14]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")


## Load Data

In [15]:
# Training data that already carries a `kfold` column (used by the fold loop);
# the first CSV column is used as the row index.
df_train = pd.read_csv("../30Days-ML/train_folds.csv", index_col=0)

# Test data, indexed by its first CSV column as well.
df_test = pd.read_csv("../30Days-ML/test.csv", index_col=0)

# Submission template; its `target` column is filled in once predictions exist.
sample_submission = pd.read_csv("../30Days-ML/sample_submission.csv")

## Selecting the categorical and useful feature columns

In [16]:
# Model inputs: every training column except the id, the label and the fold marker.
useful_features = [
    col
    for col in df_train.columns
    if col not in ('id', 'target', 'kfold')
]

# Columns whose name contains 'cat'; these are the ones that get ordinal-encoded.
obj_cols = [col for col in df_train.columns if 'cat' in col]

# Keep only the model-input columns in the test frame, in the same order as training.
df_test = df_test[useful_features]

## Training folds loop

In [17]:
# Train one XGBoost regressor per fold, report its RMSE on the held-out fold,
# and collect each model's test-set predictions so they can be averaged later.
# NOTE(review): range(5) assumes `kfold` holds the labels 0-4 — confirm against
# how train_folds.csv was generated.
final_prediction = []
for fold in range(5):
    # Data handling: rows whose kfold equals `fold` are held out for validation.
    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
    x_test = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Take explicit copies so the column assignments below write into
    # independent frames instead of a selection of another DataFrame
    # (the resulting pandas warning was previously hidden by the global
    # warnings filter).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Preprocessing: fit the encoder on the training fold only, then apply the
    # same mapping to the validation fold and the test set.
    ord_encoder = OrdinalEncoder()
    xtrain[obj_cols] = ord_encoder.fit_transform(xtrain[obj_cols])
    xvalid[obj_cols] = ord_encoder.transform(xvalid[obj_cols])
    x_test[obj_cols] = ord_encoder.transform(x_test[obj_cols])

    # Training model (the fold number doubles as the random seed)
    model = XGBRegressor(random_state=fold, n_jobs=-1)
    model.fit(xtrain, ytrain)

    # Model Prediction
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(x_test)
    final_prediction.append(test_preds)

    # RMSE computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    print(fold, "=>", np.sqrt(mean_squared_error(yvalid, valid_preds)))





0 => 0.7242812912900478
1 => 0.7232810321072864
2 => 0.725452249623988
3 => 0.725286377838993
4 => 0.7242629367174095


## Average the stacked test predictions across folds

In [None]:
# Stack the per-fold test predictions as columns (one column per fold) and
# average across folds to get a single prediction per test row.
test_preds = np.mean(np.column_stack(final_prediction), axis=1)

# Assign with bracket indexing: attribute-style assignment
# (`sample_submission.target = ...`) only updates a column that already exists
# and would otherwise just set a Python attribute, leaving the CSV without the
# predictions.
sample_submission["target"] = test_preds
sample_submission.to_csv('submission.csv', index=False)
