In [221]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
import datetime

In [203]:
# Read the training and test tables from their CSV files.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv/train.csv', './test.csv/test.csv')
)

In [204]:
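# NOTE: disabled experiment -- mean (target) encoding of the P1..P37 columns.
# NOTE(review): the test-set half groups test_df by 'revenue', which test_df
# presumably does not contain; it would need to reuse the mapping learned from df.
# for i in range (1, 38):
#     index = 'P' + str(i)
#     mean_encodefor_df = df.groupby(index)['revenue'].mean()
#     df.loc[:, index + "_encoded"] = df[index].map(mean_encodefor_df)
#     df.drop(columns = [index], inplace = True)

#     mean_encodefor_testdf = test_df.groupby(index)['revenue'].mean()
#     test_df.loc[:, index + "_encoded"] = test_df[index].map(mean_encodefor_testdf)
#     test_df.drop(columns = [index], inplace = True)


# print(df, test_df)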

In [205]:
# The set of cities differs between the two datasets (the test set contains far
# more cities than the training set), so 'City' is removed from both frames.
df.drop('City', axis=1, inplace=True)
test_df.drop('City', axis=1, inplace=True)

In [206]:
# Separate the regression target ('revenue') from the feature columns.
y = df['revenue']
# Take an explicit copy: later cells add a column to X and drop columns from it
# in place, and doing that on a slice of df triggers pandas'
# SettingWithCopyWarning. The copy makes X an independent frame.
X = df.loc[:, df.columns != 'revenue'].copy()
X.head()

Unnamed: 0,Id,Open Date,City Group,Type,P1,P2,P3,P4,P5,P6,...,P28,P29,P30,P31,P32,P33,P34,P35,P36,P37
0,0,07/17/1999,Big Cities,IL,4,5.0,4.0,4.0,2,2,...,2.0,3.0,5,3,4,5,5,4,3,4
1,1,02/14/2008,Big Cities,FC,4,5.0,4.0,4.0,1,2,...,3.0,3.0,0,0,0,0,0,0,0,0
2,2,03/09/2013,Other,IL,2,4.0,2.0,5.0,2,3,...,1.0,3.0,0,0,0,0,0,0,0,0
3,3,02/02/2012,Other,IL,6,4.5,6.0,6.0,4,4,...,2.5,7.5,25,12,10,6,18,12,12,6
4,4,05/09/2009,Other,IL,3,4.0,3.0,4.0,2,2,...,1.0,3.0,5,1,3,2,3,4,3,3


In [207]:
# Converting the open date to the number of days since opening, to get a numeric value.
def days_calculator(x, reference_date=None):
    """Return the whole number of days between an opening date and a reference moment.

    Parameters
    ----------
    x : str
        Date formatted as ``MM/DD/YYYY`` (for example ``'07/17/1999'``).
    reference_date : datetime.datetime, optional
        Moment the date is measured against. When omitted, the current local
        time (``datetime.datetime.today()``) is used, so results then depend on
        when the code runs; pass a fixed value for reproducible features.

    Returns
    -------
    int
        Absolute difference in whole days (the sign is discarded, so dates
        after ``reference_date`` also yield a non-negative count).

    Raises
    ------
    ValueError
        If ``x`` does not match the ``%m/%d/%Y`` format.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    opened_on = datetime.datetime.strptime(x, '%m/%d/%Y')
    # timedelta.days is already an int, so no extra conversion is needed.
    return abs(opened_on - reference_date).days

# Derive the numeric 'Open_days' feature from 'Open Date' in both frames.
X['Open_days'] = X['Open Date'].map(days_calculator)
test_df['Open_days'] = test_df['Open Date'].map(days_calculator)

In [208]:
# One-hot encode the categorical 'City Group' and 'Type' columns.
# Note: the test set has one more 'Type' category than the training set.
X, test_df = (
    pd.get_dummies(frame, columns=['City Group', 'Type'])
    for frame in (X, test_df)
)

In [209]:
# Drop the columns that are not used as features for the regression:
#   * 'Id' is removed from X only; test_df keeps it because the submission
#     file needs it.
#   * 'Open Date' has been replaced by the numeric 'Open_days' column.
X = X.drop(columns=['Id', 'Open Date'])

# Align the test features with the training features instead of dropping
# 'Type_MB' by name. Reindexing keeps 'Id' plus exactly the columns of X, in the
# same order: it discards test-only columns (such as 'Open Date' and the
# 'Type_MB' dummy, which the training data does not produce) and would add any
# training-only dummy column filled with 0.
test_df = test_df.reindex(columns=['Id', *X.columns], fill_value=0)

*Using GridSearchCV (grid search with cross-validation) to find the best hyperparameters*

In [210]:
# Hyper-parameter grid explored by the cross-validated search below.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.0625, 0.125, 0.25, 0.33, 0.5, 0.625, 0.875, 1],
    gamma=[0.1, 0.25, 0.5, 1, 2],
)

# Exhaustive search over the grid with 10-fold cross-validation; candidates are
# ranked by negated RMSE (higher is better).
XGB_Model1 = xgb.XGBRegressor()
XGB = GridSearchCV(
    XGB_Model1,
    params,
    cv=10,
    scoring='neg_root_mean_squared_error',
)
XGB.fit(X, y)

GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_par

In [211]:
# Show the gamma value selected by the grid search.
XGB.best_params_['gamma']

0.1

In [212]:
# Build a fresh regressor from the three tuned hyper-parameters of the best
# estimator found by the search, then train it on the full training data.
XGB_best_Model = xgb.XGBRegressor(
    **{
        name: getattr(XGB.best_estimator_, name)
        for name in ('learning_rate', 'max_depth', 'gamma')
    }
)
XGB_best_Model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.0625, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [213]:
# Predict with the fitted grid-search object; 'Id' is an identifier, not a
# feature, so it is left out of the model input.
result = XGB.predict(test_df.drop('Id', axis=1))

In [214]:
# Inspect the predictions returned by XGB.predict for the test set.
result

array([3502096.8, 3268815.5, 2817710. , ..., 2180362. , 3621647. ,
       5547767. ], dtype=float32)

In [230]:
# Assemble the submission table: one row per test 'Id' with its prediction.
result_in_submission_format = pd.DataFrame(
    {
        'Id': test_df['Id'],
        'Prediction': result,
    }
)

In [234]:
# Write the submission file without the DataFrame index column.
result_in_submission_format.to_csv(path_or_buf='Submit_final.csv', index=False)