In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train_data=pd.read_csv(r'../input/cross-sale-prediction/train.csv')
test_data=pd.read_csv(r'../input/cross-sale-prediction/test.csv')

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [4]:
train_data['Response'].value_counts()

0    334399
1     46710
Name: Response, dtype: int64

In [5]:
df=train_data[train_data['Response']==1]

In [6]:
df1=train_data[train_data['Response']==0]

In [7]:
df2=df1[0:46710]

In [8]:
df2.shape

(46710, 12)

In [9]:
cross_train=pd.concat([df,df2],axis='rows')

In [10]:
cross_train.shape

(93420, 12)

In [11]:
cross_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93420 entries, 0 to 53280
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    93420 non-null  int64  
 1   Gender                93420 non-null  object 
 2   Age                   93420 non-null  int64  
 3   Driving_License       93420 non-null  int64  
 4   Region_Code           93420 non-null  float64
 5   Previously_Insured    93420 non-null  int64  
 6   Vehicle_Age           93420 non-null  object 
 7   Vehicle_Damage        93420 non-null  object 
 8   Annual_Premium        93420 non-null  float64
 9   Policy_Sales_Channel  93420 non-null  float64
 10  Vintage               93420 non-null  int64  
 11  Response              93420 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 9.3+ MB


In [12]:
# lets remove id as it has no contribution in model building
cross_train.drop('id',axis=1,inplace=True)

In [13]:
cross_train=pd.get_dummies(cross_train,drop_first=True)

In [14]:
test_data=pd.get_dummies(test_data,drop_first=True)

In [15]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127037 entries, 0 to 127036
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     127037 non-null  int64  
 1   Age                    127037 non-null  int64  
 2   Driving_License        127037 non-null  int64  
 3   Region_Code            127037 non-null  float64
 4   Previously_Insured     127037 non-null  int64  
 5   Annual_Premium         127037 non-null  float64
 6   Policy_Sales_Channel   127037 non-null  float64
 7   Vintage                127037 non-null  int64  
 8   Gender_Male            127037 non-null  uint8  
 9   Vehicle_Age_< 1 Year   127037 non-null  uint8  
 10  Vehicle_Age_> 2 Years  127037 non-null  uint8  
 11  Vehicle_Damage_Yes     127037 non-null  uint8  
dtypes: float64(3), int64(5), uint8(4)
memory usage: 8.2 MB


In [16]:
#lets drop id column from test_data
test_data.drop('id',axis=1,inplace=True)

In [17]:
cross_train.columns=['Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response',
       'Gender_Male', 'Vehicle_Age_1Year', 'Vehicle_Age_2Years',
       'Vehicle_Damage_Yes']

In [18]:
test_data.columns=['Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Gender_Male',
       'Vehicle_Age_1Year', 'Vehicle_Age_2Years', 'Vehicle_Damage_Yes']

In [19]:
from sklearn.model_selection import train_test_split
t1,t2=train_test_split(cross_train,test_size=0.2,random_state=123)

In [20]:
x_train=t1.drop('Response',axis=1)
y_train=t1['Response']
x_test=t2.drop('Response',axis=1)
y_test=t2['Response']

In [21]:
from xgboost.sklearn import XGBClassifier
xgb_param={'n_estimators':[20,30,40,80,100,200,300,400,600]}
xgb=XGBClassifier(colsample_bytree=0.8,colsample_bylevel=0.8,subsample=0.8)

In [22]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] ==i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
            results['mean_test_score'][candidate],
            results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [23]:
from sklearn.model_selection import RandomizedSearchCV
random_search=RandomizedSearchCV(xgb,param_distributions=xgb_param,cv=5,scoring='roc_auc',n_jobs=-1,verbose=2,n_iter=30)

In [24]:
random_search.fit(x_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  2.9min finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=0.8,
                                           colsample_bynode=None,
                                           colsample_bytree=0.8, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                            

In [25]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.85549 (std: 0.00177)
Parameters: {'n_estimators': 20}

Model with rank: 2
Mean validation score: 0.85533 (std: 0.00195)
Parameters: {'n_estimators': 30}

Model with rank: 3
Mean validation score: 0.85467 (std: 0.00206)
Parameters: {'n_estimators': 40}

Model with rank: 4
Mean validation score: 0.85230 (std: 0.00147)
Parameters: {'n_estimators': 80}

Model with rank: 5
Mean validation score: 0.85158 (std: 0.00178)
Parameters: {'n_estimators': 100}



In [26]:
xgb_param1={'gamma':[5,6,7,8],
           'max_depth':[5,6,7],
           'min_child_weight':[0.3,0.5,0.8,1]}

In [27]:
xgb2=XGBClassifier(n_estimators=20,subsample=0.8,colsample_bytree=0.8,colsample_bylevel=0.8)
from sklearn.model_selection import GridSearchCV
random=GridSearchCV(xgb2,param_grid=xgb_param1,cv=5,scoring='roc_auc',n_jobs=-1,verbose=2)

In [28]:
random.fit(x_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.6min finished


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=0.8,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=20, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=0.8,
                                     tree_method=None, val

In [29]:
report(random.cv_results_,5)

Model with rank: 1
Mean validation score: 0.85631 (std: 0.00132)
Parameters: {'gamma': 7, 'max_depth': 6, 'min_child_weight': 0.3}

Model with rank: 2
Mean validation score: 0.85631 (std: 0.00131)
Parameters: {'gamma': 7, 'max_depth': 6, 'min_child_weight': 0.5}

Model with rank: 3
Mean validation score: 0.85629 (std: 0.00132)
Parameters: {'gamma': 7, 'max_depth': 6, 'min_child_weight': 0.8}

Model with rank: 4
Mean validation score: 0.85621 (std: 0.00139)
Parameters: {'gamma': 7, 'max_depth': 6, 'min_child_weight': 1}

Model with rank: 5
Mean validation score: 0.85617 (std: 0.00104)
Parameters: {'gamma': 7, 'max_depth': 7, 'min_child_weight': 0.8}



In [30]:
xgb3=XGBClassifier(n_estimators=20,subsample=0.8,colsample_bytree=0.8,colsample_bylevel=0.8,min_child_weight=0.3,max_depth=6,gamma=7)

In [31]:
xgb_param2={'max_delta_step':[3,4,5,6,9],
           'scale_pos_weight':[1,3,4,5,6,7]}

In [32]:
random2=RandomizedSearchCV(xgb3,param_distributions=xgb_param2,cv=5,n_iter=20,scoring='roc_auc',n_jobs=-1,verbose=2)

In [33]:
random2.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   41.3s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=0.8,
                                           colsample_bynode=None,
                                           colsample_bytree=0.8, gamma=7,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=6,
                                           min_child_weight=0.3, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=20, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                                    

In [34]:
report(random2.cv_results_,5)

Model with rank: 1
Mean validation score: 0.85631 (std: 0.00132)
Parameters: {'scale_pos_weight': 1, 'max_delta_step': 3}

Model with rank: 1
Mean validation score: 0.85631 (std: 0.00132)
Parameters: {'scale_pos_weight': 1, 'max_delta_step': 9}

Model with rank: 1
Mean validation score: 0.85631 (std: 0.00132)
Parameters: {'scale_pos_weight': 1, 'max_delta_step': 5}

Model with rank: 4
Mean validation score: 0.85615 (std: 0.00150)
Parameters: {'scale_pos_weight': 3, 'max_delta_step': 6}

Model with rank: 4
Mean validation score: 0.85615 (std: 0.00150)
Parameters: {'scale_pos_weight': 3, 'max_delta_step': 5}



In [35]:
xgb_param3={'subsample':[i/10 for i in range(5,11)],
           'colsample_bytree':[i/10 for i in range(5,11)],
           'colsample_bylevel':[i/10 for i in range(5,11)]}

In [36]:
xgb4=XGBClassifier(n_estimators=20,min_child_weight=0.5,max_depth=5,gamma=8,scale_pos_weight=1,max_delta_step=5)

In [37]:
random2=RandomizedSearchCV(xgb4,param_distributions=xgb_param3,cv=5,n_iter=20,scoring='roc_auc',n_jobs=-1,verbose=2)

In [38]:
random2.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   34.7s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=8,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None, max_delta_step=5,
                                           max_depth=5, min_child_weight=0.5,
                                           missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=20, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                                     

In [39]:
report(random2.cv_results_,3)

Model with rank: 1
Mean validation score: 0.85628 (std: 0.00193)
Parameters: {'subsample': 1.0, 'colsample_bytree': 0.6, 'colsample_bylevel': 0.9}

Model with rank: 2
Mean validation score: 0.85621 (std: 0.00123)
Parameters: {'subsample': 0.9, 'colsample_bytree': 0.6, 'colsample_bylevel': 1.0}

Model with rank: 3
Mean validation score: 0.85610 (std: 0.00169)
Parameters: {'subsample': 0.8, 'colsample_bytree': 0.7, 'colsample_bylevel': 1.0}



In [40]:
xgb_param4={'reg_lamda':[i/10 for i in range(0,50)],
           'reg_alpha':[i/10 for i in range(0,50)]}

In [41]:
xgb5=XGBClassifier(n_estimators=20,min_child_weight=0.5,max_depth=5,gamma=8,scale_pos_weight=1,max_delta_step=9,subsample=1.0,colsample_bytree=0.6,colsample_bylevel=0.9)
random3=RandomizedSearchCV(xgb5,param_distributions=xgb_param4,cv=5,n_iter=20,scoring='roc_auc',n_jobs=-1,verbose=2)

In [42]:
random3.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   32.0s finished


Parameters: { reg_lamda } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=0.9,
                                           colsample_bynode=None,
                                           colsample_bytree=0.6, gamma=8,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None, max_delta_step=9,
                                           max_depth=5, min_child_weight=0.5,
                                           missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=20, n_jobs=None,
                                           n...
                                           verbosity=None),
                   n_iter=20, n_jobs=-1,
                   param_distributi

In [43]:
report(random3.cv_results_,3)

Model with rank: 1
Mean validation score: 0.85631 (std: 0.00162)
Parameters: {'reg_lamda': 2.1, 'reg_alpha': 0.9}

Model with rank: 2
Mean validation score: 0.85625 (std: 0.00165)
Parameters: {'reg_lamda': 4.0, 'reg_alpha': 1.8}

Model with rank: 3
Mean validation score: 0.85624 (std: 0.00155)
Parameters: {'reg_lamda': 3.0, 'reg_alpha': 1.0}

Model with rank: 3
Mean validation score: 0.85624 (std: 0.00155)
Parameters: {'reg_lamda': 2.8, 'reg_alpha': 1.0}

Model with rank: 3
Mean validation score: 0.85624 (std: 0.00155)
Parameters: {'reg_lamda': 4.6, 'reg_alpha': 1.0}



In [44]:
xgb6=XGBClassifier(n_estimators=20,min_child_weight=0.5,max_depth=5,gamma=8,scale_pos_weight=1,max_delta_step=9,subsample=1.0,colsample_bytree=0.9,colsample_bylevel=0.7,reg_lamda=2.1,reg_alpha=0.9)

In [45]:
xgb6.fit(x_train,y_train)

Parameters: { reg_lamda } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
              colsample_bynode=1, colsample_bytree=0.9, gamma=8, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=9, max_depth=5,
              min_child_weight=0.5, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0.9, reg_lambda=1, reg_lamda=2.1, scale_pos_weight=1,
              subsample=1.0, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [46]:
xgb6.score(x_test,y_test)

0.7986512524084779

In [48]:
prediction=xgb6.predict(test_data)

In [49]:
pd.DataFrame(prediction).to_csv('submission.csv')

In [50]:
import pickle

In [None]:
file=open('Xgboost_prediction.pkl','wb')
pickle.dump(xgb6,file)