In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting options
mpl.style.use('ggplot')
sns.set(style='whitegrid')

In [3]:
df_train= pd.read_csv(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\train.csv')
df_test=pd.read_csv(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\test.csv')

In [4]:
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 25 columns):
id        600000 non-null int64
bin_0     582106 non-null float64
bin_1     581997 non-null float64
bin_2     582070 non-null float64
bin_3     581986 non-null object
bin_4     581953 non-null object
nom_0     581748 non-null object
nom_1     581844 non-null object
nom_2     581965 non-null object
nom_3     581879 non-null object
nom_4     581965 non-null object
nom_5     582222 non-null object
nom_6     581869 non-null object
nom_7     581997 non-null object
nom_8     582245 non-null object
nom_9     581927 non-null object
ord_0     581712 non-null float64
ord_1     581959 non-null object
ord_2     581925 non-null object
ord_3     582084 non-null object
ord_4     582070 non-null object
ord_5     582287 non-null object
day       582048 non-null float64
month     582012 non-null float64
target    600000 non-null int64
dtypes: float64(6), int64(2), object(17)
memory usage: 114.4

In [5]:
print(df_train.columns)
print(df_train.head(3))
print(df_test.head(3))

#Mask the Null values to retain them during encoding
mask_train= df_train.isin(['nan'])
mask_test= df_test.isin(['nan'])

Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month',
       'target'],
      dtype='object')
   id  bin_0  bin_1  bin_2 bin_3 bin_4 nom_0      nom_1    nom_2   nom_3  ...  \
0   0    0.0    0.0    0.0     F     N   Red  Trapezoid  Hamster  Russia  ...   
1   1    1.0    1.0    0.0     F     Y   Red       Star  Axolotl     NaN  ...   
2   2    0.0    1.0    0.0     F     N   Red        NaN  Hamster  Canada  ...   

       nom_9 ord_0        ord_1     ord_2 ord_3 ord_4  ord_5  day month target  
0  02e7c8990   3.0  Contributor       Hot     c     U     Pw  6.0   3.0      0  
1  f37df64af   3.0  Grandmaster      Warm     e     X     pE  7.0   7.0      0  
2        NaN   3.0          NaN  Freezing     n     P     eN  5.0   9.0      0  

[3 rows x 25 columns]
       id  bin_0  bin_1  bin_2 bin_3 bin_4 nom_0    nom_

In [6]:
#Convert object to string
df_train[['bin_3','bin_4','nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']]= df_train[['bin_3','bin_4','nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].astype('str')

df_test[['bin_3','bin_4','nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']]= df_test[['bin_3','bin_4','nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].astype('str')


In [7]:
#Encode all the nominal features and the two string binary feature

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
df_train_enc= MultiColumnLabelEncoder(columns = [ 'bin_3','bin_4','nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']).fit_transform(df_train)
df_train_enc=pd.DataFrame(df_train_enc.where(~mask_train, other=np.nan))

df_test_enc= MultiColumnLabelEncoder(columns = [ 'bin_3','bin_4','nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']).fit_transform(df_test)
df_test_enc=pd.DataFrame(df_test_enc.where(~mask_test, other=np.nan))

In [8]:
df_train_enc.bin_4.value_counts()

0    312344
1    269609
2     18047
Name: bin_4, dtype: int64

In [9]:
#map low cardinality ordinal features
map_ord1 = {'Novice':1, 
            'Contributor':2, 
            'Expert':3, 
            'Master':4, 
            'Grandmaster':5}
df_train_enc.ord_1 = df_train_enc.ord_1.replace(map_ord1)
df_test_enc.ord_1 = df_test_enc.ord_1.replace(map_ord1)

map_ord2 = {'Freezing':1, 
            'Cold':10, 
            'Warm':25, 
            'Hot':50, 
            'Boiling Hot':100, 
            'Lava Hot':800}
df_train_enc.ord_2 = df_train_enc.ord_2.replace(map_ord2)
df_test_enc.ord_2 = df_test_enc.ord_2.replace(map_ord2)

In [10]:
#Encode high cardinality features
map_ord3 = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5, 'f':6, 'g':7, 'h':8, 'i':9, 'j':10, 'k':11, 'l':12,'m':13,'n':14, 'o':15}
df_train_enc.ord_3 = df_train_enc.ord_3.replace(map_ord3)
df_test_enc.ord_3 = df_test_enc.ord_3.replace(map_ord3)

map_ord4 = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'H':8, 'I':9, 'J':10, 'K':11, 'L':12,'M':13,'N':14, 'O':15,
           'P':16, 'Q':17, 'R':18, 'S':19, 'T':20, 'U':21, 'V':22, 'W':23, 'X':24, 'Y':25, 'Z':26}
df_train_enc.ord_4 = df_train_enc.ord_4.replace(map_ord4)
df_test_enc.ord_4 = df_test_enc.ord_4.replace(map_ord4)

In [11]:
import category_encoders as ce
df_train_enc['ord_5_enc']=df_train_enc['ord_5']
df_test_enc['ord_5_enc']=df_test_enc['ord_5']
ce_ord = ce.OrdinalEncoder(cols = ['ord_5'])
df_train_encall=ce_ord.fit_transform(df_train_enc, df_train_enc['ord_5_enc'])
df_test_encall=ce_ord.fit_transform(df_test_enc, df_test_enc['ord_5_enc'])
df_train_encall.ord_5_enc.value_counts()

Fl    10562
DN     9527
Sz     8654
RV     5648
oJ     5596
      ...  
vw      189
gV      124
vQ      120
eA       91
Zv       87
Name: ord_5_enc, Length: 190, dtype: int64

In [12]:
df_train_encall.ord_5.value_counts()
df_train_encoded= df_train_encall.drop(['target','ord_5_enc'], axis=1)
df_test_encoded= df_test_encall.drop('ord_5_enc', axis=1)

In [13]:
df_train_encoded.columns
print(df_train_encoded.head(3))
df_train_encoded.info()

   id  bin_0  bin_1  bin_2  bin_3  bin_4  nom_0  nom_1  nom_2  nom_3  ...  \
0   0    0.0    0.0    0.0      0      0      2      4      3      5  ...   
1   1    1.0    1.0    0.0      0      1      2      3      0      6  ...   
2   2    0.0    1.0    0.0      0      0      2      6      3      0  ...   

   nom_8  nom_9  ord_0  ord_1  ord_2  ord_3  ord_4  ord_5  day  month  
0      1     27    3.0    2.0   50.0    3.0   21.0      1  6.0    3.0  
1     69   2112    3.0    5.0   25.0    5.0   24.0      2  7.0    7.0  
2    102   2218    3.0    NaN    1.0   14.0   16.0      3  5.0    9.0  

[3 rows x 24 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 24 columns):
id       600000 non-null int64
bin_0    582106 non-null float64
bin_1    581997 non-null float64
bin_2    582070 non-null float64
bin_3    600000 non-null int32
bin_4    600000 non-null int32
nom_0    600000 non-null int32
nom_1    600000 non-null int32
nom_2    600000

In [14]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent') #for median imputation replace 'mean' with 'median'
imp_mean.fit(df_train_encoded)
df_train_imputed=pd.DataFrame(imp_mean.transform(df_train_encoded))

In [15]:
imp_mean2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean2.fit(df_test_encoded)
df_test_imputed=pd.DataFrame(imp_mean2.transform(df_test_encoded))

In [16]:
print(df_train_imputed.head(3))
print(df_test_imputed.head(3))

    0    1    2    3    4    5    6    7    8    9   ...     14      15   16  \
0  0.0  0.0  0.0  0.0  0.0  0.0  2.0  4.0  3.0  5.0  ...    1.0    27.0  3.0   
1  1.0  1.0  1.0  0.0  0.0  1.0  2.0  3.0  0.0  6.0  ...   69.0  2112.0  3.0   
2  2.0  0.0  1.0  0.0  0.0  0.0  2.0  6.0  3.0  0.0  ...  102.0  2218.0  3.0   

    17    18    19    20   21   22   23  
0  2.0  50.0   3.0  21.0  1.0  6.0  3.0  
1  5.0  25.0   5.0  24.0  2.0  7.0  7.0  
2  1.0   1.0  14.0  16.0  3.0  5.0  9.0  

[3 rows x 24 columns]
         0    1    2    3    4    5    6    7    8    9   ...     14      15  \
0  600000.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  3.0  ...  174.0  2194.0   
1  600001.0  0.0  0.0  0.0  0.0  1.0  2.0  0.0  4.0  5.0  ...    4.0  1105.0   
2  600002.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  5.0  ...   16.0   810.0   

    16   17     18    19    20    21   22   23  
0  3.0  1.0  100.0   6.0  21.0  49.0  3.0  9.0  
1  1.0  1.0   10.0  14.0  14.0   4.0  2.0  8.0  
2  1.0  3.0   25.0  

In [17]:
df_train_imputed.columns=df_train_encoded.columns
df_test_imputed.columns=df_test_encoded.columns

df_test_imputed.bin_3.value_counts()

0.0    244092
1.0    143957
2.0     11951
Name: bin_3, dtype: int64

In [18]:
print(df_train_imputed.dtypes)

id       float64
bin_0    float64
bin_1    float64
bin_2    float64
bin_3    float64
bin_4    float64
nom_0    float64
nom_1    float64
nom_2    float64
nom_3    float64
nom_4    float64
nom_5    float64
nom_6    float64
nom_7    float64
nom_8    float64
nom_9    float64
ord_0    float64
ord_1    float64
ord_2    float64
ord_3    float64
ord_4    float64
ord_5    float64
day      float64
month    float64
dtype: object


In [19]:
print(df_test_imputed.dtypes)
print()

id       float64
bin_0    float64
bin_1    float64
bin_2    float64
bin_3    float64
bin_4    float64
nom_0    float64
nom_1    float64
nom_2    float64
nom_3    float64
nom_4    float64
nom_5    float64
nom_6    float64
nom_7    float64
nom_8    float64
nom_9    float64
ord_0    float64
ord_1    float64
ord_2    float64
ord_3    float64
ord_4    float64
ord_5    float64
day      float64
month    float64
dtype: object



In [20]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df_train_imputed,df_train_encall.target, test_size=0.25, random_state=420)

In [20]:
from imblearn.over_sampling import SMOTENC
smote_nc = SMOTENC(categorical_features=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23], random_state=420)
X_train_bl, y_train_bl = smote_nc.fit_resample(X_train, y_train)

Using TensorFlow backend.


In [21]:
np.save(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\imp_bal_X.npy', X_train_bl)
np.save(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\imp_bal_Y.npy', y_train_bl)

In [21]:
X_train_bl= np.load(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\imp_bal_X.npy', allow_pickle=True) 
y_train_bl=np.load(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\imp_bal_Y.npy', allow_pickle=True)

In [22]:
df_train2=pd.DataFrame(X_train_bl)
df_train2.columns= df_train_imputed.columns
trn= pd.get_dummies(df_train2,columns = [ 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'ord_0', 'ord_1', 'ord_2'],drop_first=True)
val= pd.get_dummies(X_val,columns = [ 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'ord_0', 'ord_1', 'ord_2'],drop_first=True)

In [23]:
print(trn.shape)
print(val.shape)
print(val.columns)

(731238, 54)
(150000, 54)
Index(['id', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_3', 'ord_4',
       'ord_5', 'day', 'month', 'bin_0_1.0', 'bin_1_1.0', 'bin_2_1.0',
       'bin_3_1.0', 'bin_3_2.0', 'bin_4_1.0', 'bin_4_2.0', 'nom_0_1.0',
       'nom_0_2.0', 'nom_0_3.0', 'nom_1_1.0', 'nom_1_2.0', 'nom_1_3.0',
       'nom_1_4.0', 'nom_1_5.0', 'nom_1_6.0', 'nom_2_1.0', 'nom_2_2.0',
       'nom_2_3.0', 'nom_2_4.0', 'nom_2_5.0', 'nom_2_6.0', 'nom_3_1.0',
       'nom_3_2.0', 'nom_3_3.0', 'nom_3_4.0', 'nom_3_5.0', 'nom_3_6.0',
       'nom_4_1.0', 'nom_4_2.0', 'nom_4_3.0', 'nom_4_4.0', 'ord_0_2.0',
       'ord_0_3.0', 'ord_1_2.0', 'ord_1_3.0', 'ord_1_4.0', 'ord_1_5.0',
       'ord_2_10.0', 'ord_2_25.0', 'ord_2_50.0', 'ord_2_100.0', 'ord_2_800.0'],
      dtype='object')


In [24]:
test=pd.get_dummies(df_test_imputed,columns = [ 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'ord_0', 'ord_1', 'ord_2'],drop_first=True)

In [25]:
print(test.shape)
print(test.columns)

(400000, 54)
Index(['id', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_3', 'ord_4',
       'ord_5', 'day', 'month', 'bin_0_1.0', 'bin_1_1.0', 'bin_2_1.0',
       'bin_3_1.0', 'bin_3_2.0', 'bin_4_1.0', 'bin_4_2.0', 'nom_0_1.0',
       'nom_0_2.0', 'nom_0_3.0', 'nom_1_1.0', 'nom_1_2.0', 'nom_1_3.0',
       'nom_1_4.0', 'nom_1_5.0', 'nom_1_6.0', 'nom_2_1.0', 'nom_2_2.0',
       'nom_2_3.0', 'nom_2_4.0', 'nom_2_5.0', 'nom_2_6.0', 'nom_3_1.0',
       'nom_3_2.0', 'nom_3_3.0', 'nom_3_4.0', 'nom_3_5.0', 'nom_3_6.0',
       'nom_4_1.0', 'nom_4_2.0', 'nom_4_3.0', 'nom_4_4.0', 'ord_0_2.0',
       'ord_0_3.0', 'ord_1_2.0', 'ord_1_3.0', 'ord_1_4.0', 'ord_1_5.0',
       'ord_2_10.0', 'ord_2_25.0', 'ord_2_50.0', 'ord_2_100.0', 'ord_2_800.0'],
      dtype='object')


In [26]:
train= trn.drop('id',axis=1)
val= val.drop('id',axis=1)
test= test.drop('id',axis=1)
print(train.shape)
print(val.shape)
print(test.shape)
#print(train.columns)

(731238, 53)
(150000, 53)
(400000, 53)


In [27]:
from sklearn.preprocessing import StandardScaler
SC= StandardScaler()
train_sc = SC.fit_transform(train)
val_sc=SC.fit_transform(val)
test_sc=SC.fit_transform(test)


In [28]:
train_gcv=pd.DataFrame(train_sc)
train_gcv.columns=train.columns
val_gcv=pd.DataFrame(val_sc)
val_gcv.columns=val.columns
test_gcv=pd.DataFrame(test_sc)
test_gcv.columns=test.columns
target=pd.Series(y_train_bl).rename('target')

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_model = xgb.XGBClassifier()
param2 = {'max_depth':[10,20,30], 'subsample':[0.7], 'booster':['dart'], 'tree_method': ['hist'],
         'rate_drop': [0.1],'gamma':[0.01,0.1,1],'learning_rate': [0.2],
          'min_child_weight': [5,10,30],'n_estimators': [50], 'seed':[420]}
fit_params={"early_stopping_rounds":[20], "eval_metric":["auc"], "eval_set" : [[val_gcv, y_val]]}   
#model2= xgb.train(dtrain=dtrain,params= param2,num_boost_round=250,early_stopping_rounds=20,
                        #evals= [(dval, 'eval'), (dtrain, 'train')], verbose_eval=10 )

clf = GridSearchCV(xgb_model, param2, verbose=2, refit=True)

In [174]:
clf.fit(train_gcv, target)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist 
[CV]  booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist, total= 3.9min
[CV] booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.9min remaining:    0.0s


[CV]  booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist, total= 4.1min
[CV] booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist 
[CV]  booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist, total= 3.9min
[CV] booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist 
[CV]  booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist, total= 2.6min
[CV] booster=dart, gamma=0.01, learning_rate=0.2, max_depth=10, min_child_weight=5, n_estimators=50, rate_drop=0.1, seed=420, subsample=0.7, tree_method=hist 
[

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 912.7min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale...
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'booster': ['dart'], 'gamma': [0.01, 0.1, 1],
                         'learning_rate': [0.2], 'max_depth': [10, 20, 30],
                         'min_child_weight': [5, 10, 30], 'n_e

print(clf.best_params_)

In [38]:
import gc
gc.collect()

337

In [None]:
import xgboost as xgb
dtrain = xgb.DMatrix(train_sc, label=y_train_bl)
dval = xgb.DMatrix(val_sc, label=y_val.values)
dtest = xgb.DMatrix(test_sc)


param = {'max_depth':30, 'silent':1, 'objective':'binary:logistic', 'subsample':0.7,"booster": 'dart',  'tree_method': 'hist', 
         'sample_type': 'weighted', 'eval_metric':["error","auc"],'learning_rate': 0.25, 'n_estimator':200,
         'rate_drop': 0.1, 'seed':420}

model= xgb.train(dtrain=dtrain,params= param,num_boost_round=250,early_stopping_rounds=20,
                        evals= [(dval, 'eval'), (dtrain, 'train')], verbose_eval=10 )

[0]	eval-error:0.333327	eval-auc:0.601355	train-error:0.128578	train-auc:0.924037
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 20 rounds.
[10]	eval-error:0.27676	eval-auc:0.645921	train-error:0.026636	train-auc:0.996286
[20]	eval-error:0.272	eval-auc:0.653235	train-error:0.015375	train-auc:0.998715
[30]	eval-error:0.26492	eval-auc:0.661881	train-error:0.007995	train-auc:0.99947


In [31]:
y_pred = model.predict(dtest, ntree_limit= 250)

In [32]:
result= pd.concat([pd.Series(y_pred), df_test.id],axis=1)

In [33]:
print(result)

               0      id
0       0.207773  600000
1       0.590388  600001
2       0.585616  600002
3       0.073573  600003
4       0.245190  600004
...          ...     ...
399995  0.667515  999995
399996  0.440480  999996
399997  0.852044  999997
399998  0.726058  999998
399999  0.250907  999999

[400000 rows x 2 columns]


In [63]:
result_xgb= result.values
np.save(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\xgb2.npy', result_xgb)

In [51]:

submission =pd.DataFrame(np.load(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\xgb.npy', allow_pickle=True), columns= ('target', 'id'))

In [56]:
submission.id= submission.id.astype(int)
print(submission)
submission.info()

          target      id
0       0.001812  600000
1       0.308687  600001
2       0.806775  600002
3       0.014121  600003
4       0.002912  600004
...          ...     ...
399995  0.846877  999995
399996  0.077470  999996
399997  0.904900  999997
399998  0.146185  999998
399999  0.327749  999999

[400000 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 2 columns):
target    400000 non-null float64
id        400000 non-null int32
dtypes: float64(1), int32(1)
memory usage: 4.6 MB


In [57]:
submission.to_csv(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\Submission_Xgb', index=False)

In [34]:
result.to_csv(r'C:\Users\Alvi Mahmud\Desktop\BAN Sp20\Kag\Submission_Xgb3.csv', index=False)