In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows',None)

In [39]:
import re
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score

In [40]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [41]:
from utils import (load_dataset,save_dataset)

In [42]:
# Read the train and test splits through the project loader, then report
# the (rows, columns) shape of each, one per line.
df_train = load_dataset('Train')
df_test = load_dataset('Test')
print(df_train.shape, df_test.shape, sep='\n')

(12666, 7)
(29555, 6)


In [43]:
# Show column dtypes, non-null counts and memory usage of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12666 entries, 0 to 12665
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Area_Code      12666 non-null  int64  
 1   Locality_Code  12666 non-null  int64  
 2   Region_Code    12666 non-null  int64  
 3   Height         12666 non-null  float64
 4   Diameter       12666 non-null  float64
 5   Class          12666 non-null  int64  
 6   Species        12666 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 692.8 KB


In [44]:
# Count missing values per column in the training data.
df_train.isnull().sum()

Area_Code        0
Locality_Code    0
Region_Code      0
Height           0
Diameter         0
Class            0
Species          0
dtype: int64

In [45]:
# Count missing values per column in the test data.
df_test.isnull().sum()

Area_Code        0
Locality_Code    0
Region_Code      0
Height           0
Diameter         0
Species          0
dtype: int64

In [46]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,Area_Code,Locality_Code,Region_Code,Height,Diameter,Class,Species
0,4694,17,1609,3.0,5.0,7,48
1,781,7,1380,4.0,17.0,2,54
2,21014,6,1122,3.0,11.0,1,22
3,7326,6,556,3.0,34.0,7,25
4,13122,17,1752,5.0,16.0,2,128


In [47]:
# Tag every test row with Class = -1. Later cells filter on this sentinel
# to separate train and test rows again after they have been concatenated.
df_test['Class'] = -1

In [48]:
# Stack train and test row-wise into one frame; ignore_index=True discards
# the original indices and assigns a fresh 0..n-1 index.
df_full = pd.concat([df_train,df_test],axis=0,ignore_index=True)

In [49]:
def generate_key(df_data):
    """Return a copy of ``df_data`` with a composite string ``key`` column added.

    The key is the string form of Area_Code, Locality_Code, Region_Code,
    Height, Diameter and Species (in that order) joined with underscores.
    The input frame is not modified.
    """
    key_fields = ['Area_Code', 'Locality_Code', 'Region_Code',
                  'Height', 'Diameter', 'Species']

    keyed = df_data.copy()

    # Build the key left to right: first field, then "_<next field>" for each remaining one.
    composite = keyed[key_fields[0]].astype(str)
    for field in key_fields[1:]:
        composite = composite + '_' + keyed[field].astype(str)

    keyed['key'] = composite
    return keyed

In [50]:
# Rebind df_full to a copy that carries the composite 'key' column.
df_full = generate_key(df_full)

In [51]:
# Take an independent copy of df_full under the name the following cells use.
df_full_treated = df_full.copy()

In [52]:
# Name of the label column, and the feature columns fed to the model:
# everything except the label itself and the string 'key' helper column.
target = 'Class'
train_cols = [col for col in df_full_treated.columns if col not in (target, 'key')]

In [53]:
# Split the combined frame back apart using the -1 sentinel placed on test rows.
df_train_treated = df_full_treated.loc[df_full_treated['Class'].ne(-1)].copy()
df_test_treated = df_full_treated.loc[df_full_treated['Class'].eq(-1)].copy()

In [54]:
def get_key_download_map(df_tr,df_ts):
    """Map each key present in both frames to its ``Class`` value in ``df_tr``.

    Parameters
    ----------
    df_tr : DataFrame with 'key' and 'Class' columns.
    df_ts : DataFrame with a 'key' column.

    Returns
    -------
    dict
        ``{key: Class}`` for every key of ``df_tr`` that also occurs in
        ``df_ts``. If a key occurs on several rows of ``df_tr``, the Class of
        the last such row is the one kept.
    """
    shared_keys = set(df_tr['key'].values) & set(df_ts['key'].values)
    overlap = df_tr.loc[df_tr['key'].isin(list(shared_keys)), ['key','Class']]
    return overlap.set_index('key')['Class'].to_dict()

In [55]:
# Training labels for keys that occur in both the train and the test rows.
map_dict = get_key_download_map(df_train_treated,df_test_treated)

In [56]:
#len(map_dict)

In [57]:
# XGBoost classifier with 40 boosting rounds and a fixed seed for repeatable runs.
model = XGBClassifier(random_state=0,n_estimators=40)
#model = CatBoostClassifier(random_state=0)

In [58]:
# Fit on every training row; no validation split is held out here.
model.fit(df_train_treated[train_cols],df_train_treated[target])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=40, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [59]:
# Hard class predictions on the same rows the model was fitted on.
y_pred_train = model.predict(df_train_treated[train_cols])

In [60]:
# Accuracy on the fitting data itself, so this is not a held-out estimate.
accuracy_score(df_train_treated[target],y_pred_train)

0.8063319122059056

In [61]:
# Per-class probabilities on the same rows the model was fitted on.
y_pred_prob_train = model.predict_proba(df_train_treated[train_cols])

In [62]:
# Log loss on the fitting data itself, so this is not a held-out estimate.
log_loss(df_train_treated[target],y_pred_prob_train)

0.5243942278000038

In [63]:
# Per-class probabilities for the test rows, wrapped in a DataFrame with
# default integer column labels and a default 0..n-1 row index.
test_preds = pd.DataFrame(model.predict_proba(df_test_treated[train_cols]))

In [64]:
# Hard class predictions for the test rows.
y_pred_act_test = model.predict(df_test_treated[train_cols])

In [65]:
#save_dataset(test_preds,name='BASE_MODEL_CAT')

In [66]:
# MapClass: the training label looked up by key; rows whose key is not in
# map_dict get NaN.
df_test_treated['MapClass'] = df_test_treated['key'].map(map_dict)
# Replace the -1 sentinel with the model's predicted class.
df_test_treated['Class'] = y_pred_act_test

In [67]:
# Put the test rows and their predicted probabilities side by side.
# The test rows still carry their index from the combined frame, so it is
# reset to 0..n-1 first; otherwise the column-wise concat would not line up
# with test_preds' default index.
df_add = pd.concat([df_test_treated.reset_index(drop=True),test_preds],axis=1)
df_add.head()

Unnamed: 0,Area_Code,Locality_Code,Region_Code,Height,Diameter,Class,Species,key,MapClass,0,1,2,3,4,5,6,7
0,25836,4,903,7.5,8.0,6,14,25836_4_903_7.5_8.0_14,6.0,0.001443,0.013379,0.247352,0.010755,0.000258,0.000149,0.721613,0.005051
1,28544,14,1576,3.0,7.0,7,48,28544_14_1576_3.0_7.0_48,7.0,0.005527,0.111644,0.015421,0.020487,0.002277,0.000402,0.03587,0.808372
2,7037,11,350,15.0,57.0,3,56,7037_11_350_15.0_57.0_56,,0.002109,0.000275,0.00519,0.897162,0.000156,0.007548,0.087058,0.000502
3,20460,17,1208,3.0,6.0,7,180,20460_17_1208_3.0_6.0_180,,0.004168,0.066195,0.007946,0.00206,0.000111,7.1e-05,0.001871,0.917578
4,17555,14,994,4.5,22.0,6,225,17555_14_994_4.5_22.0_225,,0.001507,0.025973,0.435505,0.043815,0.000266,0.000297,0.490529,0.002108


In [68]:
# Map each class label (in sorted order) to the position of its probability
# column. The mapping is built from however many classes are present in the
# training data instead of assuming exactly 8: the previous range(0, 8) would
# raise IndexError with fewer classes and silently drop the extras with more.
pred_cols = sorted(df_train_treated['Class'].unique())
pred_map = {label: position for position, label in enumerate(pred_cols)}
pred_map

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7}

In [69]:
# Integer labels of the probability columns, one per column of test_preds.
# Derived from the prediction table's width rather than a hard-coded 8 so it
# follows the actual number of classes the model outputs.
columns = list(range(test_preds.shape[1]))

In [70]:
# First 100 test rows that have a known training label: predicted class,
# mapped class and the per-class probabilities, selected in a single .loc call.
df_add.loc[df_add['MapClass'].notnull(), ['Class','MapClass']+columns].head(100)

Unnamed: 0,Class,MapClass,0,1,2,3,4,5,6,7
0,6,6.0,0.001443,0.013379,0.247352,0.010755,0.000258,0.000149,0.721613,0.005051
1,7,7.0,0.005527,0.111644,0.015421,0.020487,0.002277,0.000402,0.03587,0.808372
12,6,6.0,0.000409,0.00216,0.041915,0.005033,5.6e-05,0.000189,0.949789,0.000449
94,6,6.0,0.002137,0.036642,0.086103,0.071034,0.000392,0.000278,0.796324,0.007089
98,3,3.0,0.00181,0.000147,0.0012,0.960947,9.2e-05,0.009342,0.025962,0.0005
110,6,6.0,0.000565,0.000439,0.015466,0.023508,6e-05,8.5e-05,0.959503,0.000375
337,3,3.0,0.00258,0.014278,0.16133,0.563444,0.000275,0.013339,0.243022,0.001732
362,0,0.0,0.954222,0.001095,0.005544,0.013921,0.007387,0.000652,0.011378,0.005802
385,2,7.0,0.017226,0.293327,0.375836,0.013543,0.000706,0.000386,0.124624,0.174352
394,0,0.0,0.936918,0.010482,0.020088,0.003239,0.000138,9.5e-05,0.022445,0.006596


In [71]:
# Cast the predicted Class to float (MapClass is float as well, since it holds NaN).
df_add['Class'] = df_add['Class'].astype(float)

In [72]:
# For test rows whose key was also seen in training, overwrite the model's
# probabilities: the mapped class gets 0.99999 and every other class gets 0.
# Rows without a mapped class (MapClass is NaN) keep the model's output.
#
# The probability columns come from pred_map instead of a hard-coded
# range(0, 8), so this stays consistent with the classes actually present.
prob_cols = list(pred_map.values())
for col in df_add['MapClass'].unique():
    if pd.isnull(col):
        continue
    cols_to_one = pred_map[col]
    # Row mask for this class, computed once rather than once per column.
    is_mapped = df_add['MapClass']==col
    for z in prob_cols:
        df_add[z] = np.where(is_mapped, 0.99999 if z==cols_to_one else 0, df_add[z])

In [73]:
# Keep only the per-class probability columns for the submission table.
# Reuses the `columns` list defined earlier instead of repeating the
# hard-coded class count a third time.
df_x = df_add[columns].copy()

In [74]:
# Persist the adjusted probability table through the project helper.
save_dataset(df_x,name='BASE_MODEL_XGB_KEY_MAP')

In [75]:
# Inspect dtypes and non-null counts of the combined test/prediction frame.
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29555 entries, 0 to 29554
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Area_Code      29555 non-null  int64  
 1   Locality_Code  29555 non-null  int64  
 2   Region_Code    29555 non-null  int64  
 3   Height         29555 non-null  float64
 4   Diameter       29555 non-null  float64
 5   Class          29555 non-null  float64
 6   Species        29555 non-null  int64  
 7   key            29555 non-null  object 
 8   MapClass       1353 non-null   float64
 9   0              29555 non-null  float32
 10  1              29555 non-null  float32
 11  2              29555 non-null  float32
 12  3              29555 non-null  float32
 13  4              29555 non-null  float32
 14  5              29555 non-null  float32
 15  6              29555 non-null  float32
 16  7              29555 non-null  float32
dtypes: float32(8), float64(4), int64(4), object(1)
mem