In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows',None)

In [2]:
import re
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score

In [3]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier

In [4]:
from utils import (load_dataset,save_dataset)

In [5]:
df_train = load_dataset('Train')
df_test = load_dataset('Test')
#df_sub = load_dataset('Sample_Submission')
print(df_train.shape)
print(df_test.shape)

(12666, 7)
(29555, 6)


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12666 entries, 0 to 12665
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Area_Code      12666 non-null  int64  
 1   Locality_Code  12666 non-null  int64  
 2   Region_Code    12666 non-null  int64  
 3   Height         12666 non-null  float64
 4   Diameter       12666 non-null  float64
 5   Class          12666 non-null  int64  
 6   Species        12666 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 692.8 KB


In [7]:
df_train.isnull().sum()

Area_Code        0
Locality_Code    0
Region_Code      0
Height           0
Diameter         0
Class            0
Species          0
dtype: int64

In [8]:
df_test.isnull().sum()

Area_Code        0
Locality_Code    0
Region_Code      0
Height           0
Diameter         0
Species          0
dtype: int64

In [9]:
df_train.head()

Unnamed: 0,Area_Code,Locality_Code,Region_Code,Height,Diameter,Class,Species
0,4694,17,1609,3.0,5.0,7,48
1,781,7,1380,4.0,17.0,2,54
2,21014,6,1122,3.0,11.0,1,22
3,7326,6,556,3.0,34.0,7,25
4,13122,17,1752,5.0,16.0,2,128


In [10]:
df_test['Class'] = -1
df_full = pd.concat([df_train,df_test],axis=0,ignore_index=True)

In [11]:
len(df_full)

42221

In [12]:
df_full['Area_Code_Locality_Code'] = df_full['Area_Code'].astype(str) + df_full['Locality_Code'].astype(str)
df_full['Region_Code_Locality_Code'] = df_full['Region_Code'].astype(str) + df_full['Locality_Code'].astype(str)
df_full['Area_Code_Region_Code'] = df_full['Area_Code'].astype(str) + df_full['Region_Code'].astype(str)

In [13]:
def create_maps_and_add_cols(df_data):
    df = df_data.copy()
    ops = ['min', 'max','mean','std']
    
    key_list = ['Area_Code_Locality_Code','Region_Code_Locality_Code','Area_Code_Region_Code']
    var_list = ['Height','Diameter']
    
    for key in key_list:
        for var in var_list:
            df_map = df.groupby(key)[var].agg(ops)
            df_map.reset_index(inplace=True)
            df_map.set_index(key,inplace=True)
            df_map.columns = [key + '_' + var + '_' + x for x in ops]
            df_map.reset_index(inplace=True)
            df = pd.merge(df,df_map,how='left',on=key)
    df['HeightDiameter'] = df['Height'] / df['Diameter']
    df['HeightS2'] = df['Height']*df['Height']
    df['HeightS2Diameter'] = df['HeightS2'] / df['Diameter']
    
    return df

In [14]:
df_full_treated = create_maps_and_add_cols(df_full)
df_full_treated.head()

Unnamed: 0,Area_Code,Locality_Code,Region_Code,Height,Diameter,Class,Species,Area_Code_Locality_Code,Region_Code_Locality_Code,Area_Code_Region_Code,...,Area_Code_Region_Code_Height_max,Area_Code_Region_Code_Height_mean,Area_Code_Region_Code_Height_std,Area_Code_Region_Code_Diameter_min,Area_Code_Region_Code_Diameter_max,Area_Code_Region_Code_Diameter_mean,Area_Code_Region_Code_Diameter_std,HeightDiameter,HeightS2,HeightS2Diameter
0,4694,17,1609,3.0,5.0,7,48,469417,160917,46941609,...,12.5,4.051282,2.229571,1.0,100.0,20.102564,15.862559,0.6,9.0,1.8
1,781,7,1380,4.0,17.0,2,54,7817,13807,7811380,...,4.0,4.0,,17.0,17.0,17.0,,0.235294,16.0,0.941176
2,21014,6,1122,3.0,11.0,1,22,210146,11226,210141122,...,3.0,3.0,,11.0,11.0,11.0,,0.272727,9.0,0.818182
3,7326,6,556,3.0,34.0,7,25,73266,5566,7326556,...,3.0,3.0,,34.0,34.0,34.0,,0.088235,9.0,0.264706
4,13122,17,1752,5.0,16.0,2,128,1312217,175217,131221752,...,5.0,5.0,,16.0,16.0,16.0,,0.3125,25.0,1.5625


In [15]:
df_full_treated.isnull().sum()

Area_Code                                      0
Locality_Code                                  0
Region_Code                                    0
Height                                         0
Diameter                                       0
Class                                          0
Species                                        0
Area_Code_Locality_Code                        0
Region_Code_Locality_Code                      0
Area_Code_Region_Code                          0
Area_Code_Locality_Code_Height_min             0
Area_Code_Locality_Code_Height_max             0
Area_Code_Locality_Code_Height_mean            0
Area_Code_Locality_Code_Height_std         28723
Area_Code_Locality_Code_Diameter_min           0
Area_Code_Locality_Code_Diameter_max           0
Area_Code_Locality_Code_Diameter_mean          0
Area_Code_Locality_Code_Diameter_std       28723
Region_Code_Locality_Code_Height_min           0
Region_Code_Locality_Code_Height_max           0
Region_Code_Locality

In [16]:
df_full_treated.columns

Index(['Area_Code', 'Locality_Code', 'Region_Code', 'Height', 'Diameter',
       'Class', 'Species', 'Area_Code_Locality_Code',
       'Region_Code_Locality_Code', 'Area_Code_Region_Code',
       'Area_Code_Locality_Code_Height_min',
       'Area_Code_Locality_Code_Height_max',
       'Area_Code_Locality_Code_Height_mean',
       'Area_Code_Locality_Code_Height_std',
       'Area_Code_Locality_Code_Diameter_min',
       'Area_Code_Locality_Code_Diameter_max',
       'Area_Code_Locality_Code_Diameter_mean',
       'Area_Code_Locality_Code_Diameter_std',
       'Region_Code_Locality_Code_Height_min',
       'Region_Code_Locality_Code_Height_max',
       'Region_Code_Locality_Code_Height_mean',
       'Region_Code_Locality_Code_Height_std',
       'Region_Code_Locality_Code_Diameter_min',
       'Region_Code_Locality_Code_Diameter_max',
       'Region_Code_Locality_Code_Diameter_mean',
       'Region_Code_Locality_Code_Diameter_std',
       'Area_Code_Region_Code_Height_min', 'Area_Code_R

In [17]:
train_cols = [
    'Area_Code',
    'Locality_Code',
    'Region_Code',
    'Height',
    'Diameter',
       #'Class',
    'Species',
    #'Area_Code_Locality_Code',
       #'Region_Code_Locality_Code',
    #'Area_Code_Region_Code',
       'Area_Code_Locality_Code_Height_min',
       'Area_Code_Locality_Code_Height_max',
       'Area_Code_Locality_Code_Height_mean',
       'Area_Code_Locality_Code_Height_std',
       'Area_Code_Locality_Code_Diameter_min',
       'Area_Code_Locality_Code_Diameter_max',
       'Area_Code_Locality_Code_Diameter_mean',
       'Area_Code_Locality_Code_Diameter_std',
       'Region_Code_Locality_Code_Height_min',
       'Region_Code_Locality_Code_Height_max',
       'Region_Code_Locality_Code_Height_mean',
       'Region_Code_Locality_Code_Height_std',
       'Region_Code_Locality_Code_Diameter_min',
       'Region_Code_Locality_Code_Diameter_max',
       'Region_Code_Locality_Code_Diameter_mean',
       'Region_Code_Locality_Code_Diameter_std',
       'Area_Code_Region_Code_Height_min',
    'Area_Code_Region_Code_Height_max',
       'Area_Code_Region_Code_Height_mean',
    'Area_Code_Region_Code_Height_std',
       'Area_Code_Region_Code_Diameter_min',
       'Area_Code_Region_Code_Diameter_max',
       'Area_Code_Region_Code_Diameter_mean',
       'Area_Code_Region_Code_Diameter_std',
    #'HeightDiameter',
    #'HeightS2',
    #'HeightS2Diameter'
    
]
target = 'Class'

In [18]:
df_train_treated = df_full_treated[df_full_treated[target]!=-1].copy()
df_test_treated = df_full_treated[df_full_treated[target]==-1].copy()

In [19]:
model = XGBClassifier(random_state=0,n_estimators=40)
#model = XGBClassifier(random_state=0,n_estimators=100)
#model = LGBMClassifier(random_state=0)

In [20]:
model.fit(df_train_treated[train_cols],df_train_treated[target])

LGBMClassifier(random_state=0)

In [21]:
#plot_importance(model)

In [22]:
y_pred_train = model.predict(df_train_treated[train_cols])

In [23]:
accuracy_score(df_train_treated[target],y_pred_train)

0.8420180009474183

In [24]:
y_pred_prob_train = model.predict_proba(df_train_treated[train_cols])

In [25]:
log_loss(df_train_treated[target],y_pred_prob_train)

0.4606840664959794

In [26]:
test_preds = pd.DataFrame(model.predict_proba(df_test_treated[train_cols]))

In [27]:
save_dataset(test_preds,name='BASE_MODEL_XGB')