In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import  GridSearchCV, train_test_split
from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from ngboost import NGBClassifier

%matplotlib inline

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [3]:
train.columns

Index(['grade_A_Component_1', 'grade_A_Component_2', 'max_luminosity',
       'thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area',
       'x_component_1', 'x_component_2', 'x_component_3', 'x_component_4',
       'x_component_5', 'class'],
      dtype='object')

In [4]:
feature_list = ['max_luminosity','thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area','x_component_1', 'x_component_2', 'x_component_3', 'x_component_4','x_component_5']

In [5]:
X = train[feature_list]
y = train['class']
test = test[feature_list]

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=101)

In [6]:
print(X.shape)
print(test.shape)

(1358, 13)
(583, 13)


In [7]:
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_val = sc.fit_transform(X_val)
test = sc.fit_transform(test)

In [8]:
def model_pred(model):
    model.fit(X_train,y_train)
    model_pred = model.predict_proba(X_val)
    model_score = log_loss(y_val,model_pred)
    return model_score, model_pred

In [9]:
def predis(model):
    model.fit(X,y)
    model_preds = model.predict_proba(test)
    return model_preds

## Decision Trees

In [10]:
dtc = DecisionTreeClassifier(min_samples_split=150,max_depth=20)

In [11]:
dtc_score,dtc_val_pred = model_pred(dtc)

In [12]:
dtc_score

0.32648907591852694

In [13]:
dtc_predis = predis(dtc)

## Random Forest

In [14]:
rfc = RandomForestClassifier(bootstrap=False,n_estimators=200,max_depth=20,min_samples_split=10,min_samples_leaf=2)

In [15]:
rfc_score,rfc_val_pred = model_pred(rfc)
rfc_score

0.31589918636855013

In [16]:
rfc_predis = predis(rfc)

## Extra Trees 

In [17]:
etc = ExtraTreesClassifier(bootstrap=False,n_estimators=200,max_depth=20,min_samples_split=10,min_samples_leaf=2)

In [18]:
etc_score,etc_val_pred = model_pred(etc)
etc_score

0.3326863447569123

In [19]:
etc_predis = predis(etc)

## XGBoost

In [20]:
xgbc = XGBClassifier(learning_rate=0.1,n_estimators=100)

In [21]:
xgbc_score,xgbc_val_pred = model_pred(etc)
xgbc_score

0.3269813491768673

In [22]:
xgbc_predis = predis(xgbc)

ValueError: feature_names mismatch: ['max_luminosity', 'thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area', 'x_component_1', 'x_component_2', 'x_component_3', 'x_component_4', 'x_component_5'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12']
expected thickness, ymax, log_area, x_component_1, x_component_4, max_luminosity, x_component_3, pixel_area, xmax, x_component_5, ymin, xmin, x_component_2 in input data
training data did not have the following fields: f1, f3, f4, f10, f9, f11, f2, f5, f0, f6, f12, f8, f7

## LightGBM

In [None]:
lgbmc = LGBMClassifier(learning_rate=0.1,n_estimators=100)

In [None]:
lgbmc_score,lgbmc_val_pred = model_pred(lgbmc)
lgbmc_score

In [None]:
lgbmc_predis = predis(lgbmc)

## Boosting predictions

In [None]:
boosted_predis = 0.1*dtc_predis+0.15*rfc_predis+0.15*etc_predis+0.3*xgbc_predis+0.3*lgbmc_predis

In [None]:
boost_predis = pd.DataFrame(boosted_predis,columns=['1','2'])

filename = 'submit_boosted_2.xlsx'
boost_predis.to_excel(filename,index=False)