In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1358 entries, 0 to 1357
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   grade_A_Component_1  1358 non-null   int64  
 1   grade_A_Component_2  1358 non-null   int64  
 2   max_luminosity       1358 non-null   int64  
 3   thickness            1358 non-null   int64  
 4   xmin                 1358 non-null   int64  
 5   xmax                 1358 non-null   int64  
 6   ymin                 1358 non-null   int64  
 7   ymax                 1358 non-null   int64  
 8   pixel_area           1358 non-null   int64  
 9   log_area             1358 non-null   float64
 10  x_component_1        1358 non-null   int64  
 11  x_component_2        1358 non-null   int64  
 12  x_component_3        1358 non-null   int64  
 13  x_component_4        1358 non-null   int64  
 14  x_component_5        1358 non-null   int64  
 15  class                1358 non-null   i

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train2 = train.iloc[:1000,:]
val = train.iloc[1000:,:]

In [6]:
train2.shape

(1000, 16)

In [7]:
val.shape

(358, 16)

In [8]:
# from featexp import get_univariate_plots

# get_univariate_plots(data=train,target_col='class',data_test=val,bins=10)

In [9]:
# from featexp import get_trend_stats

# stats = get_trend_stats(data=train,target_col='class',data_test=val)

In [10]:
# stats

In [11]:
train.columns

Index(['grade_A_Component_1', 'grade_A_Component_2', 'max_luminosity',
       'thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area',
       'x_component_1', 'x_component_2', 'x_component_3', 'x_component_4',
       'x_component_5', 'class'],
      dtype='object')

In [12]:
features_list = ['max_luminosity','thickness', 'xmin', 'xmax','ymin', 'ymax', 'pixel_area', 'log_area','x_component_1', 'x_component_2', 'x_component_3', 'x_component_4','x_component_5']

In [13]:
X = train[features_list]
y = train['class']
test2 = test[features_list]

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=101)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_val = sc.fit_transform(X_val)

In [17]:
from xgboost import XGBClassifier

In [18]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [21]:
space={ 'max_depth': 16.0,
        'gamma': hp.uniform('gamma', 0,9),
        'learning_rate': hp.uniform('learning_rate',0.1,1),
        'reg_alpha': 41.0,
        'reg_lambda': 0.9255906450789302,
        'colsample_bytree': 0.9975162562350268,
        'min_child_weight': 7.0,
        'n_estimators': 120
    }

In [22]:
from sklearn.metrics import log_loss

In [23]:
def hyperparameter_tuning(space):
    clf= XGBClassifier(n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                         reg_alpha = int(space['reg_alpha']),min_child_weight=space['min_child_weight'],
                         colsample_bytree=space['colsample_bytree'])
    evaluation = [( X_train, y_train), ( X_val, y_val)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="rmse",
            early_stopping_rounds=10,verbose=False)

    pred = clf.predict_proba(X_val)
    loss_log = log_loss(y_val, pred)
    print ("SCORE:", loss_log)
    #change the metric if you like
    return {'loss': loss_log, 'status': STATUS_OK }


In [None]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print (best)

SCORE:                                                 
0.534965504618252                                      
SCORE:                                                                          
0.5340263426084729                                                              
SCORE:                                                                           
0.5323036927182008                                                               
SCORE:                                                                           
0.5336995136869305                                                               
SCORE:                                                                           
0.5345089973553139                                                               
SCORE:                                                                           
0.534965504618252                                                                
SCORE:                                                                

In [None]:
# {'colsample_bytree': 0.9975162562350268, 'gamma': 4.365372330371842, 'max_depth': 16.0, 'min_child_weight': 7.0, 'reg_alpha': 41.0, 'reg_lambda': 0.9255906450789302}

In [20]:
xgbc.fit(X_train,y_train)
xgbc_pred = xgbc.predict_proba(X_val)
xgbc_score = log_loss(y_val,xgbc_pred)

In [21]:
print(xgbc_score)

0.3149532938721891


In [22]:
for feat_imp,param in zip(xgbc.feature_importances_,features_list):
    print(feat_imp,param)

0.034643937 max_luminosity
0.04134077 thickness
0.034019727 xmin
0.030967478 xmax
0.0249384 ymin
0.021259068 ymax
0.02315349 pixel_area
0.025714822 log_area
0.16891061 x_component_1
0.16693996 x_component_2
0.2335242 x_component_3
0.10340897 x_component_4
0.09117854 x_component_5


In [23]:
xgbc.fit(X,y)
predis = xgbc.predict_proba(test2)

In [24]:
predics = pd.DataFrame(predis,columns=['1','2'])

filename = 'submit_fxp_3.xlsx'
predics.to_excel(filename,index=False)

####Inference = Better results on same model with dropping feats - grade_comp_a_1/2