In [1]:
#IMPORT LIBRARIES
import pandas as pd
import numpy as np


In [2]:
#load dataset

df = pd.read_csv('Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
#check if there is any missing data
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

there is no missing data

In [4]:
# Because of the direct relationship between 'stab' and 'stabf','stab' column will be dropped
# 'stabf' will remain as the sole dependent variable for binary classification.

df = df.drop(columns='stab', axis=1)
df

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,stable
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,unstable


In [5]:
column_names = df.columns
column_names

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')

In [6]:
#check the distribution of the target variable
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [7]:
#check the types of all features 
df.dtypes

tau1     float64
tau2     float64
tau3     float64
tau4     float64
p1       float64
p2       float64
p3       float64
p4       float64
g1       float64
g2       float64
g3       float64
g4       float64
stabf     object
dtype: object

One of the feature is an object, so we need to encode it 

In [8]:
# encode the object column stabf 
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df.stabf = encoder.fit_transform(df.stabf)
df.stabf

0       1
1       0
2       1
3       1
4       1
       ..
9995    1
9996    0
9997    0
9998    1
9999    1
Name: stabf, Length: 10000, dtype: int32

In [9]:
#split the data into training and testing set
x = df.drop(columns='stabf', axis=1)
y = df['stabf']
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)
y_train.value_counts()

1    5092
0    2908
Name: stabf, dtype: int64

In [10]:
# Using the standard scaler to transform the model
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_xtrain = pd.DataFrame(scaler.fit_transform(x_train,y_train),columns = x_train.columns)
normalized_xtest = pd.DataFrame(scaler.fit_transform(x_test),columns = x_test.columns)

In [11]:
normalized_xtrain.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [12]:
normalized_xtest.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.592163,-0.420565,1.472472,1.093036,0.426786,-1.504594,-0.792677,1.600201,-0.925703,1.175287,-1.492644,1.086291
1,0.199183,0.364543,-0.190076,-0.518473,-0.229402,-1.071766,0.427103,1.052337,-1.65591,-0.395949,1.412703,1.227535
2,-1.086035,-0.321834,-0.873505,0.011761,-0.977094,0.094896,0.813041,0.751381,1.450284,-1.44437,0.654216,-1.679799
3,-0.087014,-1.113357,0.361518,-1.684316,0.79228,-1.649041,0.410662,-0.084473,0.066085,-1.67945,-0.349573,1.057439
4,0.873004,1.425833,0.080476,1.681022,-0.154247,-0.024315,-0.197525,0.485988,0.119716,-1.475773,0.957057,-0.817608


In [13]:
#import the libraries 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score,f1_score,recall_score,classification_report,accuracy_score


In [14]:
#using random forest classifier to train the model
forest = RandomForestClassifier(random_state=1)
forest.fit(normalized_xtrain,y_train)
pred1 =forest.predict(normalized_xtest)
print('Accuracy:{}'.format(round(accuracy_score(y_test,pred1),4)))

Accuracy:0.928


In [15]:
#using extra tree classifier to train the model

tree = ExtraTreesClassifier(random_state=3)
tree.fit(normalized_xtrain,y_train)
pred2 =forest.predict(normalized_xtest)
print('Accuracy:{}'.format(round(accuracy_score(y_test,pred2),4)))


Accuracy:0.928


In [16]:
#using extra gradient boost classifier to train the model
xgb = XGBClassifier(random_state=1,learning_rate = 0.1, max_depth = 3,eval_metric='error', use_label_encoder=False)
xgb.fit(normalized_xtrain,y_train)
pred3 =xgb.predict(normalized_xtest)
print('Accuracy:{}'.format(round(accuracy_score(y_test,pred3),4)))

Accuracy:0.919


In [17]:
#using LGBM classifier to train the model
lgbm = LGBMClassifier(random_state=2)
lgbm.fit(normalized_xtrain,y_train)
pred4 =lgbm.predict(normalized_xtest)
print('Accuracy:{}'.format(round(accuracy_score(y_test,pred4),4)))

Accuracy:0.9365


In [18]:
#Training with new Hyperparameters
n_estimators = [50,100,300,500,1000]
min_samples_split = [2,3,5,7,9]
min_samples_leaf = [1,2,4,6,8]
max_features = ['auto','sqrt','log2',None]
hyperparameter_grid = {'n_estimators': n_estimators,
                    'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                    'max_features': max_features}


In [19]:
#Using the RandomizedSearchCV to get the best hyperparameters
from sklearn.model_selection import RandomizedSearchCV
rsv = RandomizedSearchCV(tree, hyperparameter_grid,cv=5,n_iter=10,scoring='accuracy',n_jobs=-1,verbose=1,random_state=1)
search = rsv.fit(normalized_xtrain,y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [20]:
#getting the accuracy of the trained model with the new ExtraTreesClassifier 
tree = ExtraTreesClassifier(**search.best_params_, random_state=1)
tree.fit(normalized_xtrain,y_train)
pred5 = tree.predict(normalized_xtest)
print('Accuracy: {}'.format(accuracy_score(y_test,pred5)))

Accuracy: 0.9285


The the accuracy of the new optimal model is lower than the initial ExtraTreesClassifier model with no hyperparameter tuning