In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
import sklearn.utils
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
data = pd.read_csv('data/Data_for_UCI_named.csv')
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
data.drop(columns=['stab'], inplace=True)
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [4]:
data['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [5]:
data.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stabf    0
dtype: int64

In [6]:
# Split data into fatures and labels.
x = data.drop(columns=['stabf'])
y = data['stabf']

In [7]:
x.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [8]:
y.head()

0    unstable
1      stable
2    unstable
3    unstable
4    unstable
Name: stabf, dtype: object

In [9]:
# Split the data into train and test set.
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.20, random_state=1)

In [10]:
x_train.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,6.255995,2.542401,7.024714,9.476518,3.529888,-1.224881,-0.688228,-1.61678,0.568221,0.618403,0.685739,0.660088
5140,5.070581,5.490253,8.075688,0.761075,4.220888,-1.280596,-1.902185,-1.038107,0.443515,0.097244,0.916955,0.129254
2568,1.220072,8.804028,3.874283,8.433949,3.614027,-1.039236,-0.953566,-1.621224,0.908353,0.923594,0.238881,0.660156
3671,7.498402,6.697603,8.798626,2.126236,3.134585,-1.581906,-0.589386,-0.963293,0.260826,0.899003,0.964752,0.600598
7427,7.074006,1.337511,6.100756,7.759156,2.526922,-0.92254,-0.6326,-0.971782,0.98458,0.716082,0.836928,0.165162


In [11]:
x_train.shape

(8000, 12)

In [12]:
y_train.shape

(8000,)

In [13]:
y_test.shape

(2000,)

In [14]:
data.shape

(10000, 13)

In [15]:
x_test.shape

(2000, 12)

In [16]:
x_test.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
9953,6.877876,4.11382,9.356768,8.299753,4.056779,-1.89747,-1.590581,-0.568728,0.276567,0.845536,0.11244,0.822562
3850,5.802841,6.271371,4.73154,3.819867,3.579569,-1.70948,-1.067511,-0.802579,0.077527,0.416478,0.912846,0.861306
4962,2.286998,4.385142,2.830232,5.29388,3.035814,-1.202764,-0.902011,-0.931039,0.924216,0.130186,0.703887,0.063811
3886,5.01992,2.209962,6.26608,0.578901,4.322584,-1.960207,-1.074561,-1.287815,0.54691,0.065992,0.427349,0.814648
5437,7.646145,9.187896,5.484219,9.934313,3.634226,-1.254541,-1.335366,-1.044319,0.561528,0.121611,0.787318,0.300314


In [17]:
y_train.head()

2694    unstable
5140    unstable
2568    unstable
3671    unstable
7427    unstable
Name: stabf, dtype: object

In [18]:
y_test.head()

9953    unstable
3850    unstable
4962      stable
3886      stable
5437    unstable
Name: stabf, dtype: object

In [19]:
# Using Standard Scaler to transform the train and test set.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train, y_train)
x_test_scaled = scaler.transform(x_test)
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns)

In [20]:
x_train_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [21]:
x_test_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.593951,-0.412733,1.503924,1.116943,0.403423,-1.492971,-0.785033,1.566781,-0.901007,1.167203,-1.50733,1.084726
1,0.20219,0.374416,-0.1888,-0.522268,-0.225967,-1.058483,0.420047,1.028627,-1.625721,-0.39566,1.414651,1.226011
2,-1.079044,-0.313745,-0.884634,0.01708,-0.943122,0.112653,0.801335,0.733004,1.457108,-1.438495,0.651821,-1.682168
3,-0.08312,-1.107327,0.372805,-1.708152,0.75399,-1.637972,0.403805,-0.088036,0.083322,-1.672322,-0.357714,1.055865
4,0.873921,1.438466,0.086662,1.715037,-0.15388,-0.007015,-0.197053,0.472315,0.136549,-1.469731,0.956396,-0.819727


In [22]:
y_train.head()

2694    unstable
5140    unstable
2568    unstable
3671    unstable
7427    unstable
Name: stabf, dtype: object

In [23]:
# Training Random Forest Classifier
rfc = RandomForestClassifier(random_state=1)
rfc.fit(x_train_scaled, y_train)

# making predictions
rfc_pred = rfc.predict(x_test_scaled)

In [36]:
# Random Forest Evaluation
# Accuracy
accuracy = accuracy_score(y_test, rfc_pred)
print("Accuracy: %.4f" % accuracy)

# Precision
precision = precision_score(y_test, rfc_pred, pos_label='stable')
print('Precision: %.4f' % precision)

# Recall
recall = recall_score(y_test, rfc_pred, pos_label='stable')
print('Recall: %.4f' % recall)

# F1 Score
f1 = f1_score(y_test, rfc_pred, pos_label='stable')
print('F1 Score: %.4f' % f1)

# Confusion Matrix
cm = confusion_matrix(y_test, rfc_pred)
print('Random Forest Confusion Matrix')
print(cm)

# Classification Report
rfc_report = classification_report(y_test, rfc_pred)
print("Random Forest Classification Report")
print(rfc_report)

# Cross Validation and Accuracy
scores = cross_val_score(rfc, x_train_scaled, y_train, cv=5, scoring='f1_macro')
scores

Accuracy: 0.9290
Precision: 0.9191
Recall: 0.8778
F1 Score: 0.8980
Random Forest Confusion Matrix
[[ 625   87]
 [  55 1233]]
Random Forest Classification Report
              precision    recall  f1-score   support

      stable       0.92      0.88      0.90       712
    unstable       0.93      0.96      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.92      0.92      2000
weighted avg       0.93      0.93      0.93      2000



array([0.91143756, 0.91136454, 0.91564855, 0.90214725, 0.91555674])

In [25]:
# Extra Trees Classifier
etc = ExtraTreesClassifier(random_state=1)
etc.fit(x_train_scaled, y_train)

# making predictions
etc_pred = etc.predict(x_test_scaled)

In [26]:
# Extra Trees Evaluation
# Accuracy
accuracy = accuracy_score(y_test, etc_pred)
print("Accuracy: %.4f" % accuracy)

# Precision
precision = precision_score(y_test, etc_pred, pos_label='stable')
print('Precision: %.4f' % precision)

# Recall
recall = recall_score(y_test, etc_pred, pos_label='stable')
print('Recall: %.4f' % recall)

# F1 Score
f1 = f1_score(y_test, etc_pred, pos_label='stable')
print('F1 Score: %.4f' % f1)

# Confusion Matrix
cm = confusion_matrix(y_test, etc_pred)
print('Extra Tree Confusion Matrix')
print(cm)

# Classification Report
etc_report = classification_report(y_test, etc_pred)
print("Extra Tree Classification Report")
print(etc_report)

Accuracy: 0.9280
Precision: 0.9410
Recall: 0.8511
F1 Score: 0.8938
Extra Tree Confusion Matrix
[[ 606  106]
 [  38 1250]]
Extra Tree Classification Report
              precision    recall  f1-score   support

      stable       0.94      0.85      0.89       712
    unstable       0.92      0.97      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [27]:
# XGB Classifier
xgbc = XGBClassifier(max_depth=3, learning_rate=0.1, random_state=1)
xgbc.fit(x_train_scaled, y_train)

# make predictions
xgbc_pred = xgbc.predict(x_test_scaled)

In [28]:
# XGB Evaluation
# Accuracy
accuracy = accuracy_score(y_test, xgbc_pred)
print("Accuracy: %.4f" % accuracy)

# Precision
precision = precision_score(y_test, xgbc_pred, pos_label='stable')
print('Precision: %.4f' % precision)

# Recall
recall = recall_score(y_test, xgbc_pred, pos_label='stable')
print('Recall: %.4f' % recall)

# F1 Score
f1 = f1_score(y_test, xgbc_pred, pos_label='stable')
print('F1 Score: %.4f' % f1)

# Confusion Matrix
cm = confusion_matrix(y_test, xgbc_pred, labels=['stable', 'unstable'])
print('XGB Confusion Matrix')
print(cm)

# Classification Report
xgbc_report = classification_report(y_test, xgbc_pred)
print("XGB Classification Report")
print(xgbc_report)

Accuracy: 0.9195
Precision: 0.9206
Recall: 0.8469
F1 Score: 0.8822
XGB Confusion Matrix
[[ 603  109]
 [  52 1236]]
XGB Classification Report
              precision    recall  f1-score   support

      stable       0.92      0.85      0.88       712
    unstable       0.92      0.96      0.94      1288

    accuracy                           0.92      2000
   macro avg       0.92      0.90      0.91      2000
weighted avg       0.92      0.92      0.92      2000



In [29]:
# LightGBM Classifier
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(x_train_scaled, y_train)

# make predictions
lgbm_pred = lgbm.predict(x_test_scaled)

In [30]:
# LGBM Evaluation
# Accuracy
accuracy = accuracy_score(y_test, lgbm_pred)
print("Accuracy: %.4f" % accuracy)

# Precision
precision = precision_score(y_test, lgbm_pred, pos_label='stable')
print('Precision: %.4f' % precision)

# Recall
recall = recall_score(y_test, lgbm_pred, pos_label='stable')
print('Recall: %.4f' % recall)

# F1 Score
f1 = f1_score(y_test, lgbm_pred, pos_label='stable')
print('F1 Score: %.4f' % f1)

# Confusion Matrix
cm = confusion_matrix(y_test, lgbm_pred, labels=['stable', 'unstable'])
print('LGBM Confusion Matrix')
print(cm)

# Classification Report
lgbm_report = classification_report(y_test, lgbm_pred)
print("LGBM Classification Report")
print(lgbm_report)

Accuracy: 0.9375
Precision: 0.9297
Recall: 0.8919
F1 Score: 0.9104
LGBM Confusion Matrix
[[ 635   77]
 [  48 1240]]
LGBM Classification Report
              precision    recall  f1-score   support

      stable       0.93      0.89      0.91       712
    unstable       0.94      0.96      0.95      1288

    accuracy                           0.94      2000
   macro avg       0.94      0.93      0.93      2000
weighted avg       0.94      0.94      0.94      2000



In [37]:
# Improving the Extra tree Classifier using RandomizedSearchCV
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# create extra trees model
etc_opt = ExtraTreesClassifier(random_state=1)

random_cv = RandomizedSearchCV(estimator=etc_opt, param_distributions=hyperparameter_grid, cv=5, n_iter=10, 
                               scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)
best_model = random_cv.fit(x_train_scaled, y_train)

# make predictions
pred = best_model.predict(x_test_scaled)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.2min finished


In [38]:
# improved Extra Trees Evaluation

# getting best parameters
from pprint import pprint
pprint(best_model.best_params_)
pprint(best_model.best_estimator_.get_params())

# Accuracy
accuracy = accuracy_score(y_test, pred)
print("Accuracy: %.4f" % accuracy)

# Precision
precision = precision_score(y_test, pred, pos_label='stable')
print('Precision: %.4f' % precision)

# Recall
recall = recall_score(y_test, pred, pos_label='stable')
print('Recall: %.4f' % recall)

# F1 Score
f1 = f1_score(y_test, pred, pos_label='stable')
print('F1 Score: %.4f' % f1)

# Confusion Matrix
cm = confusion_matrix(y_test, pred, labels=['stable', 'unstable'])
print('Improved Extra Trees Confusion Matrix')
print(cm)

# Classification Report
report = classification_report(y_test, pred)
print("Improved Extra Trees Classification Report")
print(report)

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}
{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}
Accuracy: 0.9270
Precision: 0.9211
Recall: 0.8694
F1 Score: 0.8945
Improved Extra Trees Confusion Matrix
[[ 619   93]
 [  53 1235]]
Improved Extra Trees Classification Report
              precision    recall  f1-score   support

      stable       0.92      0.87      0.89       712
    unstable       0.93      0.96      0.94      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.9

In [33]:
print(pred)

['unstable' 'unstable' 'stable' ... 'stable' 'unstable' 'unstable']


In [34]:
import xgboost

print(xgboost.__version__)

1.2.0


In [44]:
best_model.cv_results_

{'mean_fit_time': array([10.78156719,  0.47757134,  3.17141395,  3.22740989, 18.67879639,
         0.98374171,  0.52196832,  1.18272963,  8.57448087,  0.46917171]),
 'std_fit_time': array([0.08551014, 0.00813801, 0.03629411, 0.0898556 , 0.06982804,
        0.02815029, 0.01309861, 0.03875458, 0.06091464, 0.00877248]),
 'mean_score_time': array([1.02573962, 0.04499698, 0.23718572, 0.29438257, 0.67876158,
        0.04439721, 0.04839702, 0.09319444, 0.6502542 , 0.04439731]),
 'std_score_time': array([0.04159772, 0.00167351, 0.01391892, 0.06781487, 0.06048948,
        0.00535106, 0.00484073, 0.00658404, 0.01291078, 0.00119994]),
 'param_n_estimators': masked_array(data=[1000, 50, 300, 300, 1000, 50, 50, 100, 1000, 50],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 7, 9, 7, 2, 5, 5, 3, 9, 7],
              mask=[False, False, F

In [52]:
best_model.scoring

'accuracy'