In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score,recall_score,accuracy_score, precision_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

from sklearn import preprocessing

In [2]:
#Import data
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/HAMOYE INTERNSHIP/Regression/Project/Data_for_UCI_named.csv")
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


## Data Exploration

In [3]:
#Check data
data.shape

(10000, 14)

In [4]:
data.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [5]:
data.duplicated().sum()

0

In [6]:
data.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [7]:
#Encode classes
encoding = preprocessing.LabelEncoder()

enc_class = encoding.fit_transform(data["stabf"])

data['classes'] = enc_class

data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,classes
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable,1
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable,0
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable,1
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable,1
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable,1


In [8]:
#check class imbalance
data["classes"].value_counts()

1    6380
0    3620
Name: classes, dtype: int64

In [9]:
# Separate features and target class
features_df = data.drop(["stab","stabf", "classes"], axis = 1)
predictor_df = data['classes']

features_df.shape

(10000, 12)

In [10]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN

In [11]:
#Under Sampling

undersampler = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersampler.fit_resample(features_df, predictor_df)

y_under.value_counts()

0    3620
1    3620
Name: classes, dtype: int64

In [12]:
X_train_, X_test_, y_train, y_test = train_test_split(X_under, y_under, test_size = 0.20, random_state=1)

In [13]:
#Standardize data
std_enc = StandardScaler()

std_enc.fit(X_train_)#fit

X_train = std_enc.transform(X_train_) #transform train data
X_test = std_enc.transform(X_test_) #transform test data

## QUESTIONS

In [14]:
#WITH RANDOM FOREST

randomF = RandomForestClassifier(random_state = 1)  #fit model
randomF.fit(X_train, y_train)  #train model

y_pred_rd = randomF.predict(X_test) #use on test data


print("The accuracy of the Decision Tree is","{:.4f}".format(accuracy_score(y_test, y_pred_rd)))

The accuracy of the Decision Tree is 0.9130


In [15]:
## WITH XGBOOST

x_boost = xgb.XGBClassifier(random_state = 1)
x_boost.fit(X_train, y_train)

y_pred_x_boost = x_boost.predict(X_test)
accuracy_score(y_test,y_pred_x_boost)

print("The accuracy of the XGBoost is","{:.4f}".format(accuracy_score(y_test, y_pred_x_boost)))

The accuracy of the XGBoost is 0.9358


In [16]:
## WITH LGBM
import lightgbm as lgb
clf = lgb.LGBMClassifier(random_state = 1)
clf.fit(X_train, y_train)

y_pred_lgb = clf.predict(X_test)
print("The accuracy of the LGBM is","{:.4f}".format(accuracy_score(y_test, y_pred_lgb)))

The accuracy of the LGBM is 0.9282


In [17]:
# WITH EXTRA TREES


from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()

parameters = dict(n_estimators = [100,1000,500,300,],
                    min_samples_split = [7,2,5],
                    min_samples_leaf = [4,8,6],
                    max_features = ["auto","log2", None])

clf = RandomizedSearchCV(model, parameters, cv = 5, n_iter=10, n_jobs = -1, verbose = 1, random_state = 1)

search = clf.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 6,
 'max_features': 'log2'}

In [18]:
# BEFORE TUNING
model_ext = ExtraTreesClassifier(random_state = 1) #init model

model_ext.fit(X_train, y_train)  #train model

y_pred_ext = model_ext.predict(X_test)

print("The accuracy of the ExtraTrees classifier before tuning is","{:.4f}".format(accuracy_score(y_test, y_pred_ext)))

The accuracy of the ExtraTrees classifier before tuning is 0.9192


In [19]:
# AFTER TUNING
model_extr = ExtraTreesClassifier(n_estimators = 500, min_samples_split = 2,
                          min_samples_leaf = 6, max_features = 'log2', random_state = 1) #init model

model_extr.fit(X_train, y_train)  #train model

y_pred_extr = model_extr.predict(X_test)

print("The accuracy of the ExtraTrees classifier after tuning is","{:.4f}".format(accuracy_score(y_test, y_pred_extr)))

The accuracy of the ExtraTrees classifier after tuning is 0.9199


In [20]:
# FEATURE SELECTION
from sklearn.feature_selection import SequentialFeatureSelector

model = ExtraTreesClassifier(n_estimators = 100, min_samples_split = 2,
                          min_samples_leaf = 6, max_features = None) #init model

sfs = SequentialFeatureSelector(model, n_features_to_select=2) #init feature selection

sfs.fit(X_train, y_train)  #fit model

# Get the selected feature indices
selected_indices = sfs.get_support(indices=True)
not_indices = sfs.get_support(indices=False)


# Get the names of the selected features
selected_features = [feature_name for feature_name in features_df.columns[selected_indices]]


# Print the selected feature names
print("Selected features:", selected_features)


Selected features: ['tau1', 'tau4']


In [21]:
model = ExtraTreesClassifier(n_estimators = 100, min_samples_split = 2,
                          min_samples_leaf = 6, max_features = None) #init model

model.fit(X_train, y_train) #fit model

important_feat = model.feature_importances_ #important features

importance_pairs = [(feature, importance) for feature, importance in zip(features_df.columns, important_feat)]

importance_pairs.sort(key=lambda x: x[1], reverse=True) #sorted

print("Most Important F :eature : {} ; Least Important Feature : {}".format(importance_pairs[0][0], importance_pairs[-1][0]))

Most Important F :eature : tau4 ; Least Important Feature : p1
