In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
#!pip install xgboost

In [3]:
#!pip install lightgbm

In [4]:
import xgboost as xgb
import lightgbm as ltb

In [5]:
data = pd.read_csv('Data_for_UCI_named.csv')

In [6]:
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [7]:
data.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [8]:
data.dtypes

tau1     float64
tau2     float64
tau3     float64
tau4     float64
p1       float64
p2       float64
p3       float64
p4       float64
g1       float64
g2       float64
g3       float64
g4       float64
stab     float64
stabf     object
dtype: object

In [9]:
data.shape

(10000, 14)

#### Dropping the column stabf as mentioned in the description

In [10]:
X = data.drop(columns=['stab','stabf'])
y = data['stabf']

#### Split the data into an 80-20 train-test split with a random state of “1”

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [12]:
y.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
x_train, y_train = smote.fit_resample(x_train, y_train)

In [13]:
x_train.shape

(8000, 12)

In [14]:
x_test.shape

(2000, 12)

In [15]:
data.shape

(10000, 14)

In [16]:
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [17]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [18]:
scaler = StandardScaler()

In [19]:
x_train_scaled = scaler.fit_transform(x_train)
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns) 
x_test_scaled = scaler.transform(x_test)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns) 

In [20]:
x_train_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


### What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [21]:
rand_reg = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=1)
rand_reg.fit(x_train_scaled,y_train)
prediction = rand_reg.predict(x_test_scaled)

In [22]:
rand_reg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [23]:
round(accuracy_score(y_test, prediction),4)

0.929

### What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.

In [24]:
xgb_ = xgb.XGBClassifier(objective='binary:logistic',random_state=1)
xgb_.fit(x_train_scaled, y_train)
y_pred = xgb_.predict(x_test_scaled)
accuracy_score(y_test, y_pred)

0.9455

In [25]:
xgb_.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 6,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 1,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

### What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [26]:
lgbm = ltb.LGBMClassifier(random_state=1, n_jobs=1)
lgbm.fit(x_train_scaled, y_train)
lgbm_pred = lgbm.predict(x_test_scaled)
round(accuracy_score(y_test, lgbm_pred),4)


0.9395

### Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [27]:
from sklearn.model_selection import RandomizedSearchCV

In [28]:
extra_tree = ExtraTreesClassifier()

In [29]:
n_estimators = [100,300,500,1000]
max_features = ['auto','None', 'log2']
min_samples_split = [2, 5, 7] 
min_samples_leaf = [4,6,8] 
random_grid = {'n_estimators': n_estimators,'min_samples_split': min_samples_split,'min_samples_leaf': min_samples_leaf}

In [30]:
rf_random = RandomizedSearchCV(estimator = extra_tree,param_distributions=random_grid, n_iter = 10, cv = 5, verbose=1, random_state=1, n_jobs = -1,scoring = 'accuracy')

In [31]:
rf_random.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [32]:
rf_pred = rf_random.predict(x_test_scaled)

In [33]:
round(accuracy_score(y_test, rf_pred),4)

0.9235

In [34]:
print(f' The best parameters are {rf_random.best_params_}')

 The best parameters are {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 4}


### Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [35]:
ett = ExtraTreesClassifier(random_state=1)
ett.fit(x_train_scaled,y_train)
ett_pred = ett.predict(x_test_scaled)
ett_accuracy = accuracy_score(y_test, ett_pred)

In [36]:
n_estimators = [1000]
max_features = ['None'] 
min_samples_split = [2] 
min_samples_leaf = [4]
selected = {'n_estimators': n_estimators,'min_samples_split': min_samples_split,'min_samples_leaf': min_samples_leaf}

In [37]:
rf_ = RandomizedSearchCV(estimator = ett,param_distributions=selected, n_iter = 10, cv = 5, verbose=1, random_state=1, n_jobs = -1,scoring = 'accuracy')
rf_.fit(x_train_scaled,y_train)
rf_pred = rf_.predict(x_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)



Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [38]:
print(f'Accuracy for ET is {ett_accuracy}')
print(f'Accuracy for Random Search  is {rf_accuracy}')

Accuracy for ET is 0.928
Accuracy for Random Search  is 0.9255


In [39]:
assert ett_accuracy != rf_accuracy

### Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [40]:
ett.feature_importances_

array([0.11739736, 0.11844468, 0.11316851, 0.11546569, 0.03950675,
       0.04037132, 0.04070628, 0.04057864, 0.08978291, 0.09367636,
       0.09688268, 0.09401882])

In [41]:
feat_labels = x_train_scaled.columns
feat_labels

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4'],
      dtype='object')

In [42]:
for f in range(x_train_scaled.shape[1]):
    print(feat_labels[f], round(ett.feature_importances_[f],4))


tau1 0.1174
tau2 0.1184
tau3 0.1132
tau4 0.1155
p1 0.0395
p2 0.0404
p3 0.0407
p4 0.0406
g1 0.0898
g2 0.0937
g3 0.0969
g4 0.094


The most important feature is tau2

The least important feature is p1