In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
%matplotlib inline

In [2]:
# load csv file
df = pd.read_csv('Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [4]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [5]:
df.shape

(10000, 14)

In [6]:
df.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stab', 'stabf'],
      dtype='object')

In [7]:
df.stabf.unique()

array(['unstable', 'stable'], dtype=object)

In [8]:
# Drop the 'stab' column as per the instructions
df.drop(columns=['stab'], inplace=True)

# Split the data into features (X) and the target variable (y)
X = df.drop(columns=['stabf'])
y = df['stabf']

# Label encode the target variable 'stabf'
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into 80-20 train-test split with random state = 1
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Use StandardScaler to transform the train and test sets
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [9]:
# Train a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(x_train_scaled, y_train)
rf_predictions = rf_classifier.predict(x_test_scaled)

# What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

# Calculate accuracy on the test set
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Print the accuracy rounded to 4 decimal places
print(f"Accuracy on the test set using Random Forest Classifier: {rf_accuracy:.4f}")


Accuracy on the test set using Random Forest Classifier: 0.9290


In [10]:
# Train an Extra Trees Classifier
et_classifier = ExtraTreesClassifier(random_state=1)
et_classifier.fit(x_train_scaled, y_train)
et_predictions = et_classifier.predict(x_test_scaled)

# Calculate accuracy on the test set
xgb_accuracy = accuracy_score(y_test, et_predictions)

# Print the accuracy rounded to 4 decimal places
print(f"Accuracy on the test set using Extra Trees Classifier: {xgb_accuracy:.4f}")

Accuracy on the test set using Extra Trees Classifier: 0.9280


In [11]:
# Train an XGBoost Classifier
xgb_classifier = xgb.XGBClassifier(random_state=1)
xgb_classifier.fit(x_train_scaled, y_train)
xgb_predictions = xgb_classifier.predict(x_test_scaled)

# What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.

# Calculate accuracy on the test set
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

# Print the accuracy rounded to 4 decimal places
print(f"Accuracy on the test set using XGBoost Classifier: {xgb_accuracy:.4f}")

Accuracy on the test set using XGBoost Classifier: 0.9455


In [12]:
# Train a LightGBM Classifier
lgb_classifier = lgb.LGBMClassifier(random_state=1)
lgb_classifier.fit(x_train_scaled, y_train)
lgb_predictions = lgb_classifier.predict(x_test_scaled)

# Calculate accuracy on the test set
xgb_accuracy = accuracy_score(y_test, lgb_predictions)

# Print the accuracy rounded to 4 decimal places
print(f"Accuracy on the test set using LightGBM Classifier: {xgb_accuracy:.4f}")

Accuracy on the test set using LightGBM Classifier: 0.9395


In [13]:
# Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

from sklearn.model_selection import RandomizedSearchCV
# Define the parameter grid for Randomized Search CV
param_grid = {
    'n_estimators': [50, 100, 300, 500, 1000],
    'min_samples_split': [2, 3, 5, 7, 9],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Perform Randomized Search CV
randomized_search = RandomizedSearchCV(estimator=et_classifier,
                                       param_distributions=param_grid,
                                       cv=5,
                                       n_iter=10,
                                       scoring='accuracy',
                                       n_jobs=-1,
                                       verbose=1,
                                       random_state=1)

# Fit the Randomized Search CV to the data
randomized_search.fit(x_train_scaled, y_train)

# Print the best hyperparameters
print("Best Hyperparameters from Randomized Search CV:")
print(randomized_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters from Randomized Search CV:
{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


In [14]:
from sklearn.ensemble import ExtraTreesClassifier

# Define the optimal hyperparameters obtained from Randomized Search CV
optimal_params = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'auto',
    'bootstrap': False
}

# Create a new ExtraTreesClassifier with the optimal hyperparameters
et_optimal_classifier = ExtraTreesClassifier(random_state=1, **optimal_params)

# Train the new ExtraTreesClassifier model
et_optimal_classifier.fit(x_train_scaled, y_train)

# Make predictions using the new model
et_optimal_predictions = et_optimal_classifier.predict(x_test_scaled)

# Calculate accuracy of the new model
et_optimal_accuracy = accuracy_score(y_test, et_optimal_predictions)

# Print the accuracy of the new model
print(f"Accuracy of the new ExtraTreesClassifier model: {et_optimal_accuracy:.4f}")


  warn(


Accuracy of the new ExtraTreesClassifier model: 0.9280


In [15]:
# Get the feature importances from the optimal ExtraTreesClassifier model
feature_importances = et_optimal_classifier.feature_importances_

# Create a dataframe to display the feature importances
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the dataframe by importance in descending order
importance_df = importance_df.sort_values('Importance', ascending=False)

# Print the most important and least important features
print("Most Important Features:")
print(importance_df.head(1))

print("\nLeast Important Features:")
print(importance_df.tail(1))


Most Important Features:
  Feature  Importance
1    tau2    0.118445

Least Important Features:
  Feature  Importance
4      p1    0.039507
