In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error,confusion_matrix,classification_report
from sklearn.preprocessing import MinMaxScaler
import os
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
!pip install xgboost
from xgboost import XGBClassifier




In [2]:
# Data initialization: mount Google Drive (Colab only) and read the pickled splits.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
data = pd.read_pickle('/content/drive/My Drive/ass2.pickle')

# `data` is indexed by split name; keep one handle per split.
train, dev, test = data['train'], data['dev'], data['test']


# Preliminary data analysis.
n_samples = train.shape[0]
n_features = train.shape[1] - 1  # one column is subtracted from the count before reporting features
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}\n")
for split_name, split_df in data.items():
    print(split_name, "set:")
    for title, summary in (
        ("HEAD:", split_df.head()),                # first 5 rows
        ("STATISTICS:", split_df.describe()),      # count, mean, std, min, quartiles, max
        ("IS NULL:", split_df.isnull().sum()),     # number of missing values per column
    ):
        print(title)
        print(summary)
    print("-------------------\n")


Mounted at /content/drive
Number of samples: 40533
Number of features: 42

train set:
HEAD:
       f0  f1  f2  f3  f4  f5  f6  f7  f8  f9  ...  f33  f34  f35  f36  f37  \
51905   1   0   0   0   0   0   2   1   2   2  ...    0    0    0    2    0   
52612   0   0   0   0   0   0   2   1   0   0  ...    0    0    0    2    0   
61699   2   1   2   1   1   0   2   2   0   0  ...    0    0    0    1    0   
6291    0   0   0   0   0   0   0   0   0   0  ...    0    0    0    2    0   
17484   0   0   0   0   0   0   1   1   2   0  ...    0    0    0    2    1   

       f38  f39  f40  f41  target  
51905    0    0    0    0       2  
52612    0    0    0    0       2  
61699    0    0    0    0       2  
6291     0    0    0    0       2  
17484    2    0    0    0       2  

[5 rows x 43 columns]
STATISTICS:
                 f0            f1            f2            f3            f4  \
count  40533.000000  40533.000000  40533.000000  40533.000000  40533.000000   
mean       0.959539     

In [3]:
# Separate every split into a feature matrix (all columns except the last)
# and a label vector (the last column).
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_dev, y_dev = dev.iloc[:, :-1], dev.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

# **Decision Tree Classifier**

---



### **hyperparameter search**

**'criterion'** - which function to use to measure the impurity of a split.  
**'splitter'** - the strategy used to choose the split at each node: `best` picks the best split, `random` picks the best among randomly drawn splits, which adds randomness to the process.  
**'max_depth'** - maximum depth of the tree.  
**'min_samples_split'** - minimum number of samples required to split an internal node.  
**'min_samples_leaf'** - minimum number of samples required to be at a leaf node.  


In [4]:
# Baseline: decision tree with default hyperparameters, scored on the dev split.
# random_state is fixed so that repeated runs build the same tree and report the
# same numbers (the grid below includes the randomized 'random' splitter).
model_default = DecisionTreeClassifier(random_state=42)
model_default.fit(X_train, y_train)
y_pred = model_default.predict(X_dev)
accuracy = accuracy_score(y_dev, y_pred)
print(f'Dev Accuracy before the hyperparameter search : {accuracy}')

# Hyperparameter search: every combination in `params` is cross-validated on the
# training split only; the dev split is kept for the comparison below.
model = DecisionTreeClassifier(random_state=42)
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
}
grid_search = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"The best Parameters:{grid_search.best_params_}\n")
grid_model = grid_search.best_estimator_
y_pred = grid_model.predict(X_dev)

# Evaluate the tuned model on the dev split.
# 'weighted' averaging weights each class's score by its support.
accuracy = accuracy_score(y_dev, y_pred)
precision = precision_score(y_dev, y_pred, average='weighted')
recall = recall_score(y_dev, y_pred, average='weighted')
f1 = f1_score(y_dev, y_pred, average='weighted')
# NOTE(review): MSE treats the class labels as numbers, so it is only meaningful
# if the classes are ordered -- confirm this is intended.
mse = mean_squared_error(y_dev, y_pred)
conf_matrix = confusion_matrix(y_dev, y_pred)

print("Metrics evaluate after hyperparameter search :")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Mean Squared Error: {mse}")
print(f"Confusion Matrix:\n{conf_matrix}")

Dev Accuracy before the hyperparameter search : 0.7285375962107756
The best Parameters:{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 5, 'splitter': 'random'}

Metrics evaluate after hyperparameter search :
Accuracy: 0.7264653641207816
Precision: 0.7116094627733407
Recall: 0.7264653641207816
F1-score: 0.7183230819840649
Mean Squared Error: 0.5102131438721137
Confusion Matrix:
[[ 216  396  658]
 [ 281 2084 1008]
 [ 408  945 7516]]


# **Random Forest Classifier**

---



### **hyperparameter search**

**'n_estimators'** - number of trees to build.  
**'criterion'** - which function to use to measure the impurity of a split.   
**'max_depth'** - maximum depth of the tree.  
**'min_samples_split'** - minimum number of samples required to split an internal node.  



In [5]:
# Baseline: random forest with default hyperparameters, scored on the dev split.
# random_state is fixed because the forest's bootstrap samples and per-split
# feature subsets are random; without a seed every run reports different numbers.
model_default = RandomForestClassifier(random_state=42)
model_default.fit(X_train, y_train)
y_pred = model_default.predict(X_dev)
accuracy = accuracy_score(y_dev, y_pred)
print(f'Dev Accuracy before the hyperparameter search : {accuracy}')

# Hyperparameter search: every combination in `params` is cross-validated on the
# training split only; the dev split is kept for the comparison below.
model = RandomForestClassifier(random_state=42)
params = {
    'n_estimators': [100, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
}

grid_search = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"The best Parameters:{grid_search.best_params_}\n")
grid_model = grid_search.best_estimator_
y_pred = grid_model.predict(X_dev)

# Evaluate the tuned model on the dev split.
# 'weighted' averaging weights each class's score by its support.
accuracy = accuracy_score(y_dev, y_pred)
precision = precision_score(y_dev, y_pred, average='weighted')
recall = recall_score(y_dev, y_pred, average='weighted')
f1 = f1_score(y_dev, y_pred, average='weighted')
# NOTE(review): MSE treats the class labels as numbers, so it is only meaningful
# if the classes are ordered -- confirm this is intended.
mse = mean_squared_error(y_dev, y_pred)
conf_matrix = confusion_matrix(y_dev, y_pred)

print("Metrics evaluate:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Mean Squared Error: {mse}")
print(f"Confusion Matrix:\n{conf_matrix}")

Dev Accuracy before the hyperparameter search : 0.8115748963883955
The best Parameters:{'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 150}

Metrics evaluate:
Accuracy: 0.8128330373001776
Precision: 0.7887127517970387
Recall: 0.8128330373001776
F1-score: 0.7857517203311998
Mean Squared Error: 0.3761101243339254
Confusion Matrix:
[[ 158  325  787]
 [  90 2307  976]
 [  64  287 8518]]


# **K-Nearest Neighbors Classifier**

---

### **hyperparameter search**

**'n_neighbors'** - Number of neighbors.  
**'weights'** - The weight function used in prediction.  
**'algorithm'** - Algorithm used to compute the nearest neighbors.  






In [6]:
# Normalize the data: k-NN compares samples by distance, so all features should
# share the same scale to carry the same importance. The scaler is fitted on the
# training split only and then applied unchanged to the dev split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_dev_scaled = scaler.transform(X_dev)

# Baseline: default k-NN, scored on the dev split.
model_default = KNeighborsClassifier()
model_default.fit(X_train_scaled, y_train)
y_pred = model_default.predict(X_dev_scaled)
accuracy = accuracy_score(y_dev, y_pred)
print(f'Dev Accuracy before the hyperparameter search : {accuracy}')

# Hyperparameter search over neighbourhood size, vote weighting and index structure.
model = KNeighborsClassifier()
params = dict(
    n_neighbors=[5, 10],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree'],
)
grid_search = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print(f"The best Parameters:{grid_search.best_params_}\n")
grid_model = grid_search.best_estimator_
y_pred = grid_model.predict(X_dev_scaled)

# Evaluate the tuned model on the (scaled) dev split.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_dev, y_pred)
precision = precision_score(y_dev, y_pred, **weighted)
recall = recall_score(y_dev, y_pred, **weighted)
f1 = f1_score(y_dev, y_pred, **weighted)
mse = mean_squared_error(y_dev, y_pred)
conf_matrix = confusion_matrix(y_dev, y_pred)

print("Metrics evaluate:")
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("Mean Squared Error", mse),
):
    print(f"{label}: {value}")
print(f"Confusion Matrix:\n{conf_matrix}")


Dev Accuracy before the hyperparameter search : 0.7336441681468324
The best Parameters:{'algorithm': 'auto', 'n_neighbors': 10, 'weights': 'distance'}

Metrics evaluate:
Accuracy: 0.7695381882770871
Precision: 0.7417385734764471
Recall: 0.7695381882770871
F1-score: 0.7384686857435047
Mean Squared Error: 0.436056838365897
Confusion Matrix:
[[ 143  295  832]
 [  90 1853 1430]
 [  94  373 8402]]


# **Linear Regression**


---



In [7]:
print("Linear Regression")

# Least-squares fit on the unscaled features, with the label column used as a
# numeric regression target; quality is reported as MSE on the dev split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred_dev = lin_reg.predict(X_dev)
mse_dev = mean_squared_error(y_true=y_dev, y_pred=y_pred_dev)
print(f'\tMean Squared Error on dev set: {mse_dev}')


Linear Regression
	Mean Squared Error on dev set: 0.41757970734274974


# **Logistic Regression**

---

### **hyperparameter search**

**'C'** - the inverse of the regularization strength. When C is smaller, the regularization is stronger ($C = 1/\lambda$).\
**'solver'** - specifies which algorithm to use in the optimization problem
* liblinear - handles L1 reg. Used mainly for small datasets.
* saga - supports both L1 and L2 reg. Used mainly for large datasets.


In [8]:
print("Logistic Regression")

# Baseline: logistic regression with the default C and solver, scored on the dev split.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_dev = log_reg.predict(X_dev)
accuracy_dev = accuracy_score(y_dev, y_pred_dev)
print(f'\tDev Accuracy: {accuracy_dev}')


# Hyperparameter search: 5-fold cross-validated accuracy on the training split
# for every (C, solver) combination.
param_grid_log_reg = {
    'C': [1, 10, 100],
    'solver': ['liblinear', 'saga']
}
grid_search_log_reg = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_log_reg, cv=5, scoring='accuracy')
grid_search_log_reg.fit(X_train, y_train)

print(f'\tOptimal Parameters: {grid_search_log_reg.best_params_}')
best_log_reg = grid_search_log_reg.best_estimator_

y_pred_dev_best_log_reg = best_log_reg.predict(X_dev)

# Evaluate the tuned model on the dev split.
accuracy_dev_best_log_reg = accuracy_score(y_dev, y_pred_dev_best_log_reg)
print(f'\tDev Accuracy (using optimal params): {accuracy_dev_best_log_reg}')

precision_dev_best_log_reg = precision_score(y_dev, y_pred_dev_best_log_reg, average='weighted')
print(f'\tDev precision (using optimal params): {precision_dev_best_log_reg}')

# Each metric is printed under its own name (recall and F1 used to be labelled "precision").
recall_score_dev_best_log_reg = recall_score(y_dev, y_pred_dev_best_log_reg, average='weighted')
print(f'\tDev recall (using optimal params): {recall_score_dev_best_log_reg}')

f1_score_dev_best_log_reg = f1_score(y_dev, y_pred_dev_best_log_reg, average='weighted')
print(f'\tDev f1-score (using optimal params): {f1_score_dev_best_log_reg}')

# The result is stored under its own name: assigning it to `mean_squared_error`
# would replace the imported function with a float and break every later
# (or re-run) call to mean_squared_error(...).
mse_dev_best_log_reg = mean_squared_error(y_dev, y_pred_dev_best_log_reg)
print(f'\tDev mean_squared_error (using optimal params): {mse_dev_best_log_reg}')

conf_matrix = confusion_matrix(y_dev, y_pred_dev_best_log_reg)
print(f'\tDev conf_matrix (using optimal params):\n {conf_matrix}')




Logistic Regression
	Dev Accuracy: 0.6596358792184724
	Optimal Parameters: {'C': 1, 'solver': 'liblinear'}
	Dev Accuracy (using optimal params): 0.659117821195974
	Dev precision (using optimal params): 0.5655829743778172
	Dev precision (using optimal params): 0.659117821195974
	Dev precision (using optimal params): 0.5356495486534003
	Dev mean_squared_error (using optimal params): 0.6170811130846655
	Dev conf_matrix (using optimal params):
 [[   0   26 1244]
 [   0  106 3267]
 [   0   69 8800]]


  _warn_prf(average, modifier, msg_start, len(result))


# **SVM**

---

**hyperparameter search**

**'C'** - the penalty parameter; controls the trade-off between a smooth decision boundary and classifying the training points correctly.\
**'kernel'** - specifies the type of kernel to be used in the selected algorithm.
* linear - Linear kernel, best for cases when data is linearly separable.
* rbf - Radial Basis Function kernel, handles non-linear relationships.

NOTE: We decided to omit this model because of limited compute resources.

In [9]:
# SVM experiment -- intentionally disabled: every line in this cell is commented out.
# NOTE(review): if this cell is ever re-enabled, the "Evaluate model" section below
# still scores `y_pred_dev_best_log_reg` (the logistic-regression predictions)
# instead of `y_pred_dev_best_svm`, labels recall and F1 as "precision", and
# rebinds the name `mean_squared_error` to a number.
# print("SVM")

# svm_model = SVC()
# svm_model.fit(X_train, y_train)

# y_pred_dev = svm_model.predict(X_dev)
# accuracy_dev_svm = accuracy_score(y_dev, y_pred_dev)
# print(f'\tDev Accuracy: {accuracy_dev_svm}')

# param_grid_svm = {
#     'C': [1, 10, 100],
#     'kernel': ['linear', 'rbf']
# }
# grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='accuracy')
# grid_search_svm.fit(X_train, y_train)

# print(f'\tOptimal Parameters: {grid_search_svm.best_params_}')
# best_svm = grid_search_svm.best_estimator_

# y_pred_dev_best_svm = best_svm.predict(X_dev)

# # Evaluate model
# accuracy_dev_best_log_reg = accuracy_score(y_dev, y_pred_dev_best_log_reg)
# print(f'\tDev Accuracy (using optimal params): {accuracy_dev_best_log_reg}')

# precision_dev_best_log_reg = precision_score(y_dev, y_pred_dev_best_log_reg,average='weighted')
# print(f'\tDev precision (using optimal params): {precision_dev_best_log_reg}')

# recall_score_dev_best_log_reg = recall_score(y_dev, y_pred_dev_best_log_reg,average='weighted')
# print(f'\tDev precision (using optimal params): {recall_score_dev_best_log_reg}')

# f1_score_dev_best_log_reg = f1_score(y_dev, y_pred_dev_best_log_reg,average='weighted')
# print(f'\tDev precision (using optimal params): {f1_score_dev_best_log_reg}')

# mean_squared_error = mean_squared_error(y_dev, y_pred_dev_best_log_reg)
# print(f'\tDev mean_squared_error (using optimal params): {mean_squared_error}')

# conf_matrix = confusion_matrix(y_dev, y_pred_dev_best_log_reg)
# print(f'\tDev conf_matrix (using optimal params):\n {conf_matrix}')



## **AdaBoost, GradientBoosting and XGBoost**

---


In [10]:
# Boosting ensembles to compare, each built with its default hyperparameters.
models = dict(
    AdaBoost=AdaBoostClassifier(),
    GradientBoosting=GradientBoostingClassifier(),
    XGBoost=XGBClassifier(),
)

# Fit each ensemble on the training split and report its dev-split accuracy
# together with the per-class precision/recall/F1 report.
for model_name in models:
    booster = models[model_name]
    booster.fit(X_train, y_train)
    y_pred = booster.predict(X_dev)
    accuracy = accuracy_score(y_dev, y_pred)
    print(f'{model_name} Accuracy: {accuracy:.4f}')
    print(classification_report(y_dev, y_pred))

AdaBoost Accuracy: 0.7312
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1270
           1       0.71      0.42      0.53      3373
           2       0.74      0.95      0.83      8869

    accuracy                           0.73     13512
   macro avg       0.48      0.46      0.45     13512
weighted avg       0.66      0.73      0.68     13512



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GradientBoosting Accuracy: 0.7552
              precision    recall  f1-score   support

           0       0.73      0.01      0.02      1270
           1       0.73      0.50      0.60      3373
           2       0.76      0.96      0.85      8869

    accuracy                           0.76     13512
   macro avg       0.74      0.49      0.49     13512
weighted avg       0.75      0.76      0.71     13512

XGBoost Accuracy: 0.8336
              precision    recall  f1-score   support

           0       0.55      0.16      0.24      1270
           1       0.78      0.78      0.78      3373
           2       0.86      0.95      0.91      8869

    accuracy                           0.83     13512
   macro avg       0.73      0.63      0.64     13512
weighted avg       0.81      0.83      0.81     13512



In [11]:
# Search space per boosting model (keys match the entries of `models`).
param_grids = {
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
    },
}

# Run a 5-fold, accuracy-scored grid search for each model on the training split
# and keep the best estimator found for each one.
best_estimators = {}
for model_name, grid in param_grids.items():
    search = GridSearchCV(
        estimator=models[model_name],
        param_grid=grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    best_estimators[model_name] = search.best_estimator_
    print(f'Best parameters for {model_name}: {search.best_params_}')
    print(f'Best cross-validated accuracy for {model_name}: {search.best_score_:.4f}')


Best parameters for AdaBoost: {'n_estimators': 200}
Best cross-validated accuracy for AdaBoost: 0.7494
Best parameters for GradientBoosting: {'learning_rate': 0.2, 'n_estimators': 200}
Best cross-validated accuracy for GradientBoosting: 0.7862
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best cross-validated accuracy for XGBoost: 0.8171


In [13]:
# Score every tuned boosting model on the dev split. These numbers are used for
# model selection, so they are reported as dev accuracy rather than test accuracy.
for name, model in best_estimators.items():
    y_dev_pred = model.predict(X_dev)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)
    print(f'The Model: {name}')
    print(f'Dev Accuracy: {dev_accuracy:.4f}')
    print(classification_report(y_dev, y_dev_pred))

The Model: AdaBoost
Test Accuracy: 0.7506
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1270
           1       0.69      0.55      0.61      3373
           2       0.77      0.93      0.84      8869

    accuracy                           0.75     13512
   macro avg       0.49      0.49      0.48     13512
weighted avg       0.67      0.75      0.71     13512

The Model: GradientBoosting
Test Accuracy: 0.7925
              precision    recall  f1-score   support

           0       0.55      0.05      0.09      1270
           1       0.73      0.67      0.70      3373
           2       0.81      0.94      0.87      8869

    accuracy                           0.79     13512
   macro avg       0.70      0.56      0.56     13512
weighted avg       0.77      0.79      0.76     13512

The Model: XGBoost
Test Accuracy: 0.8261
              precision    recall  f1-score   support

           0       0.57      0.13      0.22      1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test set evaluation

After training and evaluating these models, we compared their performance using accuracy as the primary metric. The XGBoost model outperformed the others, achieving the highest accuracy on the development set.

During hyperparameter tuning, we found the best parameters for XGBoost to be:



*   learning_rate: 0.2
*   max_depth: 5
*   n_estimators: 200







In [14]:
# Final evaluation: score the tuned XGBoost model on the test split.
best_model = best_estimators['XGBoost']
y_test_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print(test_report)

Test Accuracy: 0.8204
              precision    recall  f1-score   support

           0       0.55      0.12      0.20      1262
           1       0.77      0.75      0.76      3380
           2       0.85      0.95      0.89      8870

    accuracy                           0.82     13512
   macro avg       0.72      0.61      0.62     13512
weighted avg       0.80      0.82      0.79     13512

