In [2]:
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [3]:
# Read the TF-IDF feature matrices. The first column of each file is dropped
# before converting to NumPy (presumably the index column written when the
# CSV was saved -- TODO confirm against the script that produced the files).
X_train = pd.read_csv('train_tfidf.csv')
X_train = X_train.drop(columns=X_train.columns[0]).to_numpy()
X_test = pd.read_csv('test_tfidf.csv')
X_test = X_test.drop(columns=X_test.columns[0]).to_numpy()

# Read the raw labels and flatten each file into a plain Python list.
pd_train = list(pd.read_csv('train_labels.csv').to_numpy().ravel())
pd_test = list(pd.read_csv('test_labels.csv').to_numpy().ravel())

# Binary encoding: "non-suicide" -> 1, every other label value -> 0.
y_train = np.array([int(label == "non-suicide") for label in pd_train])
y_test = np.array([int(label == "non-suicide") for label in pd_test])

## AdaBoost

In [4]:
# Untuned AdaBoost baseline: 100 boosting rounds with the library's default
# base learner, seeded so repeated runs give the same model.
clf = AdaBoostClassifier(random_state=0, n_estimators=100)
clf.fit(X=X_train, y=y_train)

In [5]:
# Predict a class label for every row of the test matrix. With the encoding
# used to build y_train, 1 means "non-suicide" and 0 means any other label.
clf.predict(X_test)

array([0, 0, 0, ..., 1, 1, 0])

## Comparing train and test accuracy to assess bias and variance

In [6]:
# Accuracy of the untuned AdaBoost model on the held-out test split.
print(f"ADA Boost Test Accuracy (no hyperparameter tuning): {clf.score(X_test,y_test)}")

ADA Boost Test Accuracy (no hyperparameter tuning): 0.8924


In [7]:
# Accuracy of the same model on the data it was fitted on; compare with the
# test accuracy to see how large the train/test gap is.
print(f"ADA Boost Train Accuracy (no hyperparameter tuning): {clf.score(X_train,y_train)}")

ADA Boost Train Accuracy (no hyperparameter tuning): 0.8948444444444444


In [17]:
# Candidate base learners for AdaBoost. Boosting is built around *weak*
# learners, so the candidates are depth-limited trees:
#   * an unrestricted DecisionTreeClassifier() can fit the training set
#     (almost) perfectly in the first round, which leaves nothing to boost and
#     makes the n_estimators / learning_rate settings largely irrelevant;
#   * RandomForestClassifier and SVC(probability=True) base learners make every
#     boosting round so expensive that a cross-validated search over them on
#     this data is impractical.
# random_state is fixed so repeated searches are comparable.
base_classifier_1 = DecisionTreeClassifier(max_depth=1, random_state=0)  # decision stump
base_classifier_2 = DecisionTreeClassifier(max_depth=2, random_state=0)
base_classifier_3 = DecisionTreeClassifier(max_depth=3, random_state=0)

# Define the parameter grid (3 x 3 x 4 = 36 candidates per CV fold).
param_grid = {
    'estimator': [base_classifier_1, base_classifier_2, base_classifier_3],  # Weak-learner complexity
    'n_estimators': [50, 100, 200],              # Number of weak learners
    'learning_rate': [0.01, 0.1, 0.5, 1.0],      # Shrinkage applied to each learner's contribution
    # 'algorithm' is intentionally not searched: 'SAMME.R' was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so listing it makes those candidates
    # fail on current releases. The search therefore uses whatever algorithm
    # the AdaBoostClassifier passed to GridSearchCV is configured with.
}

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, starting from the
# baseline AdaBoost configuration held in clf.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Report the parameter combination with the best mean cross-validation score.
print("Best Parameters: ", grid_search.best_params_)

## XGBoost

In [21]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/76/e8/260752c1bc8dc856b27cfefaa3fca29ba02a15f649bbf6e9c15ed54e5480/xgboost-2.0.2-py3-none-macosx_12_0_arm64.whl.metadata
  Downloading xgboost-2.0.2-py3-none-macosx_12_0_arm64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.2-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.2


In [8]:
from xgboost import XGBClassifier

In [9]:
# Gradient-boosted trees for the binary task: 100 trees of depth up to 10,
# each added with a small (0.01) learning rate.
bst_xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    max_depth=10,
    n_estimators=100,
)
bst_xgb.fit(X_train, y_train)

# Hard class predictions for the held-out test rows.
preds = bst_xgb.predict(X_test)

In [10]:
# Display the XGBoost test-set predictions (long arrays are abbreviated in the output).
preds

array([0, 0, 0, ..., 1, 1, 0])

In [11]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.87


## Initial XGBoost baseline (small, shallow model)

In [12]:
# Tiny comparison model: only two trees of depth 2 with a learning rate of 1,
# to show how a much smaller ensemble scores on the same test split.
bst_xgb_poor = XGBClassifier(
    objective='binary:logistic',
    learning_rate=1,
    max_depth=2,
    n_estimators=2,
)
bst_xgb_poor.fit(X_train, y_train)

preds_poor = bst_xgb_poor.predict(X_test)
print(f'Accuracy Poor: {accuracy_score(y_test, preds_poor):.2f}')

Accuracy Poor: 0.82
