In [None]:
from google.colab import files

# Ask the user to upload files through the Colab helper. The captured output
# below shows the three TSV files that the later loading code reads by name.
uploaded = files.upload()

Saving in_domain_dev.tsv to in_domain_dev.tsv
Saving in_domain_train.tsv to in_domain_train.tsv
Saving out_of_domain_dev.tsv to out_of_domain_dev.tsv


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn import metrics
from imblearn.over_sampling import ADASYN
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef

def _read_split(tsv_path):
    """Read one tab-separated split and pull out its text and label columns.

    Returns the full frame, the column at position 3 (used as the text) and
    the column at position 1 (used as the label).
    """
    # NOTE(review): read_csv treats the first row as a header by default. If
    # these TSVs ship without a header row, the first example is silently
    # consumed as column names -- confirm against the files.
    frame = pd.read_csv(tsv_path, sep='\t')
    return frame, frame.iloc[:, 3], frame.iloc[:, 1]


# Training split, in-domain validation split, and out-of-domain split.
train_data, train_texts, train_labels = _read_split("in_domain_train.tsv")
valid_data, val_texts, val_labels = _read_split("in_domain_dev.tsv")
test_data, test_texts, test_labels = _read_split("out_of_domain_dev.tsv")

# Count-based n-gram features (ngram_range 1 to 3). The vocabulary is fitted
# on the training texts only and then reused for the other two splits.
vectorizer = CountVectorizer(ngram_range=(1, 3))
train_vec = vectorizer.fit_transform(train_texts)
val_vec, test_vec = map(vectorizer.transform, (val_texts, test_texts))

# Oversample the training matrix with ADASYN; validation and test features
# are left untouched.
ada = ADASYN(random_state=42)
train_vec_resampled, train_labels_resampled = ada.fit_resample(train_vec, train_labels)

# Candidate values sampled by the randomized search.
# n_estimators: 10 evenly spaced integers between 200 and 2000.
n_estimators = list(map(int, np.linspace(200, 2000, 10)))
max_features = ['log2', 'sqrt']
# max_depth: 11 evenly spaced integers between 10 and 110, plus None.
max_depth = [*map(int, np.linspace(10, 110, 11)), None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# Randomized hyperparameter search: 100 sampled candidates, each scored with
# 3-fold cross-validation on the resampled training data, using all cores.
rf_classifier_random = RandomForestClassifier(random_state=42)
random_search_options = {
    'n_iter': 100,
    'cv': 3,
    'verbose': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_random = RandomizedSearchCV(
    rf_classifier_random,
    random_grid,
    **random_search_options
)
rf_random.fit(train_vec_resampled, train_labels_resampled)

# Report the winning parameter combination.
print("Best Hyperparameters from Random Search:", rf_random.best_params_, sep="\n")


def evaluate(model, test_features, test_labels):
    """Score a fitted binary classifier on a labelled set and print a summary.

    Args:
        model: Fitted estimator exposing ``predict``. If it also exposes
            ``predict_proba`` (two columns) or a 1-D ``decision_function``,
            that continuous output is used for ROC AUC.
        test_features: Feature matrix accepted by the model's methods.
        test_labels: Ground-truth binary labels aligned with ``test_features``.

    Returns:
        Tuple ``(accuracy, precision, recall, roc_auc, mcc)``; the values are
        the raw metric results (fractions, not percentages).
    """
    predictions = model.predict(test_features)

    # ROC AUC measures ranking quality, so it needs continuous scores. Feeding
    # it hard 0/1 predictions collapses the curve to a single threshold.
    # Fall back to the hard labels only when no score output is available.
    scores = predictions
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(test_features)
        if getattr(proba, 'ndim', 0) == 2 and proba.shape[1] == 2:
            # Second column is the probability of the greater class label,
            # which roc_auc_score treats as the positive class.
            scores = proba[:, 1]
    elif hasattr(model, 'decision_function'):
        margins = model.decision_function(test_features)
        if getattr(margins, 'ndim', 0) == 1:
            scores = margins

    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    roc_auc = roc_auc_score(test_labels, scores)
    mcc = matthews_corrcoef(test_labels, predictions)

    print('Model Performance:')
    print('Accuracy = {:0.2f}%'.format(accuracy * 100))
    print('Precision = {:0.2f}%'.format(precision * 100))
    print('Recall = {:0.2f}%'.format(recall * 100))
    print('ROC AUC = {:0.2f}%'.format(roc_auc * 100))
    print('MCC = {:0.2f}'.format(mcc))

    return accuracy, precision, recall, roc_auc, mcc

# Best estimator found by the randomized search.
best_random = rf_random.best_estimator_
# Print and compute metrics on the features built from out_of_domain_dev.tsv.
evaluate(best_random, test_vec, test_labels)


# Exhaustive grid search over a narrower set of values, scored with 3-fold
# cross-validation on the resampled training data.
param_grid = dict(
    bootstrap=[True],
    max_depth=[10, 20, 30, 40],
    max_features=['log2', 'sqrt'],
    min_samples_leaf=[1, 3, 5],
    min_samples_split=[2, 3, 4],
    n_estimators=[900, 1000, 1100],
)

rf_classifier_grid = RandomForestClassifier(random_state=42)
grid_search_options = {'cv': 3, 'n_jobs': -1, 'verbose': 2}
grid_search = GridSearchCV(rf_classifier_grid, param_grid, **grid_search_options)
grid_search.fit(train_vec_resampled, train_labels_resampled)

# Report the winning parameter combination.
print("Best Hyperparameters from Grid Search:", grid_search.best_params_, sep="\n")

# Evaluate the best grid-search estimator on the out_of_domain_dev.tsv features.
best_grid = grid_search.best_estimator_
evaluate(best_grid, test_vec, test_labels)



Fitting 3 folds for each of 100 candidates, totalling 300 fits
