### Importing Libraries:

In [22]:
import pandas as pd
import ast  
from ast import literal_eval
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn


### Loading Data:

In [2]:
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
validation = pd.read_csv("validation.csv")

In [14]:
X_train = train["text"]
X_test = test["text"]
X_val = validation["text"]

### Vectorizing:

In [3]:
def tokenizer_(text):
    tokens = CountVectorizer().build_tokenizer()(text)
    return tokens

In [15]:
vectorizer = CountVectorizer(tokenizer=tokenizer_)

X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)


In [6]:
mlflow.set_experiment("Assignment_2")

2024/02/24 00:25:21 INFO mlflow.tracking.fluent: Experiment with name 'Assignment_2' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/Cmi/Applied%20ML/Assignment_2/mlruns/325378683522626974', creation_time=1708714521064, experiment_id='325378683522626974', last_update_time=1708714521064, lifecycle_stage='active', name='Assignment_2', tags={}>

### Naive Bayes Model:

In [20]:
# Start MLflow run
model_name = "Naive Bayes"
with mlflow.start_run(run_name=model_name):
    # Build and train the model
    classifier = MultinomialNB()
    classifier.fit(X_train_vectorized, y_train)

    # Predict probabilities on the validation set
    val_predictions = classifier.predict(X_val_vectorized)
    
    # Calculate precision-recall curve for validation set
    precision, recall, _ = precision_recall_curve(y_val, val_predictions)

    # Calculate AUC-PR for validation set
    auc_pr_val = auc(recall, precision)

    # Log parameters and metrics for validation set
    mlflow.log_params(classifier.get_params())
    mlflow.log_metric("AUCPR_val", auc_pr_val)
    mlflow.sklearn.log_model(classifier, "model")

    print(f"{model_name} Validation AUC-PR: {auc_pr_val}")

    # Predict probabilities on the test set 
    test_predictions = classifier.predict(X_test_vectorized)

    # Calculate precision-recall curve for test set
    precision_test, recall_test, _ = precision_recall_curve(y_test, test_predictions)

    # Calculate AUC-PR for test set
    auc_pr_test = auc(recall_test, precision_test)

    # Log metrics for test set
    mlflow.log_metric("AUCPR_test", auc_pr_test)

    print(f"{model_name} Test AUC-PR: {auc_pr_test}")

    # Register the model
    mlflow.register_model(mlflow.get_artifact_uri("model"), "Naive Bayes")




Naive Bayes Validation AUC-PR: 0.9833177961924675
Naive Bayes Test AUC-PR: 0.9842469896318956


Successfully registered model 'Naive Bayes'.
Created version '1' of model 'Naive Bayes'.


### SVM:

In [23]:
# Start MLflow run
model_name_svm = "SVM"
with mlflow.start_run(run_name=model_name_svm):
    # Build and train the SVM model
    svm_classifier = SVC(probability=True)  
    svm_classifier.fit(X_train_vectorized, y_train)

    # Predict probabilities on the validation set
    val_predictions_svm = svm_classifier.predict(X_val_vectorized)

    # Calculate precision-recall curve for validation set
    precision_svm, recall_svm, _ = precision_recall_curve(y_val, val_predictions_svm)

    # Calculate AUC-PR for validation set
    auc_pr_val_svm = auc(recall_svm, precision_svm)

    # Log parameters and metrics for validation set
    mlflow.log_params(svm_classifier.get_params())
    mlflow.log_metric("AUCPR_val", auc_pr_val_svm)
    mlflow.sklearn.log_model(svm_classifier, "model")

    print(f"{model_name_svm} Validation AUC-PR: {auc_pr_val_svm}")

    # Predict probabilities on the test set
    test_predictions_svm = svm_classifier.predict(X_test_vectorized)

    # Calculate precision-recall curve for test set
    precision_test_svm, recall_test_svm, _ = precision_recall_curve(y_test, test_predictions_svm)

    # Calculate AUC-PR for test set
    auc_pr_test_svm = auc(recall_test_svm, precision_test_svm)

    # Log metrics for test set
    mlflow.log_metric("AUCPR_test", auc_pr_test_svm)

    print(f"{model_name_svm} Test AUC-PR: {auc_pr_test_svm}")

    # Register the model
    mlflow.register_model(mlflow.get_artifact_uri("model"), model_name_svm)


SVM Validation AUC-PR: 0.9632421740848977
SVM Test AUC-PR: 0.9632509728503631


Successfully registered model 'SVM'.
Created version '1' of model 'SVM'.


### Random Forest:

In [24]:
# Start MLflow run
model_name_rf = "RandomForest"
with mlflow.start_run(run_name=model_name_rf):
    # Build and train the Random Forest model
    rf_classifier = RandomForestClassifier()
    rf_classifier.fit(X_train_vectorized, y_train)

    # Predict on the validation set
    val_predictions_rf = rf_classifier.predict(X_val_vectorized)

    # Calculate precision-recall curve for validation set
    precision_rf, recall_rf, _ = precision_recall_curve(y_val, val_predictions_rf)

    # Calculate AUC-PR for validation set
    auc_pr_val_rf = auc(recall_rf, precision_rf)

    # Log parameters and metrics for validation set
    mlflow.log_params(rf_classifier.get_params())
    mlflow.log_metric("AUCPR_val", auc_pr_val_rf)
    mlflow.sklearn.log_model(rf_classifier, "model")

    print(f"{model_name_rf} Validation AUC-PR: {auc_pr_val_rf}")

    # Predict on the test set
    test_predictions_rf = rf_classifier.predict(X_test_vectorized)

    # Calculate precision-recall curve for test set
    precision_test_rf, recall_test_rf, _ = precision_recall_curve(y_test, test_predictions_rf)

    # Calculate AUC-PR for test set
    auc_pr_test_rf = auc(recall_test_rf, precision_test_rf)

    # Log metrics for test set
    mlflow.log_metric("AUCPR_test", auc_pr_test_rf)

    print(f"{model_name_rf} Test AUC-PR: {auc_pr_test_rf}")

    # Register the model
    mlflow.register_model(mlflow.get_artifact_uri("model"), model_name_rf)


RandomForest Validation AUC-PR: 0.9740529848957085
RandomForest Test AUC-PR: 0.9760570310718529


Successfully registered model 'RandomForest'.
Created version '1' of model 'RandomForest'.
