## Loading data

In [3]:
import pandas as pd

In [4]:
# Load the raw dataset into a DataFrame. The path is relative, so it is resolved
# against the kernel's current working directory.
# Later cells read the 'text' column (preprocessing) and the 'spam' column (label).
data = pd.read_csv("../data/raw_data.csv")

## Preprocessing the data

In [5]:
from sklearn.model_selection import train_test_split
import nltk
import string
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/sbose/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sbose/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Build the English stop-word set used by the preprocessing cell.
# This rebinds `stopwords` from the NLTK corpus reader to a plain set, so the guard
# keeps the cell safe to re-run: without it a second execution would call `.words`
# on the set and raise AttributeError.
if not isinstance(stopwords, (set, frozenset)):
    stopwords = set(stopwords.words('english'))

In [13]:
# Preprocessing: normalise the raw text, then replace the 'text' column with
# bag-of-words count features (one integer-named column per vocabulary term).
lemmatizer = WordNetLemmatizer()

# Lower casing
data['text'] = data['text'].str.lower()

# Removing punctuation
data['text'] = data['text'].apply(lambda x: ''.join([char for char in x if char not in string.punctuation]))

# Tokenization
data['text'] = data['text'].apply(lambda x: x.split())

# Removing stop words
data['text'] = data['text'].apply(lambda x: [word for word in x if word not in stopwords])

# Lemmatization
data['text'] = data['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Joining the tokens
data['text'] = data['text'].apply(lambda x: ' '.join(x))

# Removing empty strings
# (this leaves gaps in data's index wherever a row is dropped)
data = data[data['text'] != '']

# Count Vectorization
# NOTE(review): the vocabulary is fitted on every row before the train/validation/test
# split in the next cell, so held-out text influences the feature space.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
temp = cv.fit_transform(data.text)
# pd.concat(axis=1) aligns rows on their index labels. The count matrix must therefore
# carry data's (gappy, post-filter) index; a default 0..n-1 index would pair counts
# with the wrong rows and introduce all-NaN rows.
# toarray() materialises the sparse matrix densely, which can be large for big vocabularies.
data = pd.concat([data, pd.DataFrame(temp.toarray(), index=data.index)], axis=1)
data = data.drop('text', axis=1)

In [14]:
# First cut: keep 80% of the rows for training and hold out the remaining 20%.
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
# Second cut: halve the hold-out, one half for validation and the other for testing.
validation_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

In [15]:
# helper function for plotting the aucpr curve
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc

def plot_aucpr_curve(y_true, y_probs, model_name):
    """Plot a precision-recall curve and display it.

    Args:
        y_true: Ground-truth labels, forwarded to ``precision_recall_curve``.
        y_probs: Predicted scores, forwarded to ``precision_recall_curve``.
        model_name: Text shown in the legend next to the AUC-PR value.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Curve points; the thresholds array is not needed for the plot.
    precision, recall, _ = precision_recall_curve(y_true, y_probs)

    # Area under the precision-recall curve (recall on x, precision on y).
    aucpr = auc(recall, precision)

    # Draw on an explicit Axes rather than through the pyplot state machine.
    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(
        recall,
        precision,
        color='darkorange',
        lw=2,
        label=f'{model_name} (AUC-PR = {aucpr:.2f})',
    )
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    ax.legend(loc='lower right')
    plt.show()


## Connecting the notebook to the MLflow tracking server

In [16]:
import mlflow
from mlflow.models import infer_signature
# Send the runs started in the model cells below to the MLflow tracking server at
# this local address (presumably one must already be listening there — confirm).
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [17]:
# Training split: every column except the 'spam' label is a feature.
X_train = train_data.drop(columns='spam')
y_train = train_data['spam']

In [18]:
# Validation split: every column except the 'spam' label is a feature.
X_validation = validation_data.drop(columns='spam')
y_validation = validation_data['spam']

In [19]:
# Test split: every column except the 'spam' label is a feature.
X_test = test_data.drop(columns='spam')
y_test = test_data['spam']

## SVC Model

In [20]:
from sklearn.svm import SVC

In [None]:
# Train an SVC with default hyper-parameters, score it on the validation split and
# record parameters, metric and model in a single MLflow run.
# Imported here so the cell does not depend on a name no other cell provides.
from sklearn.metrics import mean_squared_error

with mlflow.start_run() as run:
    model = SVC()
    model.fit(X_train, y_train)
    # The validation split is named X_validation / y_validation in this notebook.
    y_pred = model.predict(X_validation)
    # A signature maps model input to model output, so pass the predictions
    # rather than the ground-truth labels.
    signature = infer_signature(X_validation, y_pred)
    # Log the hyper-parameters the estimator was actually built with.
    mlflow.log_params(model.get_params())
    # Metric key kept as "mse": mean squared error between true and predicted labels.
    mlflow.log_metrics({"mse": mean_squared_error(y_validation, y_pred)})
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="SVC Model",
    )

## Naive Bayes Model

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train a multinomial Naive Bayes model with default hyper-parameters, score it on
# the validation split and record parameters, metric and model in a single MLflow run.
# Imported here so the cell does not depend on a name no other cell provides.
from sklearn.metrics import mean_squared_error

with mlflow.start_run() as run:
    model = MultinomialNB()
    model.fit(X_train, y_train)
    # The validation split is named X_validation / y_validation in this notebook.
    y_pred = model.predict(X_validation)
    # A signature maps model input to model output, so pass the predictions
    # rather than the ground-truth labels.
    signature = infer_signature(X_validation, y_pred)
    # Log the hyper-parameters the estimator was actually built with.
    mlflow.log_params(model.get_params())
    # Metric key kept as "mse": mean squared error between true and predicted labels.
    mlflow.log_metrics({"mse": mean_squared_error(y_validation, y_pred)})
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="Naive Bayes Model",
    )

## Random Forest Model

In [1]:
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Train a random forest, score it on the validation split and record parameters,
# metric and model in a single MLflow run.
# This cell needs the MLflow setup cell and the X/y split cells to have run first.
# Imported here so the cell does not depend on a name no other cell provides.
from sklearn.metrics import mean_squared_error

with mlflow.start_run() as run:
    # Fixed seed (same value as the data splits) so the forest is reproducible.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    # The validation split is named X_validation / y_validation in this notebook.
    y_pred = model.predict(X_validation)
    # A signature maps model input to model output, so pass the predictions
    # rather than the ground-truth labels.
    signature = infer_signature(X_validation, y_pred)
    # Log the hyper-parameters the estimator was actually built with.
    mlflow.log_params(model.get_params())
    # Metric key kept as "mse": mean squared error between true and predicted labels.
    mlflow.log_metrics({"mse": mean_squared_error(y_validation, y_pred)})
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="Random Forest Model",
    )

NameError: name 'mlflow' is not defined