In [1]:
# Import necessary libraries
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Directories used by this notebook, followed by the CSV files derived from them
data_path = "../data/"
model_save_path = "../models/"
train_file = f"{data_path}train_data.csv"
val_file = f"{data_path}val_data.csv"


In [2]:

# Load training and validation data
def load_data(train_path, val_path):
    """
    Read the training and validation CSV files.

    Parameters
    ----------
    train_path : path of the training CSV, passed to ``pd.read_csv``.
    val_path : path of the validation CSV, passed to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(train_frame, val_frame)`` as two pandas DataFrames, in that order.
    """
    train_frame = pd.read_csv(train_path)
    val_frame = pd.read_csv(val_path)
    return train_frame, val_frame

# Read the train/validation CSVs into DataFrames
train_df, val_df = load_data(train_file, val_file)

# Drop every row that has no 'clean_text' value (applied to both splits)
train_df, val_df = (
    frame.dropna(subset=['clean_text']) for frame in (train_df, val_df)
)

# Separate the text inputs from the target labels
X_train = train_df['clean_text']
y_train = train_df['category']
X_val = val_df['clean_text']
y_val = val_df['category']

In [3]:

# Convert text into TF-IDF features
def vectorize_text(X_train, X_val, max_features=10000, ngram_range=(1, 2)):
    """
    Vectorize text data using TF-IDF.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_val``, so the validation texts do not influence the learned
    vocabulary or IDF weights.

    Parameters
    ----------
    X_train : raw training documents, passed to ``fit_transform``.
    X_val : raw validation documents, passed to ``transform``.
    max_features : int, default 10000
        Forwarded to ``TfidfVectorizer(max_features=...)``.
    ngram_range : tuple, default (1, 2)
        Forwarded to ``TfidfVectorizer(ngram_range=...)``; the default
        extracts unigrams and bigrams.

    Returns
    -------
    tuple
        ``(X_train_tfidf, X_val_tfidf, vectorizer)`` - the two transformed
        feature matrices followed by the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    # Only transform here: re-fitting on validation data would leak information
    X_val_tfidf = vectorizer.transform(X_val)
    return X_train_tfidf, X_val_tfidf, vectorizer

# Turn both splits into TF-IDF matrices and keep a handle on the fitted vectorizer
X_train_tfidf, X_val_tfidf, tfidf_vectorizer = vectorize_text(X_train=X_train, X_val=X_val)

In [None]:
# Train and evaluate baseline models: Logistic Regression, Naive Bayes, SVM
def train_baseline_models(X_train, y_train, X_val, y_val, save_dir=None):
    """
    Train and evaluate baseline models: Logistic Regression, Naive Bayes, SVM.

    Each model is fitted on the training data, scored on the validation
    data, pickled to disk, and its classification report is printed.

    Parameters
    ----------
    X_train, y_train : training features and labels, passed to ``model.fit``.
    X_val, y_val : validation features and labels used for the metrics.
    save_dir : str, optional
        Directory where the pickled models are written. Defaults to the
        notebook-level ``model_save_path``. The directory is created if
        it does not exist.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys ``"Accuracy"``,
        ``"F1-score"``, ``"Precision"`` and ``"Recall"``; the last three
        use ``average='weighted'``.
    """
    if save_dir is None:
        save_dir = model_save_path
    # Create the output directory before any training starts, so a missing
    # folder cannot make pd.to_pickle fail after a model has been fitted.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # NOTE(review): probability=True makes SVC fitting noticeably slower
    # (scikit-learn runs an internal cross-validation) and predict_proba is
    # not used in this function - confirm that consumers of the saved
    # model need it before keeping the flag.
    models = {
        "Logistic Regression": LogisticRegression(max_iter=500),
        "Naive Bayes": MultinomialNB(),
        "SVM": SVC(kernel='linear', probability=True)
    }
    results = {}

    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")

        # Fit model to training data
        model.fit(X_train, y_train)

        # Make predictions on validation set
        y_val_preds = model.predict(X_val)

        # Calculate metrics
        results[model_name] = {
            "Accuracy": accuracy_score(y_val, y_val_preds),
            "F1-score": f1_score(y_val, y_val_preds, average='weighted'),
            "Precision": precision_score(y_val, y_val_preds, average='weighted'),
            "Recall": recall_score(y_val, y_val_preds, average='weighted')
        }

        # Save model to file; os.path.join keeps the path valid whether or
        # not save_dir ends with a separator
        model_file = os.path.join(save_dir, f"{model_name.replace(' ', '_')}.pkl")
        pd.to_pickle(model, model_file)
        print(f"{model_name} saved to {model_file}")

        # Print classification report
        print(f"\nClassification Report for {model_name}:\n{classification_report(y_val, y_val_preds)}")

    return results

# Fit every baseline on the TF-IDF features and collect the validation metrics
baseline_results = train_baseline_models(
    X_train=X_train_tfidf,
    y_train=y_train,
    X_val=X_val_tfidf,
    y_val=y_val,
)



Training Logistic Regression...
Logistic Regression saved to ../models/Logistic_Regression.pkl

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

        -1.0       0.91      0.80      0.85      5682
         0.0       0.90      0.98      0.94      8832
         1.0       0.93      0.93      0.93     11560

    accuracy                           0.92     26074
   macro avg       0.92      0.90      0.91     26074
weighted avg       0.92      0.92      0.92     26074


Training Naive Bayes...
Naive Bayes saved to ../models/Naive_Bayes.pkl

Classification Report for Naive Bayes:
              precision    recall  f1-score   support

        -1.0       0.85      0.47      0.61      5682
         0.0       0.84      0.71      0.77      8832
         1.0       0.69      0.92      0.79     11560

    accuracy                           0.75     26074
   macro avg       0.79      0.70      0.72     26074
weighted avg       0.77      0.75    

In [None]:
# Print a header followed by one "<model>: <metrics dict>" line per baseline
print(
    "\nBaseline Model Results:",
    *(f"{name}: {scores}" for name, scores in baseline_results.items()),
    sep="\n",
)
