In [1]:
from utils import *
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import mlflow
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.utils.class_weight import compute_class_weight
import mlflow
from utils import cleantxt

# Reproducibility seed and number of cross-validation folds
RANDOM_STATE_SEED = 42
KFOLD_SPLITS = 5

# Dataset locations (relative paths rooted at DATA_FOLDER)
DATA_FOLDER = '../../data/'
TRAIN_FILE = f'{DATA_FOLDER}train/train.csv'
TEST_FILE = f'{DATA_FOLDER}test/test_label.csv'
TEST_RAW_FILE = f'{DATA_FOLDER}test/test_raw.csv'

# Initialize MLFlow: point the client at the tracking server on localhost:5000
# and select the experiment all runs are recorded under. The URI must be set
# before the experiment is selected so the lookup goes to that server.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("baseline-models")

# Set up cross-validator: shuffled K-fold splitter with a fixed seed.
# NOTE(review): `cv` is not passed to any function visible in this section.
cv = KFold(n_splits=KFOLD_SPLITS, random_state=RANDOM_STATE_SEED, shuffle=True)

# Load and preprocess data
def load_and_preprocess_data():
    """Read the train/test CSV files and carve a validation split out of the training data.

    Only the ``text`` and ``label`` columns are read from each file. The
    labelled test text is passed through ``cleantxt(..., lower=True)``; the
    training text and the raw test text are returned exactly as read.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test,
        X_test_raw, y_test_raw)``. Each ``X_*`` is a ``text`` column and each
        ``y_*`` the matching ``label`` column. The validation pair is a 20%
        hold-out of the training file, split with ``RANDOM_STATE_SEED``.
    """
    wanted_columns = ['text', 'label']
    train_df, test_df, raw_df = (
        pd.read_csv(path, usecols=wanted_columns)
        for path in (TRAIN_FILE, TEST_FILE, TEST_RAW_FILE)
    )

    def _clean(document):
        # NOTE(review): presumably normalizes and lower-cases the text --
        # confirm against utils.cleantxt.
        return cleantxt(document, lower=True)

    # Only the labelled test file is cleaned here.
    test_df['text'] = test_df['text'].apply(_clean)

    # Hold out 20% of the training rows as a validation set.
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['text'],
        train_df['label'],
        test_size=0.2,
        random_state=RANDOM_STATE_SEED,
    )

    return (
        X_train, y_train,
        X_val, y_val,
        test_df['text'], test_df['label'],
        raw_df['text'], raw_df['label'],
    )

# Vectorize the text data using TF-IDF
def tfidf_vectorize(data, vectorizer=None):
    """Turn text documents into TF-IDF features.

    Args:
        data: Iterable of text documents.
        vectorizer: An already fitted vectorizer to reuse. When ``None``, a
            new ``TfidfVectorizer`` (unigrams and bigrams, at most 10,000
            features) is fitted on ``data``.

    Returns:
        Tuple ``(features, vectorizer)``: the transformed data and the
        vectorizer that produced it (the one passed in, or the new one).
    """
    # Reuse path: transform only, never refit a vectorizer supplied by the caller.
    if vectorizer is not None:
        return vectorizer.transform(data), vectorizer

    fitted = TfidfVectorizer(max_features=10_000, ngram_range=(1, 2))
    return fitted.fit_transform(data), fitted

# Train and evaluate model
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, run_name):
    """Fit ``model``, score it on an evaluation set and log the results to MLflow.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is (re)fitted in
            place on every call.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features of the evaluation set to predict on.
        y_test: True labels of the evaluation set.
        run_name: Name of the MLflow run; also passed as the second argument
            to ``mlflow.sklearn.log_model``.

    Returns:
        None. Weighted F1, macro F1 and recall for label ``1`` are logged as
        run metrics, and the fitted model is logged to the run.
    """
    print(f'Starting run {run_name}')
    with mlflow.start_run(nested=True, run_name=run_name):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # NOTE(review): 'f1_weigth' is a misspelling of "weighted"; it is kept
        # unchanged because renaming it would change the logged metric key.
        metrics = {
            'f1_weigth': f1_score(y_test, predictions, average='weighted'),
            'f1_macro': f1_score(y_test, predictions, average='macro'),
            'recall_pos_class': recall_score(y_test, predictions, pos_label=1),
        }
        mlflow.log_metrics(metrics)

        # Store the fitted estimator alongside its metrics.
        mlflow.sklearn.log_model(model, run_name)

# Main execution
def main():
    """Train and log the Naive Bayes and random baselines on both test sets.

    Four MLflow runs are produced, in this order: ``naive_bayes`` and
    ``random_baseline`` scored on the cleaned test set, then
    ``naive_bayes_raw`` and ``random_baseline_raw`` scored on the raw test
    set. Each run refits its model on the same training features.
    """
    # The loader also returns a validation split, but no run below evaluates
    # on it, so it is not vectorized (the original transformed it and then
    # discarded the result).
    (X_train, y_train, _X_val, _y_val,
     X_test, y_test, X_test_raw, y_test_raw) = load_and_preprocess_data()

    # Fit TF-IDF on the training text only, then reuse the fitted vectorizer
    # so every test set is projected into the same feature space.
    train_features, vectorizer = tfidf_vectorize(X_train)
    test_features, _ = tfidf_vectorize(X_test, vectorizer)
    raw_features, _ = tfidf_vectorize(X_test_raw, vectorizer)

    models = (
        ('naive_bayes', MultinomialNB(alpha=0.1)),
        # Uniform random predictions give a chance-level reference point.
        ('random_baseline',
         DummyClassifier(strategy='uniform', random_state=RANDOM_STATE_SEED)),
    )
    evaluation_sets = (
        ('', test_features, y_test),
        ('_raw', raw_features, y_test_raw),
    )

    # Outer loop over test sets keeps the original run order:
    # both models on the cleaned set first, then both on the raw set.
    for suffix, features, labels in evaluation_sets:
        for name, model in models:
            train_and_evaluate_model(
                model, train_features, y_train, features, labels, name + suffix
            )


# Entry point: run the experiments when this code is executed directly
# (script or notebook cell), but not when the module is imported.
if __name__ == "__main__":
    main()

2023/07/28 18:25:08 INFO mlflow.tracking.fluent: Experiment with name 'baseline-models' does not exist. Creating a new experiment.


Starting run naive_bayes


Starting run random_baseline
Starting run naive_bayes_raw
Starting run random_baseline_raw
