In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
from datetime import datetime

In [0]:
# Open the "tweets_train" Dataiku dataset and load it into a pandas DataFrame;
# train_df is the input to the training cell further down.
tweets_train = dataiku.Dataset("tweets_train")
train_df = tweets_train.get_dataframe()

In [0]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_data(df: pd.DataFrame, tfidf: "TfidfVectorizer | None" = None, label_col: str = 'label') -> "tuple[np.ndarray | None, pd.Series | None, TfidfVectorizer | None]":
    """
    Builds the model input matrix and encoded labels from a tweets DataFrame.

    The hand-crafted numeric features are standardised, the label column is
    integer-encoded, the 'text' column is vectorised with TF-IDF, and the
    scaled features and TF-IDF features are concatenated column-wise.

    Parameters:
    df (pd.DataFrame): Input data. Must contain the six feature columns listed
        in `features` below, a 'text' column and the `label_col` column.
    tfidf (TfidfVectorizer | None): An already fitted vectorizer to reuse
        (only `transform` is called), or None to create and fit a new one on
        `df['text']`.
    label_col (str): Name of the label column. Default is 'label'.

    Returns:
    tuple: (X_combined, y, tfidf) where X_combined is a dense 2-D numpy array
        (scaled features first, then TF-IDF columns), y is a pd.Series of
        encoded labels with a fresh 0..n-1 index, and tfidf is the vectorizer
        that was used. If fitting a new vectorizer raises ValueError, the
        error is printed and (None, None, None) is returned.

    NOTE(review): the StandardScaler and LabelEncoder are re-fitted on every
    call and are not returned, so calling this on new data with a pre-fitted
    `tfidf` scales and encodes that data independently of the training data.
    """
    features = ['tweet_length_chars', 'tweet_length_words', 'repetitive_letters',
                'mention_only', 'unreadable', 'too_many_numbers']

    # Missing feature values are treated as 0 before standardisation.
    scaler = StandardScaler()
    X_scaled = np.asarray(scaler.fit_transform(df[features].fillna(0)))

    # Encode label
    label_encoder = LabelEncoder()
    y = pd.Series(label_encoder.fit_transform(df[label_col]))

    # Missing tweets are vectorised as empty strings.
    text = df['text'].fillna('')

    if tfidf is None:
        tfidf = TfidfVectorizer(
            min_df=0.01,  # drop terms that appear in fewer than 1% of the documents
            max_df=0.9,   # drop terms that appear in more than 90% of the documents
            ngram_range=(1, 1),
            stop_words=None
        )

        try:
            X_tfidf = tfidf.fit_transform(text)
        except ValueError as e:
            # Raised by scikit-learn e.g. when no term survives the min_df/max_df pruning.
            print(f"Error during TF-IDF transformation: {e}")
            # Same arity as the success path so callers can always unpack three values.
            return None, None, None
    else:
        X_tfidf = tfidf.transform(text)

    # Combine TF-IDF features with other features (TF-IDF is densified here).
    X_combined = np.hstack((X_scaled, X_tfidf.toarray()))

    return X_combined, y, tfidf


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

def train_model_with_tfidf_stratified_kfold(df: pd.DataFrame, n_splits: int = 5, max_features: int = 5000) -> "tuple[pd.DataFrame, list, LogisticRegression, TfidfVectorizer]":
    """
    Applies a logistic regression model using TF-IDF features and evaluates it with stratified K-fold cross-validation.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the dataset.
    n_splits (int): The number of splits for stratified K-fold cross-validation. Default is 5.
    max_features (int): Currently unused; kept so existing callers keep working.
        The TF-IDF vectorizer is created inside preprocess_data without a
        feature cap.

    Returns:
    tuple: (metrics_df, reports, model, tfidf) where metrics_df is a one-row
        DataFrame with the metrics averaged over the folds, reports is the
        list of per-fold classification_report dictionaries, model is the
        LogisticRegression fitted in the LAST fold (it has not seen that
        fold's test split), and tfidf is the fitted TF-IDF vectorizer.

    Raises:
    ValueError: If preprocessing could not build the feature matrix.

    NOTE(review): the scaler and the TF-IDF vocabulary are fitted on the whole
    dataset before it is split into folds, so the test folds influence the
    features used in training.
    """

    X_combined, y, tfidf = preprocess_data(df, None)
    if X_combined is None:
        # preprocess_data signals a TF-IDF failure with None values; stop with a clear error.
        raise ValueError("preprocess_data failed to build the feature matrix; cannot train the model.")

    # Step 4: Stratified K-Fold Cross Validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    reports = []
    roc_aucs = []

    for train_index, test_index in skf.split(X_combined, y):

        X_train, X_test = X_combined[train_index], X_combined[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh model per fold so no state carries over between folds
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Evaluate the model; precision/recall/F1 are support-weighted across classes
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recalls.append(recall_score(y_test, y_pred, average='weighted', zero_division=0))
        f1s.append(f1_score(y_test, y_pred, average='weighted', zero_division=0))

        if y_pred_proba.shape[1] == 2:
            # Binary target: roc_auc_score expects the 1-D score of the positive
            # class (classes_[1]); passing both columns raises a ValueError.
            roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
        else:
            roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
        roc_aucs.append(roc_auc)

        reports.append(classification_report(y_test, y_pred, output_dict=True, zero_division=0))

    metrics_df = pd.DataFrame({
        'Mean Accuracy': [np.mean(accuracies)],
        'Mean Precision': [np.mean(precisions)],
        'Mean Recall': [np.mean(recalls)],
        'Mean F1 Score': [np.mean(f1s)],
        'Mean ROC AUC': [np.mean(roc_aucs)],
        'Model Name': ['Logistic Regression'],
        'Date and Time': [datetime.now().strftime('%Y-%m-%d %H:%M:%S')]
    })

    return metrics_df, reports, model, tfidf

# Train the logistic regression model on train_df

In [0]:
# Run the cross-validated training on train_df with the default settings.
# Unpacks, in order: the one-row summary metrics frame, the per-fold
# classification reports, the returned model and the TF-IDF vectorizer.
lr_metrics_df, reports, lr_model, tfidf = train_model_with_tfidf_stratified_kfold(train_df)

In [0]:
# Flatten the per-fold classification reports into one long table:
# one row per (fold, class) with that class's metrics.

# Report entries that summarise across classes rather than describing a single class.
summary_keys = ('accuracy', 'macro avg', 'weighted avg')
fold_frames = []

for i, report in enumerate(reports):
    if not isinstance(report, dict):
        print(f"Warning: Report for Fold {i+1} is not a dictionary and cannot be converted to a DataFrame.")
        continue

    # Filter into a new dict instead of popping, so `reports` is left untouched
    # and this cell gives the same result when re-run.
    per_class = {label: scores for label, scores in report.items() if label not in summary_keys}

    report_df = pd.DataFrame.from_dict(per_class).transpose()
    report_df['class'] = report_df.index  # Save the key of each dictionary into a new column called "class"
    report_df['Fold'] = i + 1  # Add the fold number to the DataFrame
    fold_frames.append(report_df)

# Concatenate once at the end (avoids quadratic growth and concatenating onto an empty frame).
metrics_per_fold_df = pd.concat(fold_frames, ignore_index=True) if fold_frames else pd.DataFrame()

## Save pickle

In [0]:
# Timestamp taken when this cell runs; it is embedded in the artefact name so
# each run produces a distinctly named model file.
date_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
lr_artefact_name = f"lr_{date_time}"
# Tag every per-fold row with the artefact name and timestamp of this run.
metrics_per_fold_df["model"] = lr_artefact_name
metrics_per_fold_df["date_time"] = date_time

In [0]:
# These modules are not imported anywhere else in the notebook; without them
# this cell fails with NameError on a fresh kernel.
import os
import pickle
import tempfile

# Pickle the logistic regression model and upload it to the Dataiku managed folder.
# NOTE(review): only lr_model is saved here; the fitted TF-IDF vectorizer
# (`tfidf`) is not persisted by this cell - confirm whether scoring needs it.
artefact_pickle_name = f"{lr_artefact_name}.pkl"
models_data_folder = dataiku.Folder("VQ6fLov2")

# The pickle is written to a temporary directory first because upload_file
# takes a local file path; the directory is removed when the block exits.
with tempfile.TemporaryDirectory() as temp_dir:

    local_file_path = os.path.join(temp_dir, artefact_pickle_name)

    with open(local_file_path, 'wb') as file:
        pickle.dump(lr_model, file)

    # Upload after the file handle is closed so the full pickle is on disk.
    models_data_folder.upload_file(artefact_pickle_name, local_file_path)

# Create output datasets

In [0]:
# Recipe outputs
# Write the one-row summary metrics frame to the "lr_metrics" dataset.
lr_metrics = dataiku.Dataset("lr_metrics")
lr_metrics.write_with_schema(lr_metrics_df)

# Write the per-fold, per-class metrics table to the "metrics_per_fold" dataset.
metrics_per_fold = dataiku.Dataset("metrics_per_fold")
metrics_per_fold.write_with_schema(metrics_per_fold_df)
