In [0]:
import os
import pickle
import tempfile
from datetime import datetime

import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
from sklearn.model_selection import GridSearchCV

In [0]:
# Recipe inputs: open a handle on each prepared tweet dataset first,
# then read each one into a DataFrame.
prepared_tweets_encryption = dataiku.Dataset("prepared_tweets_encryption")
prepared_tweets_removal = dataiku.Dataset("prepared_tweets_removal")

prepared_tweets_encryption_df = prepared_tweets_encryption.get_dataframe()
prepared_tweets_removal_df = prepared_tweets_removal.get_dataframe()

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np

def apply_and_evaluate_model_with_tfidf_stratified_kfold(df, label_col='label', n_splits=5, max_features=5000):
    """Cross-validate a logistic regression on tweet metadata plus TF-IDF text features.

    The feature matrix has three parts: one-hot encoded ``date``, ``user`` and
    ``language`` columns; six standard-scaled numeric columns (missing values
    filled with 0); and unigram TF-IDF weights of the ``text`` column (missing
    texts treated as empty strings). A fresh ``LogisticRegression`` is trained
    and scored on every fold of a shuffled ``StratifiedKFold`` seeded with 42.

    NOTE(review): the scaler and the TF-IDF vocabulary are fitted on the whole
    dataset before it is split into folds, so held-out rows influence the
    preprocessing and the reported scores may be slightly optimistic.

    Args:
        df: DataFrame holding the columns ``date``, ``user``, ``language``,
            ``text``, ``tweet_length_chars``, ``tweet_length_words``,
            ``repetitive_letters``, ``mention_only``, ``unreadable``,
            ``too_many_numbers`` and ``label_col``.
        label_col: Name of the target column.
        n_splits: Number of stratified folds.
        max_features: Upper bound on the TF-IDF vocabulary size.

    Returns:
        A tuple ``(mean_accuracy, reports, mean_roc_auc)`` where ``reports`` is
        a list with one ``classification_report`` dict per fold and
        ``mean_roc_auc`` averages only the folds whose ROC AUC could be
        computed (NaN if none could). Returns ``(None, None, None)`` when the
        TF-IDF step raises ``ValueError``.
    """
    # One-hot encode the categorical columns, dropping the first level of each.
    X = pd.get_dummies(df[['date', 'user', 'language']], drop_first=True)

    # Standard-scale the numeric columns; missing values are treated as 0.
    numerical_cols = ['tweet_length_chars', 'tweet_length_words', 'repetitive_letters',
                      'mention_only', 'unreadable', 'too_many_numbers']
    scaler = StandardScaler()
    X_numerical = scaler.fit_transform(df[numerical_cols].fillna(0))
    X_numerical = pd.DataFrame(X_numerical, columns=numerical_cols, index=df.index)

    # Tabular features and target.
    X = pd.concat([X, X_numerical], axis=1)
    y = df[label_col]

    # Unigram TF-IDF on the tweet text. Terms must appear in at least 1% and at
    # most 90% of the documents; max_features caps the vocabulary size.
    tfidf = TfidfVectorizer(
        min_df=0.01,
        max_df=0.9,
        ngram_range=(1, 1),
        stop_words=None,
        max_features=max_features,
    )

    try:
        X_tfidf = tfidf.fit_transform(df['text'].fillna(''))
    except ValueError as e:
        print(f"Error during TF-IDF transformation: {e}")
        return None, None, None

    # Dense combined matrix. The explicit float conversion avoids an
    # object-dtype array when the dummy columns are boolean.
    X_combined = np.hstack((X.to_numpy(dtype=float), X_tfidf.toarray()))

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    reports = []
    roc_aucs = []

    for train_index, test_index in skf.split(X_combined, y):
        X_train, X_test = X_combined[train_index], X_combined[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        # With two classes roc_auc_score expects a 1-D score for the class with
        # the greater label; passing both probability columns raises ValueError.
        if y_pred_proba.shape[1] == 2:
            auc_scores = y_pred_proba[:, 1]
        else:
            auc_scores = y_pred_proba

        try:
            roc_auc = roc_auc_score(y_test, auc_scores, multi_class='ovr')
        except ValueError as e:
            # Best effort: report the problem and leave this fold out of the mean.
            print(f"Error during ROC AUC calculation: {e}")
            roc_auc = None

        accuracies.append(accuracy)
        reports.append(report)
        roc_aucs.append(roc_auc)

    # Average only the folds that produced a ROC AUC; NaN when none did
    # (avoids numpy's empty-mean RuntimeWarning).
    valid_roc_aucs = [auc for auc in roc_aucs if auc is not None]
    mean_roc_auc = np.mean(valid_roc_aucs) if valid_roc_aucs else float('nan')

    return np.mean(accuracies), reports, mean_roc_auc

# Apply to prepared_tweets_encryption_df

In [0]:
# Cross-validate the model on the encryption variant of the prepared tweets.
(
    accuracy_encrypted,
    report_encrypted,
    roc_auc_encrypted,
) = apply_and_evaluate_model_with_tfidf_stratified_kfold(prepared_tweets_encryption_df)

In [0]:
# Summary table of the cross-validated scores for the encryption dataset.
# The evaluation function returns None values when its TF-IDF step fails.
# Stop here with a clear message in that case: encrypted_metrics_df is
# written as a recipe output further down and must exist.
if accuracy_encrypted is None:
    raise RuntimeError(
        "Model evaluation on the encryption dataset returned no results; "
        "cannot build encrypted_metrics_df."
    )

encrypted_metrics_df = pd.DataFrame({
    'Metric': ['Average Accuracy', 'Average ROC AUC'],
    'Value': [accuracy_encrypted, roc_auc_encrypted]
})
print(encrypted_metrics_df)

In [0]:
# Build one row per (fold, class) from the per-fold classification reports.
if report_encrypted is None:
    # The evaluation function returns None reports when its TF-IDF step fails.
    raise RuntimeError(
        "No classification reports available for the encryption dataset; "
        "model evaluation did not produce results."
    )

# Aggregate entries of a classification report that are not per-class rows.
encrypted_summary_keys = ('accuracy', 'macro avg', 'weighted avg')
encrypted_fold_frames = []

for i, report in enumerate(report_encrypted):
    if not isinstance(report, dict):
        print(f"Warning: Report for Fold {i+1} is not a dictionary and cannot be converted to a DataFrame.")
        continue

    # Filter into a new dict so the reports in report_encrypted stay intact.
    per_class_scores = {
        label: scores
        for label, scores in report.items()
        if label not in encrypted_summary_keys
    }

    report_df = pd.DataFrame.from_dict(per_class_scores).transpose()
    report_df['class'] = report_df.index  # keep the class label as a regular column
    report_df['Fold'] = i + 1  # 1-based fold number
    encrypted_fold_frames.append(report_df)

# Concatenate once instead of growing the frame inside the loop.
if encrypted_fold_frames:
    encrypted_metrics_per_fold_df = pd.concat(encrypted_fold_frames, ignore_index=True)
else:
    encrypted_metrics_per_fold_df = pd.DataFrame()

# Apply to prepared_tweets_removal_df

In [0]:
# Cross-validate the model on the removal variant of the prepared tweets.
(
    accuracy_removed,
    report_removed,
    roc_auc_removed,
) = apply_and_evaluate_model_with_tfidf_stratified_kfold(prepared_tweets_removal_df)

In [0]:
# Summary table of the cross-validated scores for the removal dataset.
# The evaluation function returns None values when its TF-IDF step fails.
# Stop here with a clear message in that case: removed_metrics_df is
# written as a recipe output further down and must exist.
if accuracy_removed is None:
    raise RuntimeError(
        "Model evaluation on the removal dataset returned no results; "
        "cannot build removed_metrics_df."
    )

removed_metrics_df = pd.DataFrame({
    'Metric': ['Average Accuracy', 'Average ROC AUC'],
    'Value': [accuracy_removed, roc_auc_removed]
})
print(removed_metrics_df)

In [0]:
# Build one row per (fold, class) from the per-fold classification reports.
if report_removed is None:
    # The evaluation function returns None reports when its TF-IDF step fails.
    raise RuntimeError(
        "No classification reports available for the removal dataset; "
        "model evaluation did not produce results."
    )

# Aggregate entries of a classification report that are not per-class rows.
removed_summary_keys = ('accuracy', 'macro avg', 'weighted avg')
removed_fold_frames = []

for i, report in enumerate(report_removed):
    if not isinstance(report, dict):
        print(f"Warning: Report for Fold {i+1} is not a dictionary and cannot be converted to a DataFrame.")
        continue

    # Filter into a new dict so the reports in report_removed stay intact.
    per_class_scores = {
        label: scores
        for label, scores in report.items()
        if label not in removed_summary_keys
    }

    report_df = pd.DataFrame.from_dict(per_class_scores).transpose()
    report_df['class'] = report_df.index  # keep the class label as a regular column
    report_df['Fold'] = i + 1  # 1-based fold number
    removed_fold_frames.append(report_df)

# Concatenate once instead of growing the frame inside the loop.
if removed_fold_frames:
    removed_metrics_per_fold_df = pd.concat(removed_fold_frames, ignore_index=True)
else:
    removed_metrics_per_fold_df = pd.DataFrame()

In [0]:
# Recipe outputs: open a handle on each output dataset...
encrypted_metrics = dataiku.Dataset("encrypted_metrics")
encrypted_metrics_per_fold = dataiku.Dataset("encrypted_metrics_per_fold")
removed_metrics = dataiku.Dataset("removed_metrics")
removed_metrics_per_fold = dataiku.Dataset("removed_metrics_per_fold")

# ...then write each metrics frame, letting the DataFrame define the schema.
encrypted_metrics.write_with_schema(encrypted_metrics_df)
encrypted_metrics_per_fold.write_with_schema(encrypted_metrics_per_fold_df)
removed_metrics.write_with_schema(removed_metrics_df)
removed_metrics_per_fold.write_with_schema(removed_metrics_per_fold_df)

In [0]:
# Search space for GridSearchCV: 4 x 4 = 16 combinations.
# The "lr__" prefix presumably addresses a pipeline step named "lr";
# `pipeline` is not defined in the visible cells - TODO confirm.
alpha_candidates = [0.01, 0.1, 1, 10]
l1_ratio_candidates = [0, 0.4, 0.8, 1]

param_grid = {
    "lr__alpha": alpha_candidates,
    "lr__l1_ratio": l1_ratio_candidates,
}

In [0]:
# Exhaustive 5-fold search over param_grid, scored by negated mean squared
# error (so higher is better).
# NOTE(review): pipeline, X_train and y_train are not defined in the visible cells.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train);

In [0]:
# Tabulate the grid-search results.
cv_results = pd.DataFrame(grid_search.cv_results_)

# Keep the fit time, every hyper-parameter column and every test-score
# column except the standard deviation.
selected_cols = ["mean_fit_time"]
selected_cols.extend(
    col for col in cv_results if "param_" in col or "_test_score" in col
)
selected_cols.remove("std_test_score")

# Best experiment first, shorter names for the lr hyper-parameter columns,
# and the date of this run stamped on every row.
cv_results = (
    cv_results[selected_cols]
    .sort_values("rank_test_score")
    .rename(columns={"param_lr__alpha": "alpha",
                     "param_lr__l1_ratio": "l1_ratio"})
    .assign(date=datetime.now().date())
)
cv_results

# Outputs

In [0]:
# Shared base name for this run's artefacts: "lr_" plus a second-resolution timestamp.
artefact_name = "lr_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [0]:
# Pipeline: pickle the best estimator found by the grid search into a
# throwaway directory, then upload it to the managed folder under the same name.
pipeline_local_path = pipeline_remote_path = f"{artefact_name}.pkl"
remote_output_folder = dataiku.Folder("2T1uAdOy")

with tempfile.TemporaryDirectory() as local_tmp_dir:
    local_file_path = os.path.join(local_tmp_dir, pipeline_local_path)

    with open(local_file_path, 'wb') as pickle_out:
        pickle.dump(grid_search.best_estimator_, pickle_out)

    # Upload before the temporary directory (and the file in it) is removed.
    remote_output_folder.upload_file(pipeline_remote_path, local_file_path)

In [0]:
# Training stats: tag every row with the artefact name, then write only the
# first row (cv_results is sorted by rank) without its rank column.
cv_results["model"] = artefact_name
best_run_stats = cv_results.iloc[:1].drop(columns="rank_test_score")
dataiku.Dataset("lr_training_stats").write_with_schema(best_run_stats)

In [0]:
# Artefacts: save the feature-importance figure and upload it to the managed folder.
# NOTE(review): `ax` and `plt` are not defined in the visible cells.
fi_local_path = f"{artefact_name}_feature_importance.png"
fi_remote_path = f"{artefact_name}_feature_importance.png"
output_folder = dataiku.Folder("JPo1Lx1F")

with tempfile.TemporaryDirectory() as tmp_dir_name:
    # Write inside the temporary directory so no stray PNG is left in the
    # working directory once the upload is done.
    local_file_path = os.path.join(tmp_dir_name, fi_local_path)
    fig = ax.get_figure()
    try:
        fig.savefig(local_file_path)
        output_folder.upload_file(fi_remote_path, local_file_path)
    finally:
        # Release the figure even if saving or uploading fails.
        plt.close(fig)