## Import packages

In [1]:
import pandas as pd
import numpy as n
import os

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, average_precision_score, recall_score


#### Load the data

In [None]:
import pandas as pd
import os

# Root folder that holds the pre-computed cross-validation splits
folds_dir = '/path/to/folds'

# One DataFrame per fold, in fold order (fold 1 ends up at index 0)
train_data_list, test_data_list = [], []

for fold in range(1, 6):
    # Read both halves of the split before recording either of them
    train_data = pd.read_csv(os.path.join(folds_dir, f'fold_{fold}_train.csv'))
    test_data = pd.read_csv(os.path.join(folds_dir, f'fold_{fold}_test.csv'))

    train_data_list.append(train_data)
    test_data_list.append(test_data)

    # Quick sanity report of what was loaded
    print(f"Fold {fold} - Train data shape: {train_data.shape}")
    print(f"Fold {fold} - Test data shape: {test_data.shape}")

# Access individual fold data using train_data_list[fold_index] and test_data_list[fold_index]

### Preprocess

In [5]:
# English stop words, held in a set for fast membership tests
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Single WordNet lemmatizer instance shared by every preprocess_text call
lemmatizer = WordNetLemmatizer()

In [6]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """
    Normalize a raw text value before vectorization.

    The text is lowercased, stripped of ASCII punctuation and digits, split on
    whitespace, filtered against the module-level ``stop_words`` set and
    lemmatized with the module-level ``lemmatizer``.

    Parameters:
    - text: The raw text. ``None`` and NaN (how pandas represents an empty
      CSV cell) are treated as empty text; any other non-string value is
      converted with ``str()`` first.

    Returns:
    - The remaining tokens joined by single spaces ('' if nothing remains).
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenization (whitespace split, so empty tokens never appear)
    tokens = text.split()

    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Re-join tokens into a string
    return ' '.join(tokens)



In [3]:
# Clean the 'Text_desc' column of every fold in place: train split first,
# then the test split stored at the same position
for i, train_fold in enumerate(train_data_list):
    train_fold['Text_desc'] = train_fold['Text_desc'].apply(preprocess_text)

    test_fold = test_data_list[i]
    test_fold['Text_desc'] = test_fold['Text_desc'].apply(preprocess_text)

#### Train

In [4]:

def train_classifier(X_train, y_train, C, regularisation):
    """
    Trains a logistic regression classifier for multi-label classification.

    One independent LogisticRegression (balanced class weights, up to 10000
    iterations) is fitted per label column via MultiOutputClassifier.

    Parameters:
    - X_train: Features for training
    - y_train: Labels for training (one column per label)
    - C: Inverse of regularization strength (smaller values regularize more)
    - regularisation: Type of regularization ('l1' or 'l2')

    Returns:
    - Trained MultiOutputClassifier model
    """
    estimator_kwargs = {
        'penalty': regularisation,
        'C': C,
        'class_weight': 'balanced',
        'max_iter': 10000,
    }

    # The default 'lbfgs' solver rejects an L1 penalty, so 'l1' needs a solver
    # that supports it. Every other penalty keeps scikit-learn's default
    # solver, so existing 'l2' results are unchanged.
    # NOTE(review): 'liblinear' assumes each label column is binary -- confirm.
    if regularisation == 'l1':
        estimator_kwargs['solver'] = 'liblinear'

    model = MultiOutputClassifier(LogisticRegression(**estimator_kwargs))
    model.fit(X_train, y_train)
    return model




In [5]:
from skmultilearn.model_selection import IterativeStratification
import numpy as np

# Accumulators kept for backward compatibility; this cell does not fill them
# (per-fold results are written to CSV instead).
probability_logits_bow = []
probability_logits_tfidf = []

# Directory that receives the per-fold predictions and true labels
new_dir = '/Users/jeremybalch/Desktop/BoW-TFIDF_logits_v10'
os.makedirs(new_dir, exist_ok=True)


def _positive_class_scores(model, X):
    """Return an (n_samples, n_labels) array of positive-class probabilities."""
    # predict_proba yields one (n_samples, n_classes) array per label; column 1
    # is taken as the positive class (assumes labels are encoded as 0/1).
    return np.hstack([proba[:, 1].reshape(-1, 1) for proba in model.predict_proba(X)])


# Use the predefined train and test splits
for fold_index, (train_data, test_data) in enumerate(zip(train_data_list, test_data_list)):
    X_train = train_data['Text_desc']
    y_train = train_data.drop(columns=['Text_desc'])
    X_test = test_data['Text_desc']
    y_test = test_data.drop(columns=['Text_desc'])

    # Fresh vectorizers per fold, fitted on the training split only
    bow_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

    X_train_bow = bow_vectorizer.fit_transform(X_train)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

    # The test split is only transformed with the vocabulary learned above
    X_test_bow = bow_vectorizer.transform(X_test)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Train one classifier per feature representation
    classifier_mybag = train_classifier(X_train_bow, y_train, C=1.0, regularisation='l2')
    classifier_tfidf = train_classifier(X_train_tfidf, y_train, C=10, regularisation='l2')

    # Predicted probabilities of the positive class (these are probabilities,
    # not logits, despite the file names used below)
    y_test_predicted_scores_mybag = _positive_class_scores(classifier_mybag, X_test_bow)
    y_test_predicted_scores_tfidf = _positive_class_scores(classifier_tfidf, X_test_tfidf)

    # Debug: Print shapes
    print("Shape of y_test_predicted_scores_mybag:", y_test_predicted_scores_mybag.shape)
    print("Shape of y_test_predicted_scores_tfidf:", y_test_predicted_scores_tfidf.shape)

    # The predictions must have one column per label of the test split,
    # otherwise the saved scores and true labels would not line up.
    n_labels = y_test.shape[1]
    if y_test_predicted_scores_mybag.shape[1] != n_labels or y_test_predicted_scores_tfidf.shape[1] != n_labels:
        raise ValueError(f"Predicted scores do not match the expected number of classes ({n_labels}).")

    # Save the predicted probabilities for each fold
    df_bow_fold = pd.DataFrame(y_test_predicted_scores_mybag)
    df_tfidf_fold = pd.DataFrame(y_test_predicted_scores_tfidf)

    df_bow_fold.to_csv(os.path.join(new_dir, f'probability_logits_bow_fold_{fold_index}.csv'), index=False)
    df_tfidf_fold.to_csv(os.path.join(new_dir, f'probability_logits_tfidf_fold_{fold_index}.csv'), index=False)

    # Save the true labels for each fold (header row keeps the label names)
    df_y_test = pd.DataFrame(y_test)
    df_y_test.to_csv(os.path.join(new_dir, f'true_labels_fold_{fold_index}.csv'), index=False)

In [None]:
### Evaluate

In [7]:
from sklearn.metrics import f1_score
import pandas as pd

# Directory holding the per-fold CSVs; every file below is resolved against
# this single location. Set it to the folder the training cell saved into.
results_dir = 'path/to'

# Probability cut-off at or above which a label is predicted as present
f1_threshold = 0.11

# One DataFrame of per-label F1 scores per fold
f1_scores_per_label = []

for fold_index in range(5):  # Assuming 5 folds
    # Load the saved predicted probabilities
    df_bow_fold = pd.read_csv(os.path.join(results_dir, f'probability_logits_bow_fold_{fold_index}.csv'))
    df_tfidf_fold = pd.read_csv(os.path.join(results_dir, f'probability_logits_tfidf_fold_{fold_index}.csv'))

    # Load the true labels for the current fold
    df_y_test = pd.read_csv(os.path.join(results_dir, f'true_labels_fold_{fold_index}.csv'))
    y_test = df_y_test.values

    # Convert probabilities to binary predictions using f1_threshold
    y_test_pred_mybag = (df_bow_fold.values >= f1_threshold).astype(int)
    y_test_pred_tfidf = (df_tfidf_fold.values >= f1_threshold).astype(int)

    # average=None returns one F1 score per label column
    f1_scores_mybag = f1_score(y_test, y_test_pred_mybag, average=None)
    f1_scores_tfidf = f1_score(y_test, y_test_pred_tfidf, average=None)

    # Store this fold's scores; 'Label' is the column position in y_test
    df_f1_scores = pd.DataFrame({
        'Fold': fold_index,
        'Label': range(y_test.shape[1]),
        'F1_Score_BoW': f1_scores_mybag,
        'F1_Score_TFIDF': f1_scores_tfidf
    })

    f1_scores_per_label.append(df_f1_scores)

# Concatenate all DataFrames into a single DataFrame
f1_scores_df = pd.concat(f1_scores_per_label, ignore_index=True)

# Optionally, save the DataFrame to a CSV file
f1_scores_df.to_csv(os.path.join(results_dir, 'f1_scores_per_label.csv'), index=False)

# Display the DataFrame
print(f1_scores_df)

     Fold  Label  F1_Score_BoW  F1_Score_TFIDF
0       0      0      0.222222        0.082192
1       0      1      0.571429        0.128205
2       0      2      0.745763        0.474227
3       0      3      0.545455        0.520000
4       0      4      0.652632        0.666667
..    ...    ...           ...             ...
100     4     16      0.773333        0.558559
101     4     17      0.181818        0.117647
102     4     18      0.740741        0.608696
103     4     19      0.818182        0.241758
104     4     20      0.461538        0.098765

[105 rows x 4 columns]


### Get min, max, and avg F1 scores

In [None]:
# Label names, taken from the header of a saved true-label file so that
# position i matches label index i in f1_scores_df. nrows=0 reads the header
# only, no data rows.
y_cols = list(pd.read_csv('path/to/true_labels_fold_0.csv', nrows=0).columns)

# Calculate min, max, and average F1 scores for each label across folds
f1_summary = f1_scores_df.groupby('Label').agg(
    Min_F1_Score_BoW=('F1_Score_BoW', 'min'),
    Max_F1_Score_BoW=('F1_Score_BoW', 'max'),
    Avg_F1_Score_BoW=('F1_Score_BoW', 'mean'),
    Min_F1_Score_TFIDF=('F1_Score_TFIDF', 'min'),
    Max_F1_Score_TFIDF=('F1_Score_TFIDF', 'max'),
    Avg_F1_Score_TFIDF=('F1_Score_TFIDF', 'mean')
).reset_index()

# Map label indices to names using y_cols
f1_summary['Label_Name'] = f1_summary['Label'].map(lambda x: y_cols[x])

# Reorder columns to have Label_Name first
f1_summary = f1_summary[['Label_Name', 'Label', 'Min_F1_Score_BoW', 'Max_F1_Score_BoW', 'Avg_F1_Score_BoW', 
                         'Min_F1_Score_TFIDF', 'Max_F1_Score_TFIDF', 'Avg_F1_Score_TFIDF']]

# Optionally, save the summary DataFrame to a CSV file
f1_summary.to_csv('path/to/f1_summary_per_label.csv', index=False)

# Display the summary DataFrame
print(f1_summary)

### Hamming Loss Across All Labels for Each Fold


In [12]:
from sklearn.metrics import hamming_loss
import numpy as np

# Per-fold label matrices; stacked into whole-dataset arrays after the loop
all_true_labels = []
all_pred_labels_mybag = []
all_pred_labels_tfidf = []

# Hamming loss of each individual fold, in fold order
hamming_loss_per_fold_mybag = []
hamming_loss_per_fold_tfidf = []

for fold_index in range(5):  # Assuming 5 folds
    # Predicted probabilities saved for this fold
    df_bow_fold = pd.read_csv(f'path/to/probability_logits_bow_fold_{fold_index}.csv')
    df_tfidf_fold = pd.read_csv(f'path/to/probability_logits_tfidf_fold_{fold_index}.csv')

    # Ground-truth label matrix saved for this fold
    df_y_test = pd.read_csv(f'path/to/true_labels_fold_{fold_index}.csv')
    y_test = df_y_test.values

    # A label counts as predicted when its probability reaches 0.15
    y_test_pred_mybag = (df_bow_fold.values >= 0.15).astype(int)
    y_test_pred_tfidf = (df_tfidf_fold.values >= 0.15).astype(int)

    # Keep this fold's matrices for the whole-dataset score
    all_true_labels.append(y_test)
    all_pred_labels_mybag.append(y_test_pred_mybag)
    all_pred_labels_tfidf.append(y_test_pred_tfidf)

    # Record this fold's own Hamming loss
    hamming_loss_per_fold_mybag.append(hamming_loss(y_test, y_test_pred_mybag))
    hamming_loss_per_fold_tfidf.append(hamming_loss(y_test, y_test_pred_tfidf))

# Stack the folds row-wise to obtain the complete dataset
all_true_labels = np.vstack(all_true_labels)
all_pred_labels_mybag = np.vstack(all_pred_labels_mybag)
all_pred_labels_tfidf = np.vstack(all_pred_labels_tfidf)

# Hamming loss over every sample of every fold
hamming_loss_mybag = hamming_loss(all_true_labels, all_pred_labels_mybag)
hamming_loss_tfidf = hamming_loss(all_true_labels, all_pred_labels_tfidf)

print(f"Hamming Loss for BoW: {hamming_loss_mybag}")
print(f"Hamming Loss for TFIDF: {hamming_loss_tfidf}")

# Report the per-fold values, BoW then TFIDF for each fold
for fold_index, (fold_loss_mybag, fold_loss_tfidf) in enumerate(
        zip(hamming_loss_per_fold_mybag, hamming_loss_per_fold_tfidf)):
    print(f"Hamming Loss for BoW (Fold {fold_index}): {fold_loss_mybag}")
    print(f"Hamming Loss for TFIDF (Fold {fold_index}): {fold_loss_tfidf}")

Hamming Loss for BoW: 0.1391561806069578
Hamming Loss for TFIDF: 0.7510486059708857
Hamming Loss for BoW (Fold 0): 0.16216216216216217
Hamming Loss for TFIDF (Fold 0): 0.7458172458172458
Hamming Loss for BoW (Fold 1): 0.12825396825396826
Hamming Loss for TFIDF (Fold 1): 0.746031746031746
Hamming Loss for BoW (Fold 2): 0.1391941391941392
Hamming Loss for TFIDF (Fold 2): 0.7411477411477412
Hamming Loss for BoW (Fold 3): 0.12839059674502712
Hamming Loss for TFIDF (Fold 3): 0.7631103074141049
Hamming Loss for BoW (Fold 4): 0.1386904761904762
Hamming Loss for TFIDF (Fold 4): 0.7583333333333333


### All metrics for each label

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score
import pandas as pd

# Directory holding the per-fold CSVs; every input file below is resolved
# against this single location. Set it to the folder the training cell saved into.
results_dir = 'path/to'

# Probability cut-off at or above which a label is predicted as present
metrics_threshold = 0.3


def _label_metrics(y_true, scores, y_pred):
    """Return (AUROC, AUPRC, accuracy, F1) for a single label column."""
    try:
        auroc = roc_auc_score(y_true, scores)
    except ValueError:
        # Raised when only one class is present in y_true. NaN (rather than
        # None) keeps the column numeric, and the per-label mean skips it.
        auroc = float('nan')

    auprc = average_precision_score(y_true, scores)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return auroc, auprc, accuracy, f1


# One record (dict) per label and fold
metrics_per_label = []

for fold_index in range(5):  # Assuming 5 folds
    # Load the saved predicted probabilities
    df_bow_fold = pd.read_csv(os.path.join(results_dir, f'probability_logits_bow_fold_{fold_index}.csv'))
    df_tfidf_fold = pd.read_csv(os.path.join(results_dir, f'probability_logits_tfidf_fold_{fold_index}.csv'))

    # Load the true labels for the current fold
    df_y_test = pd.read_csv(os.path.join(results_dir, f'true_labels_fold_{fold_index}.csv'))
    y_test = df_y_test.values

    scores_mybag = df_bow_fold.values
    scores_tfidf = df_tfidf_fold.values

    # Convert probabilities to binary predictions using metrics_threshold
    y_test_pred_mybag = (scores_mybag >= metrics_threshold).astype(int)
    y_test_pred_tfidf = (scores_tfidf >= metrics_threshold).astype(int)

    # Calculate metrics for each label column
    for label_index in range(y_test.shape[1]):
        y_true = y_test[:, label_index]

        auroc_mybag, auprc_mybag, accuracy_mybag, f1_mybag = _label_metrics(
            y_true, scores_mybag[:, label_index], y_test_pred_mybag[:, label_index])
        auroc_tfidf, auprc_tfidf, accuracy_tfidf, f1_tfidf = _label_metrics(
            y_true, scores_tfidf[:, label_index], y_test_pred_tfidf[:, label_index])

        metrics_per_label.append({
            'Label': label_index,
            'AUROC_BoW': auroc_mybag,
            'AUPRC_BoW': auprc_mybag,
            'Accuracy_BoW': accuracy_mybag,
            'F1_BoW': f1_mybag,
            'AUROC_TFIDF': auroc_tfidf,
            'AUPRC_TFIDF': auprc_tfidf,
            'Accuracy_TFIDF': accuracy_tfidf,
            'F1_TFIDF': f1_tfidf
        })

# Convert the list of records to a DataFrame
metrics_df = pd.DataFrame(metrics_per_label)

# Calculate the average metrics for each label across folds
average_metrics_df = metrics_df.groupby('Label').mean().reset_index()

# Optionally, save the average metrics DataFrame to a CSV file
average_metrics_df.to_csv('/Users/jeremybalch/Desktop/BoW-TFIDF_logits_v10/BoW_TFIDF_metrics_per_label.csv', index=False)

# Display the average metrics DataFrame
print(average_metrics_df)