In [1]:
# pd.set_option('display.max_colwidth', None)

In [2]:
import pandas as pd
import re
from textblob import TextBlob
import numpy as np
from scipy.stats import shapiro


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.stats import ttest_rel
from sklearn.model_selection import cross_val_score

from imblearn.over_sampling import SMOTE


### Lyric preprocessing & Sentiment annotation

Read data from local csv

In [3]:
df = pd.read_csv("songs_with_lyrics_clean.csv")

Clean lyrics from the following:
* \n
* [something]

Also explored invalid urls

In [4]:
def clean_lyrics(lyrics):
    # Remove text within brackets
    cleaned_lyrics = re.sub(r"\[.*?\]|\(.*?\)", "", lyrics)

    # Capitalize the first letter of each line
    cleaned_lyrics = cleaned_lyrics.replace('\n', " ")
    cleaned_lyrics = cleaned_lyrics.strip(" ")

    return cleaned_lyrics

In [5]:
# Apply the cleaning function to your DataFrame
df['lyrics'] = df['lyrics'].apply(clean_lyrics)

After data exploration, we realize that some extra and more custom cleaning is needed

In [6]:
# this url is not the url of a song
df.drop(df.loc[df['lyrics_url'] == 'https://genius.com/Scopey-almost-every-album-ive-listened-to-lyrics'].index, inplace = True)
df.drop(df.loc[df['lyrics_url'] == 'https://genius.com/Hossein-amini-drive-annotated'].index, inplace = True)
df.drop(df.loc[df['lyrics_url'] == 'https://genius.com/Genius-valentines-day-playlists-lyrics'].index, inplace = True)


Label songs based on their lyric sentiments.

Used textblob library

In [7]:
# Function to apply sentiment analysis using TextBlob
def get_sentiment(lyrics):
    try:
        blob = TextBlob(lyrics)
        # TextBlob returns polarity and subjectivity, you can use just polarity for a simple positive/negative/neutral sentiment
        polarity = blob.sentiment.polarity
        if polarity > 0:
            return "Positive"
        elif polarity < 0:
            return "Negative"
        else:
            return "Neutral"
    except Exception as e:
        print(f"Error processing lyrics: {e}")
        return None
    

def clean_lyrics(lyric):
    cleaned_lyrics = re.sub()

In [8]:
%%time
df['sentiment'] = df['lyrics'].apply(lambda lyrics: get_sentiment(lyrics))


CPU times: total: 14.8 s
Wall time: 26.6 s


In [9]:
df.sample(10)

Unnamed: 0,song_id,song_name,artist_id,artist_name,popularity,playlist,lyrics_url,lyrics,language,sentiment
693,1cQld05IcUDw3RCFt7uymW,Be The One,5lVNSw2GPci8kebrAQpZqU,Eli Brown,75,"['2zMZv11a981pTJORORVKbz', '0LJk5MTU4Jn2AlgLTd...",https://genius.com/Eli-brown-be-the-one-lyrics,"I know that you're lonely, want someone to hol...",en,Negative
9099,3EraWxocoFkg6PTjxII85U,Too Strange for the Circus,21YCHE0ZFflbHVTsyrCpgh,Debbii Dawson,47,['37i9dQZF1DX0MLFaUdXnjA'],https://genius.com/Debbii-dawson-too-strange-f...,"Head spins, tooths fall Heartache in the bathr...",en,Negative
1577,2I28t30wCQQDmR9do0knxp,Closer to You,2MjEBovZ1pZmzlQJCQjgS9,Christian McKinney,28,['2fyE3mKj0FoQ1klcyRb9yH'],https://genius.com/Christian-mckinney-closer-t...,Creator of the universe Lord you take my breat...,en,Positive
2326,0Ut7NNgAIlY9RGeXIvNfXF,Enemy Brain,2BQWHuvxG4kMYnfghdaCIy,Fox Stevenson,43,['37i9dQZF1DWTfrr8pte1rT'],https://genius.com/Fox-stevenson-enemy-brain-l...,"Just another day in this hell world, hey I don...",en,Positive
5161,3wjD3Ufjll7UKrEDPdGuX3,Maybe It Was Memphis,1SX44N5qjWuFcCK8WTO0c6,Pam Tillis,61,['3NKthdgxid2S9fHhN54Vp3'],https://genius.com/Pam-tillis-maybe-it-was-mem...,Looking at you through a misty moonlight Katy ...,en,Positive
405,7MRyJPksH3G2cXHN8UKYzP,American Girl,4tX2TplrkIP4v05BNC903e,Tom Petty and the Heartbreakers,74,"['2lk6445GUpENJLdDyJFMHJ', '37i9dQZF1EIcVkEtbz...",https://genius.com/Tom-petty-and-the-heartbrea...,"Well, she was an American girl Raised on promi...",en,Positive
3772,1UaHt8XVYOmbvc8MEiVdfI,I DONT WANNA BE MYSELF,6YqsG5YZEACC698v66oGMm,cloverscars,44,['4PWQV9dQpT7As9OTZBqrR8'],https://genius.com/Cloverscars-i-dont-wanna-be...,I don't wanna be myself I ain't too happy with...,en,Positive
8858,3H3r2nKWa3Yk5gt8xgmsEt,This City,6L1XC7NrmgWRlwAeLJvVtA,Sam Fischer,78,"['0hEjaQYvUZKzV3JGzxgcwx', '6UQ4WyqgIFEJe95T02...",https://genius.com/Sam-fischer-this-city-lyrics,I've been seeing lonely people in crowded room...,en,Positive
4132,3ihIZrJreMJPjQdNLrEXnP,In A Minute,5f7VJjfbwm532GiveGC0ZK,Lil Baby,68,['2ktsU5qFJ4ugmC8SvwXdN2'],https://genius.com/Lil-baby-in-a-minute-lyrics,"( Damn, Kai, you goin' crazy ) I be in the lo...",en,Positive
3165,2xSC2Tvr2I44VZmCXg60dW,Had Enough,1kF0gYnHLUJvFuPdoowO02,Lower Than Atlantis,40,['53mEsGlH3Ou6RFpa77T8yo'],https://genius.com/Lower-than-atlantis-had-eno...,I hate everyone that I meet But I'm getting be...,en,Negative


#### Lyric descriptives

In [10]:
rows = len(df)
average_length = df['lyrics'].str.len().mean()
average_words = df['lyrics'].apply(lambda x: len(x.split())).mean()
print("Lyric descriptives")
print("--------------------------")
print(f"Number of songs: {rows}")
print(f"Average number of characters in a song's lyrics: {average_length}")
print(f"Average number of words in a song's lyrics: {average_words}")

Lyric descriptives
--------------------------
Number of songs: 10514
Average number of characters in a song's lyrics: 1895.2707818147233
Average number of words in a song's lyrics: 376.53671295415637


In [11]:
print("Dataset Distribution")
print("---------------------")
print(f"Precentage of negative labels: {len(df[df['sentiment']=='Negative'])/len(df) * 100}%")
print(f"Precentage of positive labels: {len(df[df['sentiment']=='Positive'])/len(df) * 100}%")
print(f"Precentage of neutral labels: {len(df[df['sentiment']=='Neutral'])/len(df) * 100}%")

Dataset Distribution
---------------------
Precentage of negative labels: 31.12992200875024%
Precentage of positive labels: 67.26269735590641%
Precentage of neutral labels: 1.6073806353433517%


### Train classifiers

#### Vectorization
TFIDF

In [12]:
# encode labels to integers
label_encoder = LabelEncoder()
df['encoded_sentiment'] = label_encoder.fit_transform(df['sentiment'])

# Preprocessing with TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features
X_tfidf = tfidf_vectorizer.fit_transform(df['lyrics'])
y = df['encoded_sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

#### Balancing our dataset

In [15]:
# Initialize SMOTE
sampling_strategy = {0: int(y_train.sum() * 0.5), 'neutral': int(y_train.sum() * 0.5)}  # Adjust these ratios as needed
smote = SMOTE(sampling_strategy=sampling_strategy)

# Apply SMOTE to the training data
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

ValueError: The {'neutral', 'negative'} target class is/are not present in the data.

**Algorithm selectio**

We keep in mind the computational resources needed for the algorithms. Maybe we don't end up with the stringest model, but we aim for the most feasibly reliable one.

| Algorithm | Pros | Cons |
|-----------|------|------|
| **SVM (Support Vector Machine)** | - Effective in high-dimensional spaces<br> - Works well with a clear margin of separation<br> - Less prone to overfitting | - Not suitable for very large datasets<br> - Requires feature scaling<br> - Can be less effective with overlapping classes |
| **Naive Bayes** | - Fast and efficient<br> - Works well with high-dimensional data<br> - Effective for text classification | - Based on the assumption of feature independence<br> - Can be outperformed by more complex models |
| **Logistic Regression** | - Simple and easy to implement<br> - Efficient for binary classification tasks<br> - Provides probabilities for outcomes | - Can struggle with complex relationships in data<br> - Not the best choice for non-linear problems |
| **Random Forest** | - Handles non-linear data well<br> - Less prone to overfitting<br> - Good for classification and regression | - Can be slow on large datasets<br> - Model interpretability can be challenging |
| **LSTM (Long Short-Term Memory)** | - Excellent for sequence data like text<br> - Can capture long-term dependencies<br> - Good for complex language modeling | - Computationally intensive<br> - Requires large training datasets<br> - Longer training times |
| **BERT (Bidirectional Encoder Representations from Transformers)** | - State-of-the-art for NLP tasks<br> - Understands word context and nuances<br> - Highly accurate for various language tasks | - Requires significant computational resources<br> - Complex and requires fine-tuning<br> - Overkill for simpler tasks |


**SVM, Naive Bayes, Logistic Regression, and Random Forest**: These are traditional machine learning models and are generally less complex and computationally intensive compared to LSTM and BERT. They can be effective for smaller datasets or less complex sentiment analysis tasks but might not capture the intricacies of language as effectively as LSTM or BERT.

**LSTM and BERT**: These are advanced deep learning models that excel in understanding language context and complexities. They are more suitable for large datasets and complex NLP tasks, but their need for significant computational resources and longer training times can be a drawback, especially in resource-constrained environments.

#### SVM

In [14]:
X_train_smote

<16917x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 2126859 stored elements in Compressed Sparse Row format>

Grid search

In [None]:
%%time
# Define the model
svm_model = SVC()

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient
}

# Grid search with cross-validation
grid_search = GridSearchCV(svm_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


Best parameter selection

In [40]:
# Best parameters and best score
svm_best_parameters = grid_search.best_params_
svm_best_score = grid_search.best_score_

print("Best Parameters:", svm_best_parameters)
print("Best Score:", svm_best_score)

Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.8211861215498342


Model Evaluation

In [41]:
best_svm = grid_search.best_estimator_
test_accuracy = best_svm.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8440323347598668


#### Naive Bayes

Grid search

In [36]:
%%time
# Define the model
nb_model = MultinomialNB()

# Define the parameter grid
# Naive Bayes usually has fewer hyperparameters to tune, but you can experiment with alpha
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]  # Additive (Laplace/Lidstone) smoothing parameter
}

# Grid search with cross-validation
grid_search = GridSearchCV(nb_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
CPU times: total: 78.1 ms
Wall time: 309 ms


Best parameter selection

In [37]:
nb_best_parameters = grid_search.best_params_
nb_best_score = grid_search.best_score_

print("Best Parameters:", nb_best_parameters)
print("Best Score:", nb_best_score)

Best Parameters: {'alpha': 0.01}
Best Score: 0.70265147099448


Model Evaluation

In [38]:
best_nb = grid_search.best_estimator_
test_accuracy = best_nb.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.7208749405611032


#### Logistic Regression

Grid search

In [33]:
log_reg_model = LogisticRegression()


param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']  # Algorithm to use in optimization
}

# Grid search with cross-validation
grid_search = GridSearchCV(log_reg_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 25 candidates, totalling 125 fits


Best parameter selection

In [34]:
lr_best_parameters = grid_search.best_params_
lr_best_score = grid_search.best_score_

print("Best Parameters:", lr_best_parameters)
print("Best Score:", lr_best_score)

Best Parameters: {'C': 10, 'solver': 'saga'}
Best Score: 0.8310532759927739


Model Evaluation

In [35]:
best_lr = grid_search.best_estimator_
test_accuracy = best_lr.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8549690917736567


#### Random Forest

Grid search

In [27]:
# Define the model
rf_model = RandomForestClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
}

# Grid search with cross-validation
grid_search = GridSearchCV(rf_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Best parameter selection

In [31]:
# Best parameters and best score
forest_best_parameters = grid_search.best_params_
forest_best_score = grid_search.best_score_

print("Best Parameters:", forest_best_parameters)
print("Best Score:", forest_best_score)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Score: 0.7499685248653563


Model Evaluation

In [32]:
# Evaluate on test data (optional)
best_rf = grid_search.best_estimator_
test_accuracy = best_rf.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.7689015691868759


### Model comparison

In [42]:
# dict of all the models
best_estimators = {
    'SVM': best_svm,
    'Naive Bayes': best_nb,
    'Logistic Regression': best_lr,
    'Random Forest': best_rf
}

Dummy classifier as baseline model

In [44]:
# Assuming X_train, X_test, y_train, y_test are already defined
# Implement the Dummy Classifier
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
dummy_predictions = dummy_clf.predict(X_test)

# Add the Dummy Classifier to your best estimators dictionary
best_estimators['Dummy Classifier'] = dummy_clf

#### Metrics table

In [45]:
# Generate predictions for all models
predictions = {model: estimator.predict(X_test) for model, estimator in best_estimators.items()}

# Initialize a dictionary to hold the metrics
metrics_summary = {}

# Calculate metrics for all models
for model, model_predictions in predictions.items():
    accuracy = accuracy_score(y_test, model_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, model_predictions, average='weighted')
    metrics_summary[model] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Display the metrics for each model including the Dummy Classifier
for model, metrics in metrics_summary.items():
    print(f"{model} - Accuracy: {metrics['Accuracy']}, Precision: {metrics['Precision']}, Recall: {metrics['Recall']}, F1 Score: {metrics['F1 Score']}")

SVM - Accuracy: 0.8440323347598668, Precision: 0.8385714980166168, Recall: 0.8440323347598668, F1 Score: 0.835704406363735
Naive Bayes - Accuracy: 0.7208749405611032, Precision: 0.7149331227896696, Recall: 0.7208749405611032, F1 Score: 0.6783429621715981
Logistic Regression - Accuracy: 0.8549690917736567, Precision: 0.8496069060849031, Recall: 0.8549690917736567, F1 Score: 0.8503726335406443
Random Forest - Accuracy: 0.7689015691868759, Precision: 0.806421920536521, Recall: 0.7689015691868759, F1 Score: 0.7248527080214562
Dummy Classifier - Accuracy: 0.6814075130765573, Precision: 0.4643161988771786, Recall: 0.6814075130765573, F1 Score: 0.5522946641621643


  _warn_prf(average, modifier, msg_start, len(result))


Metrict table to be copy pasted into overleaf to save time <3 

In [46]:
# Convert metrics_summary to a DataFrame
metrics_df = pd.DataFrame(metrics_summary).transpose()

# Convert the DataFrame to a LaTeX table
latex_table = metrics_df.to_latex(float_format="%.2f", header=True, index=True)

# Printing the LaTeX table
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
 & Accuracy & Precision & Recall & F1 Score \\
\midrule
SVM & 0.84 & 0.84 & 0.84 & 0.84 \\
Naive Bayes & 0.72 & 0.71 & 0.72 & 0.68 \\
Logistic Regression & 0.85 & 0.85 & 0.85 & 0.85 \\
Random Forest & 0.77 & 0.81 & 0.77 & 0.72 \\
Dummy Classifier & 0.68 & 0.46 & 0.68 & 0.55 \\
\bottomrule
\end{tabular}



**Explanation**

These results provide a comparison of the performance of various machine learning models (SVM, Naive Bayes, Logistic Regression, Random Forest) and a baseline Dummy Classifier on your dataset, based on different metrics. Let's break down what each metric means and what the results indicate:

**Accuracy**

*Definition*: The proportion of correct predictions among the total number of cases evaluated.
*Results*: Logistic Regression performed the best with an accuracy of about 85.50%, followed closely by SVM. The Dummy Classifier, as expected, has the lowest accuracy.

**Precision**

*Definition*: The ratio of correctly predicted positive observations to the total predicted positives. High precision relates to a low false positive rate.
*Results*: Logistic Regression again leads in precision, suggesting it's better at minimizing false positives. Naive Bayes has the lowest precision among the advanced models, indicating more false positives.

**Recall (Sensitivity)**

*Definition*: The ratio of correctly predicted positive observations to all observations in the actual class. It shows how many of the actual positives were captured by the model.
*Results*: Similar to accuracy, Logistic Regression and SVM have high recall, meaning they are good at capturing actual positives. The Dummy Classifier has a high recall too, but this is misleading since it always predicts the most frequent class, ignoring the actual class distribution.

**F1 Score**

*Definition*: The weighted average of Precision and Recall. It takes both false positives and false negatives into account. An F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
*Results*: Logistic Regression has the highest F1 score, indicating a good balance between precision and recall. Naive Bayes has a significantly lower F1 score, suggesting it doesn't balance precision and recall as well as the other models.

**Overall Analysis**

Logistic Regression appears to be the most effective model for your dataset, performing well across all metrics. It seems to offer a good balance between identifying relevant instances and minimizing incorrect classifications.
SVM also performs well, especially in terms of recall and accuracy, making it a strong contender.
Naive Bayes, while faster and simpler, doesn't perform as well in this context, particularly in terms of precision and F1 score.
Random Forest shows moderate performance, but it's outperformed by both Logistic Regression and SVM.
Dummy Classifier serves as a baseline, and as expected, it has the lowest performance. However, its results are important to understand the minimum threshold any sophisticated model should surpass.

#### Confusion Matrix

In [55]:
from sklearn.metrics import confusion_matrix


In [58]:
# Mapping for your labels
label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}

for model_name, y_pred in predictions.items():
    # Generate the confusion matrix for the current model
    cm = confusion_matrix(y_test, y_pred)

    # Calculate the mislabeling percentage for each label
    mislabeling_percentages = np.sum(cm, axis=1) - np.diag(cm)
    mislabeling_percentages = mislabeling_percentages / np.sum(cm, axis=1) * 100

    print(f"\nMislabeling percentages for {model_name}:")
    for label_idx, mislabel_pct in enumerate(mislabeling_percentages):
        label_name = label_mapping.get(label_idx, f"Label {label_idx}")
        print(f"Label {label_name}: {mislabel_pct:.2f}% mislabeled")


Mislabeling percentages for SVM:
Label negative: 33.02% mislabeled
Label neutral: 83.78% mislabeled
Label positive: 6.14% mislabeled

Mislabeling percentages for Naive Bayes:
Label negative: 74.72% mislabeled
Label neutral: 64.86% mislabeled
Label positive: 6.28% mislabeled

Mislabeling percentages for Logistic Regression:
Label negative: 25.91% mislabeled
Label neutral: 78.38% mislabeled
Label positive: 7.82% mislabeled

Mislabeling percentages for Random Forest:
Label negative: 71.09% mislabeled
Label neutral: 56.76% mislabeled
Label positive: 1.05% mislabeled

Mislabeling percentages for Dummy Classifier:
Label negative: 100.00% mislabeled
Label neutral: 100.00% mislabeled
Label positive: 0.00% mislabeled


In [69]:
print("Dataset Distribution")
print("---------------------")
print(f"Precentage of negative labels: {len(df[df['sentiment']=='Negative'])/len(df) * 100}%")
print(f"Precentage of positive labels: {len(df[df['sentiment']=='Positive'])/len(df) * 100}%")
print(f"Precentage of neutral labels: {len(df[df['sentiment']=='Neutral'])/len(df) * 100}%")

Dataset Distribution
---------------------
Precentage of negative labels: 31.12992200875024%
Precentage of positive labels: 67.26269735590641%
Precentage of neutral labels: 1.6073806353433517%


The high mislabeling percentages for certain labels alongside overall high accuracy for models like Logistic Regression (LR) and Support Vector Machine (SVM) can be explained by a few factors, particularly the distribution of classes (labels) in your dataset. 

Let's focus on our two best performars, svm and lr. The mislabelling percetages concern mostly the Neutral class, which respresents less than 2% of the whole dataset.  In an imbalanced dataset, the minority classes (like 'neutral' -1.6%- or 'negative' -31%) have fewer instances, so even a high percentage of mislabeling in these classes might not significantly impact the overall accuracy if the model performs exceedingly well on the majority class.

How could we fix that? Idealy we would have used oversamling, undersampling or generation (smote) techniques but I realized that very late, which hightlights the importance of exploring your dataset before actually going forward with your models 🙃.

#### Statistical testing

We are conducting further statistical testing between our two top persorming models.

First we are conducting a Shapiro-Wilk test with H0: "Differences between the two models' scores seem to be normally distributed (fail to reject H0)". If we failt to reject H0 then we can proceed with the paired t-test for these models, as the normality assumption required for the paired t-test is satisfied.

In [50]:
# Obtain cross-validation scores for each model
scores_lr = cross_val_score(best_lr, X_tfidf, y, cv=5)
scores_svm = cross_val_score(best_svm, X_tfidf, y, cv=5)

# Calculate differences between sets of scores
score_diffs = scores_lr - scores_svm

# Shapiro-Wilk Test for Normality
stat, p = shapiro(score_diffs)
alpha = 0.05
if p > alpha:
    print('Differences seem to be normally distributed (fail to reject H0)')
else:
    print('Differences do not appear to be normally distributed (reject H0)')

Differences seem to be normally distributed (fail to reject H0)


Interpreting the Shapiro-Wilk test result "Differences seem to be normally distributed (fail to reject H0)" in the context of comparing your best_lr (Logistic Regression) and best_svm (Support Vector Machine) models indicates that the differences in their cross-validation scores do not significantly deviate from a normal distribution. This means you can proceed with the paired t-test for these models, as the normality assumption required for the paired t-test is satisfied.

Now let's perform a paired t-test to determine if the differences in performance between the models are statistically significant

In [53]:
t_stat, p_value = ttest_rel(scores_lr, scores_svm)

print(f"Paired t-test between Logistic Regression and SVM:\nT-statistic: {t_stat}, P-value: {p_value}")

Paired t-test between Logistic Regression and SVM:
T-statistic: 6.581093552269545, P-value: 0.002759946835786729


**Statistical Significance**: Given that the p-value is less than 0.05, you can reject the null hypothesis. This means there is a statistically significant difference in the performance of the Logistic Regression and SVM models on your dataset.

**Model Performance**: The positive t-statistic value indicates that the mean cross-validation score of the Logistic Regression model is higher than that of the SVM model. This suggests that Logistic Regression performs better than SVM on your dataset, with this difference being statistically significant.

------------------------
Corresponding latex table:

\begin{table}[h]
\centering
\begin{tabular}{|l|l|l|}
\hline
\textbf{Model Comparison} & \textbf{T-Statistic} & \textbf{P-Value} \\ \hline
Logistic Regression vs SVM & 6.581 & 0.00276 \\ \hline
% Add more rows for other model comparisons if you have them
% Example: Model A vs Model B & T-Statistic Value & P-Value \\
\end{tabular}
\caption{Paired T-Test Results Between Models}
\label{tab:my_label}
\end{table}

In summary, this result indicates that Logistic Regression is not only performing better than SVM on average, but that this better performance is statistically significant and not likely due to random chance or variability in the dataset.