In [3]:
# pd.set_option('display.max_colwidth', None)

In [47]:
import pandas as pd
import re
from textblob import TextBlob
import numpy as np
from scipy.stats import shapiro


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.stats import ttest_rel
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from imblearn.under_sampling import RandomUnderSampler


### Lyric preprocessing & Sentiment annotation

Read data from local csv

In [5]:
df = pd.read_csv("songs_with_lyrics_clean.csv")

Clean lyrics from the following:
* \n
* [something]

Also explored invalid urls

In [6]:
def clean_lyrics(lyrics):
    # Remove text within brackets
    cleaned_lyrics = re.sub(r"\[.*?\]|\(.*?\)", "", lyrics)

    # Capitalize the first letter of each line
    cleaned_lyrics = cleaned_lyrics.replace('\n', " ")
    cleaned_lyrics = cleaned_lyrics.strip(" ")

    return cleaned_lyrics

In [7]:
# Apply the cleaning function to your DataFrame
df['lyrics'] = df['lyrics'].apply(clean_lyrics)

After data exploration, we realize that some extra and more custom cleaning is needed

In [8]:
# this url is not the url of a song
df.drop(df.loc[df['lyrics_url'] == 'https://genius.com/Scopey-almost-every-album-ive-listened-to-lyrics'].index, inplace = True)
df.drop(df.loc[df['lyrics_url'] == 'https://genius.com/Hossein-amini-drive-annotated'].index, inplace = True)
df.drop(df.loc[df['lyrics_url'] == 'https://genius.com/Genius-valentines-day-playlists-lyrics'].index, inplace = True)


Label songs based on their lyric sentiments.

Used textblob library

In [9]:
# Function to apply sentiment analysis using TextBlob
def get_sentiment(lyrics):
    try:
        blob = TextBlob(lyrics)
        # TextBlob returns polarity and subjectivity, you can use just polarity for a simple positive/negative/neutral sentiment
        polarity = blob.sentiment.polarity
        if polarity > 0:
            return "Positive"
        elif polarity < 0:
            return "Negative"
        else:
            return "Neutral"
    except Exception as e:
        print(f"Error processing lyrics: {e}")
        return None
    

def clean_lyrics(lyric):
    cleaned_lyrics = re.sub()

In [10]:
%%time
df['sentiment'] = df['lyrics'].apply(lambda lyrics: get_sentiment(lyrics))


CPU times: total: 19.7 s
Wall time: 23.2 s


In [11]:
df.sample(10)

Unnamed: 0,song_id,song_name,artist_id,artist_name,popularity,playlist,lyrics_url,lyrics,language,sentiment
10034,1CYBnHYYbOAsuDCGa0zrw0,Without a Light,4RwbDag6jWIYJnEGH6Wte9,Drew Holcomb & The Neighbors,64,['37i9dQZF1DX9crXQ0wuuXE'],https://genius.com/Drew-holcomb-and-the-neighb...,Stay a while Maybe I will find the words to sa...,en,Positive
3286,2Dct3GykKZ58hpWRFfe2Qd,Heading South,40ZNYROS4zLfyyBSs2PGe2,Zach Bryan,85,"['37i9dQZF1DX7aUUBCKwo4Y', '37i9dQZF1DX13ZzXoo...",https://genius.com/Zach-bryan-heading-south-ly...,Was a boy who was a dreamer and he flew so hig...,en,Positive
5079,7xNQMVeEgXQf0xUsTlcpMP,Made to Love You,3wrdNgjTSLLQZ382sPyoA5,Dan Owen,53,['37i9dQZF1DWXxauMBOQPxX'],https://genius.com/Dan-owen-made-to-love-you-l...,I never thought it could be And there was noth...,en,Positive
2337,6gHzcDHbcoGgvoGAtcTq94,Eric After Hours Blues,6PAt558ZEZl0DmdXlnjMgD,Eric Clapton,34,"['37i9dQZF1DX9tJFUKjeDqu', '3a54WQYSUPwjgGmfd4...",https://genius.com/Scipio-kurupt-welcome-2-la-...,Welcome to LA Welcome to LA Welcome to LA We...,en,Positive
7523,1Az0fhiWi0EVS4cZ3FF20X,Sometimes It Snows in April,5a2EaR3hamoenG9rDuVn8j,Prince,49,['5qwDfhyhvk4fr5XZEDRpWc'],https://genius.com/Prince-and-the-revolution-s...,Tracy died soon after a long fought civil war...,en,Positive
7667,3kSXn1osC89W8JcPLozTzs,Stand By You,3QLIkT4rD2FMusaqmkepbq,Rachel Platten,66,"['37i9dQZF1DX0KGZxcPEEqa', '4zihGV0dKifyihlMLs...",https://genius.com/Rachel-platten-stand-by-you...,"Even if we can't find Heaven, Heaven, Heaven H...",en,Positive
3111,5hRyy1AIJMc5wyizJcSIEa,Got Your Name On It,2nTzAHwCk0swkDdIPj2FIP,Jade Eagleson,54,['4QdmaN439k41ihsIw7B9r3'],https://genius.com/Jade-eagleson-got-your-name...,There's a spot in my driveway Where you can pa...,en,Positive
6284,7mitXLIMCflkhZiD34uEQI,Party Rock Anthem,3sgFRtyBnxXD5ESfmbK4dl,LMFAO,76,"['6vJy4OUcFt0akV2QE9kAx1', '7og4tbbFFTkB7FW205...",https://genius.com/Lmfao-party-rock-anthem-lyrics,Party Rock Yeah Woo! Let's go! Party rock is ...,en,Positive
2527,2hPyG7NMpHAmOrmlSziAx2,Farther Up The Road,48nwxUvPJZkm8uPa7xMzmj,"Bobby ""Blue"" Bland",39,['37i9dQZF1DXcu3QLJudo4X'],https://genius.com/Bobby-blue-bland-farther-up...,"Further on up the road, someone gonna hurt you...",en,Positive
6996,5TvE3pk05pyFIGdSY9j4DJ,Say Something,5xKp3UyavIBUsGy3DQdXeF,A Great Big World,72,"['6ZAg4XZOoHgt5AWwuat8Kk', '3B1yxHl5pbyORZ9bjW...",https://genius.com/A-great-big-world-and-chris...,"Say something, I'm giving up on you I'll be th...",en,Negative


#### Lyric descriptives

In [12]:
rows = len(df)
average_length = df['lyrics'].str.len().mean()
average_words = df['lyrics'].apply(lambda x: len(x.split())).mean()
print("Lyric descriptives")
print("--------------------------")
print(f"Number of songs: {rows}")
print(f"Average number of characters in a song's lyrics: {average_length}")
print(f"Average number of words in a song's lyrics: {average_words}")

Lyric descriptives
--------------------------
Number of songs: 10514
Average number of characters in a song's lyrics: 1895.2707818147233
Average number of words in a song's lyrics: 376.53671295415637


In [13]:
print("Dataset Distribution")
print("---------------------")
print(f"Precentage of negative labels: {len(df[df['sentiment']=='Negative'])/len(df) * 100}%")
print(f"Precentage of positive labels: {len(df[df['sentiment']=='Positive'])/len(df) * 100}%")
print(f"Precentage of neutral labels: {len(df[df['sentiment']=='Neutral'])/len(df) * 100}%")

Dataset Distribution
---------------------
Precentage of negative labels: 31.12992200875024%
Precentage of positive labels: 67.26269735590641%
Precentage of neutral labels: 1.6073806353433517%


### Train classifiers

#### Balancing our dataset & vectorization

Since the neutral reviews are so few there is no point in keeping them in the dataset

In [14]:
df_neg_pos = df[df['sentiment'] !='Neutral']

In [16]:
print("New Dataset Distribution")
print("---------------------")
print(f"Precentage of negative labels: {len(df_neg_pos[df_neg_pos['sentiment']=='Negative'])/len(df_neg_pos) * 100}%")
print(f"Precentage of positive labels: {len(df_neg_pos[df_neg_pos['sentiment']=='Positive'])/len(df_neg_pos) * 100}%")

New Dataset Distribution
---------------------
Precentage of negative labels: 31.6384726921218%
Precentage of positive labels: 68.3615273078782%


In [18]:
# Encode labels to integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_neg_pos['sentiment'])

# Preprocessing with TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust the number of features as needed
X_tfidf = tfidf_vectorizer.fit_transform(df_neg_pos['lyrics'])
y = y_encoded

# Split your data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize RandomUnderSampler
under_sampler = RandomUnderSampler(random_state=42)

# Apply undersampling to the training data
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

**Algorithm selection**

We keep in mind the computational resources needed for the algorithms. Maybe we don't end up with the stringest model, but we aim for the most feasibly reliable one.

| Algorithm | Pros | Cons |
|-----------|------|------|
| **SVM (Support Vector Machine)** | - Effective in high-dimensional spaces<br> - Works well with a clear margin of separation<br> - Less prone to overfitting | - Not suitable for very large datasets<br> - Requires feature scaling<br> - Can be less effective with overlapping classes |
| **Naive Bayes** | - Fast and efficient<br> - Works well with high-dimensional data<br> - Effective for text classification | - Based on the assumption of feature independence<br> - Can be outperformed by more complex models |
| **Logistic Regression** | - Simple and easy to implement<br> - Efficient for binary classification tasks<br> - Provides probabilities for outcomes | - Can struggle with complex relationships in data<br> - Not the best choice for non-linear problems |
| **Random Forest** | - Handles non-linear data well<br> - Less prone to overfitting<br> - Good for classification and regression | - Can be slow on large datasets<br> - Model interpretability can be challenging |
| **LSTM (Long Short-Term Memory)** | - Excellent for sequence data like text<br> - Can capture long-term dependencies<br> - Good for complex language modeling | - Computationally intensive<br> - Requires large training datasets<br> - Longer training times |
| **BERT (Bidirectional Encoder Representations from Transformers)** | - State-of-the-art for NLP tasks<br> - Understands word context and nuances<br> - Highly accurate for various language tasks | - Requires significant computational resources<br> - Complex and requires fine-tuning<br> - Overkill for simpler tasks |


**SVM, Naive Bayes, Logistic Regression, and Random Forest**: These are traditional machine learning models and are generally less complex and computationally intensive compared to LSTM and BERT. They can be effective for smaller datasets or less complex sentiment analysis tasks but might not capture the intricacies of language as effectively as LSTM or BERT.

**LSTM and BERT**: These are advanced deep learning models that excel in understanding language context and complexities. They are more suitable for large datasets and complex NLP tasks, but their need for significant computational resources and longer training times can be a drawback, especially in resource-constrained environments.

#### SVM

Grid search

In [25]:
%%time
# Define the model
svm_model = SVC()

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient
}

# Grid search with cross-validation
grid_search = GridSearchCV(svm_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_under, y_train_under)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 28.4 s
Wall time: 6min 19s


Best parameter selection

In [26]:
# Best parameters and best score
svm_best_parameters = grid_search.best_params_
svm_best_score = grid_search.best_score_

print("Best Parameters:", svm_best_parameters)
print("Best Score:", svm_best_score)

Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.8016125578734175


Model Evaluation

In [27]:
best_svm = grid_search.best_estimator_
test_accuracy = best_svm.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8144030932817786


#### Naive Bayes

Grid search

In [28]:
%%time
# Define the model
nb_model = MultinomialNB()

# Define the parameter grid
# Naive Bayes usually has fewer hyperparameters to tune, but you can experiment with alpha
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]  # Additive (Laplace/Lidstone) smoothing parameter
}

# Grid search with cross-validation
grid_search = GridSearchCV(nb_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_under, y_train_under)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
CPU times: total: 15.6 ms
Wall time: 240 ms


Best parameter selection

In [29]:
nb_best_parameters = grid_search.best_params_
nb_best_score = grid_search.best_score_

print("Best Parameters:", nb_best_parameters)
print("Best Score:", nb_best_score)

Best Parameters: {'alpha': 1}
Best Score: 0.7013873824542189


Model Evaluation

In [30]:
best_nb = grid_search.best_estimator_
test_accuracy = best_nb.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.7414209763170614


#### Logistic Regression

Grid search

In [31]:
%%time
log_reg_model = LogisticRegression()


param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']  # Algorithm to use in optimization
}

# Grid search with cross-validation
grid_search = GridSearchCV(log_reg_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_under, y_train_under)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
CPU times: total: 688 ms
Wall time: 7.39 s


Best parameter selection

In [32]:
lr_best_parameters = grid_search.best_params_
lr_best_score = grid_search.best_score_

print("Best Parameters:", lr_best_parameters)
print("Best Score:", lr_best_score)

Best Parameters: {'C': 10, 'solver': 'saga'}
Best Score: 0.8122904008227765


Model Evaluation

In [33]:
best_lr = grid_search.best_estimator_
test_accuracy = best_lr.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8231029482841953


#### Random Forest

Grid search

In [34]:
# # Define the model
# rf_model = RandomForestClassifier()

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],  # Number of trees in the forest
#     'max_depth': [10, 20, 30, None],  # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
# }

# # Grid search with cross-validation
# grid_search = GridSearchCV(rf_model, param_grid, cv=5, verbose=2, n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

Best parameter selection

In [35]:
# # Best parameters and best score
# forest_best_parameters = grid_search.best_params_
# forest_best_score = grid_search.best_score_

# print("Best Parameters:", forest_best_parameters)
# print("Best Score:", forest_best_score)

Model Evaluation

In [36]:
# # Evaluate on test data (optional)
# best_rf = grid_search.best_estimator_
# test_accuracy = best_rf.score(X_test, y_test)
# print("Test Accuracy:", test_accuracy)

### Model comparison

In [37]:
# dict of all the models
best_estimators = {
    'SVM': best_svm,
    'Naive Bayes': best_nb,
    'Logistic Regression': best_lr,
#     'Random Forest': best_rf
}

Dummy classifier as baseline model

In [38]:
# Assuming X_train, X_test, y_train, y_test are already defined
# Implement the Dummy Classifier
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
dummy_predictions = dummy_clf.predict(X_test)

# Add the Dummy Classifier to your best estimators dictionary
best_estimators['Dummy Classifier'] = dummy_clf

#### Metrics table

In [39]:
# Generate predictions for all models
predictions = {model: estimator.predict(X_test) for model, estimator in best_estimators.items()}

# Initialize a dictionary to hold the metrics
metrics_summary = {}

# Calculate metrics for all models
for model, model_predictions in predictions.items():
    accuracy = accuracy_score(y_test, model_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, model_predictions, average='weighted')
    metrics_summary[model] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Display the metrics for each model including the Dummy Classifier
for model, metrics in metrics_summary.items():
    print(f"{model} - Accuracy: {metrics['Accuracy']}, Precision: {metrics['Precision']}, Recall: {metrics['Recall']}, F1 Score: {metrics['F1 Score']}")

SVM - Accuracy: 0.8144030932817786, Precision: 0.8366713244592203, Recall: 0.8144030932817786, F1 Score: 0.8200886257226082
Naive Bayes - Accuracy: 0.7414209763170614, Precision: 0.7493681326869763, Recall: 0.7414209763170614, F1 Score: 0.7447846190703241
Logistic Regression - Accuracy: 0.8231029482841953, Precision: 0.8419559537515556, Recall: 0.8231029482841953, F1 Score: 0.8281002651608682
Dummy Classifier - Accuracy: 0.7080715321411309, Precision: 0.5013652946286886, Recall: 0.7080715321411309, F1 Score: 0.5870542131221035


  _warn_prf(average, modifier, msg_start, len(result))


Metrict table to be copy pasted into overleaf to save time <3 

In [40]:
# Convert metrics_summary to a DataFrame
metrics_df = pd.DataFrame(metrics_summary).transpose()

# Convert the DataFrame to a LaTeX table
latex_table = metrics_df.to_latex(float_format="%.2f", header=True, index=True)

# Printing the LaTeX table
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
 & Accuracy & Precision & Recall & F1 Score \\
\midrule
SVM & 0.81 & 0.84 & 0.81 & 0.82 \\
Naive Bayes & 0.74 & 0.75 & 0.74 & 0.74 \\
Logistic Regression & 0.82 & 0.84 & 0.82 & 0.83 \\
Dummy Classifier & 0.71 & 0.50 & 0.71 & 0.59 \\
\bottomrule
\end{tabular}



**Explanation**

**Results from Undersampled Models**

*SVM (Support Vector Machine)*:

Accuracy: 81.44% - A good percentage of the predictions are correct.
Precision: 83.67% - When it predicts a label, it is correct about 83.67% of the time.
Recall: 81.44% - It correctly identifies 81.44% of all true positive and negative instances.
F1 Score: 82.01% - A balance between precision and recall, indicating good overall performance.

*Naive Bayes*:

Accuracy: 74.14% - Lower than SVM and Logistic Regression.
Precision: 74.94% - Slightly higher precision compared to its accuracy.
Recall: 74.14% - Similar to accuracy, indicating consistent performance across classes.
F1 Score: 74.48% - Reflects a balance between precision and recall.

*Logistic Regression*:

Accuracy: 82.31% - The highest among the models.
Precision: 84.20% - Indicates a strong ability to correctly label positive and negative instances.
Recall: 82.31% - Consistent with its accuracy.
F1 Score: 82.81% - Shows a good balance between precision and recall.

*Dummy Classifier*:

Accuracy: 70.81% - As a baseline, it's predictably lower but not by a wide margin.
Precision: 50.14% - Low precision indicates a high number of false positives.
Recall: 70.81% - Identical to accuracy, expected for a model predicting the most frequent class.
F1 Score: 58.71% - Lower, reflecting the imbalance between precision and recall.

**Interpretation and Comparison**

Improved Balanced Performance: The SVM and Logistic Regression models show an overall balanced performance in terms of accuracy, precision, recall, and F1 score. This suggests effective handling of both positive and negative classes after undersampling.

Logistic Regression's Superiority: Logistic Regression slightly outperforms SVM in all metrics, making it potentially the best model among those tested.

Naive Bayes' Lower Performance: Naive Bayes lags behind SVM and Logistic Regression, indicating it might be less suited for this particular task or data distribution.

Dummy Classifier as Baseline: The Dummy Classifier serves as a baseline, and its lower performance compared to other models 
validates their effectiveness.

Effect of Undersampling: The improved balance in precision and recall in the SVM and Logistic Regression models compared to the Dummy Classifier suggests that undersampling helped in addressing the class imbalance issue.
Comparison with Initial Attempt: If these results show an improvement over your initial attempt without sampling, it indicates that addressing the class imbalance was beneficial. Specifically, look for improvements in precision and recall for the minority class, which are often most affected by class imbalance.

In summary, undersampling appears to have helped in balancing the performance of your models across different metrics, with Logistic Regression showing the best overall performance. These models now likely offer a more reliable understanding of their ability to generalize to new data, particularly in a more balanced real-world scenario.

#### Confusion Matrix

In [48]:
# Mapping for your labels
label_mapping = {0: 'negative', 1: 'positive'}

for model_name, y_pred in predictions.items():
    # Generate the confusion matrix for the current model
    cm = confusion_matrix(y_test, y_pred)

    # Calculate the mislabeling percentage for each label
    mislabeling_percentages = np.sum(cm, axis=1) - np.diag(cm)
    mislabeling_percentages = mislabeling_percentages / np.sum(cm, axis=1) * 100

    print(f"\nMislabeling percentages for {model_name}:")
    for label_idx, mislabel_pct in enumerate(mislabeling_percentages):
        label_name = label_mapping.get(label_idx, f"Label {label_idx}")
        print(f"Label {label_name}: {mislabel_pct:.2f}% mislabeled")


Mislabeling percentages for SVM:
Label negative: 17.88% mislabeled
Label positive: 18.84% mislabeled

Mislabeling percentages for Naive Bayes:
Label negative: 39.40% mislabeled
Label positive: 20.27% mislabeled

Mislabeling percentages for Logistic Regression:
Label negative: 17.88% mislabeled
Label positive: 17.61% mislabeled

Mislabeling percentages for Dummy Classifier:
Label negative: 100.00% mislabeled
Label positive: 0.00% mislabeled


The adjustments made to balance the classes have led to a more equitable performance between the negative and positive labels. Previously, the models were more biased towards correctly identifying the positive (majority) class, while the negative (minority) class saw higher rates of mislabeling. The balanced approach has notably improved the ability of models, especially Naive Bayes and SVM, to correctly classify the negative instances, leading to a more reliable and fair performance across classes.

This analysis underscores the importance of addressing class imbalance in datasets, especially when dealing with models that are sensitive to such disparities. It highlights how balancing techniques can lead to a more accurate and unbiased assessment of a model's predictive capabilities.

**Comparison with Imbalanced Results**
1. Improved Balance: The balanced models show a more even performance between the negative and positive classes. The imbalanced dataset had a tendency for lower mislabeling in the positive class (the majority class) and higher mislabeling in the negative class.
2. Overall Improvement: In the balanced dataset, all models (except for the Dummy Classifier) show a reduced mislabeling rate for the negative class, indicating that balancing the dataset has improved their ability to correctly identify negative instances.
3. Consistent Performance in Positive Class: The performance in identifying positive labels remains relatively good and more balanced after undersampling, as seen in the reduced mislabeling rates for SVM and Logistic Regression.
4. Naive Bayes: This model shows the most significant improvement in the negative class, indicating that the balancing had a notable positive effect on its performance.

#### Statistical testing

We are conducting further statistical testing between our two top persorming models.

First we are conducting a Shapiro-Wilk test with H0: "Differences between the two models' scores seem to be normally distributed (fail to reject H0)". If we failt to reject H0 then we can proceed with the paired t-test for these models, as the normality assumption required for the paired t-test is satisfied.

In [50]:
# Obtain cross-validation scores for each model
scores_lr = cross_val_score(best_lr, X_tfidf, y, cv=5)
scores_svm = cross_val_score(best_svm, X_tfidf, y, cv=5)

# Calculate differences between sets of scores
score_diffs = scores_lr - scores_svm

# Shapiro-Wilk Test for Normality
stat, p = shapiro(score_diffs)
alpha = 0.05
if p > alpha:
    print('Differences seem to be normally distributed (fail to reject H0)')
else:
    print('Differences do not appear to be normally distributed (reject H0)')

Differences seem to be normally distributed (fail to reject H0)


Interpreting the Shapiro-Wilk test result "Differences seem to be normally distributed (fail to reject H0)" in the context of comparing your best_lr (Logistic Regression) and best_svm (Support Vector Machine) models indicates that the differences in their cross-validation scores do not significantly deviate from a normal distribution. This means you can proceed with the paired t-test for these models, as the normality assumption required for the paired t-test is satisfied.

Now let's perform a paired t-test to determine if the differences in performance between the models are statistically significant

In [51]:
t_stat, p_value = ttest_rel(scores_lr, scores_svm)

print(f"Paired t-test between Logistic Regression and SVM:\nT-statistic: {t_stat}, P-value: {p_value}")

Paired t-test between Logistic Regression and SVM:
T-statistic: 6.491434997263605, P-value: 0.002904180302547974


**Statistical Significance**: Since the p-value is less than the typical alpha level of 0.05, you can reject the null hypothesis. This means that the difference in performance between the Logistic Regression and SVM models is statistically significant.

**Model Comparison**: The positive t-statistic suggests that the mean cross-validation score of the Logistic Regression model is higher than that of the SVM model. This indicates that Logistic Regression performs better than SVM on your dataset, and the difference in their performances is not just due to random chance.


In summary, this result suggests that Logistic Regression not only performs better than SVM on average but that this better performance is statistically significant and unlikely to be due to random fluctuations in the dataset. This provides a solid basis for preferring Logistic Regression over SVM in this particular case, assuming the other factors like model interpretability, complexity, and computational requirements also align with your project's needs.

------------------------
Corresponding latex table:

\begin{table}[h]
\centering
\begin{tabular}{|l|l|l|}
\hline
\textbf{Model Comparison} & \textbf{T-Statistic} & \textbf{P-Value} \\ \hline
Logistic Regression vs SVM & 6.491 & 0.0029 \\ \hline
\end{tabular}
\caption{Paired T-Test Results Between Logistic Regression and SVM}
\label{tab:paired_t_test}
\end{table}


In summary, this result indicates that Logistic Regression is not only performing better than SVM on average, but that this better performance is statistically significant and not likely due to random chance or variability in the dataset.