In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# Load the three result tables: regression, classification and k-means clustering.
# Alternative location under a mounted Google Drive:
# results_path = '/content/drive/My Drive/Online MSDS/MOD C2/Political Polarization/results/'
results_path = 'results/'

# results_path already ends with a separator, so the file names are appended directly
regression_results = pd.read_csv(results_path + "regression_results.csv")
classification_results = pd.read_csv(results_path + "classification_results.csv")
kmeans_clustering_results = pd.read_csv(results_path + "kmeans_clustering_results.csv")

## Standardize the data

In [3]:
# Display the regression results table (one row per model/dataset pair with RMSE and R2)
regression_results

Unnamed: 0,Model,Dataset,RMSE,R2
0,Linear Regression,Convabuse,0.648422,0.79207
1,Linear Regression,Dynamically Generated Hate Speech,0.329406,0.565965
2,Linear Regression,Online Abusive Attacks,25.184477,0.247075
3,Linear Regression,US Elections 2020 Hate Speech,0.468635,0.121211
4,Linear Regression,MLMA Hate Speech,0.681719,0.06274
5,Lasso Regularization,Online Abusive Attacks,25.179175,0.247392
6,Ridge Regularization,Online Abusive Attacks,25.184473,0.247076
7,ElasticNet Regularization,Online Abusive Attacks,25.178478,0.247434
8,Linear Lasso Regression,Online Abusive Attacks,25.184477,0.247075
9,Linear Ridge Regression,Online Abusive Attacks,25.070423,0.25388


In [4]:
# Keep only the rows whose Dataset is NOT one of the two datasets below;
# note this rebinds regression_results to the filtered table.
regression_results = regression_results.loc[
    ~regression_results['Dataset'].isin([
        'Dynamically Generated Hate Speech',
        'US Elections 2020 Hate Speech',
    ])
]

In [5]:
# For every dataset, select the row with the highest R-squared.
# idxmax yields one row label per dataset (the first label when several rows tie).
top_r2_labels = regression_results.groupby('Dataset')['R2'].idxmax()
best_regression_models = regression_results.loc[top_r2_labels]
best_regression_models

Unnamed: 0,Model,Dataset,RMSE,R2
0,Linear Regression,Convabuse,0.648422,0.79207
4,Linear Regression,MLMA Hate Speech,0.681719,0.06274
19,K-Nearest Neighbors,Online Abusive Attacks,22.016796,0.424568


In [6]:
# Display the classification results table (Accuracy, F1-score and L1 Ratio per model/dataset row)
classification_results

Unnamed: 0,Model,Dataset,Accuracy,F1-score,L1 Ratio
0,Logistic Regression,Convabuse,0.544822,0.527693,0.0
1,Logistic Regression,Convabuse,0.548002,0.533691,0.0
2,Logistic Regression,Convabuse,0.548002,0.533691,0.0
3,Logistic Regression,Convabuse,0.548002,0.533691,0.0
4,Logistic Regression,Convabuse,0.548002,0.533691,0.0
...,...,...,...,...,...
91,K-Nearest Neighbors,MLMA Hate Speech,0.602837,0.508122,0.0
92,Gradient Boosting,Convabuse,0.580402,0.577396,0.0
93,Gradient Boosting,Dynamically Generated Hate Speech,0.852399,0.851308,0.0
94,Gradient Boosting,US Elections 2020 Hate Speech,0.671402,0.671325,0.0


In [7]:
# For every dataset, select the row with the highest accuracy.
# idxmax yields one row label per dataset (the first label when several rows tie).
top_accuracy_labels = classification_results.groupby('Dataset')['Accuracy'].idxmax()
best_classification_models = classification_results.loc[top_accuracy_labels]
best_classification_models

Unnamed: 0,Model,Dataset,Accuracy,F1-score,L1 Ratio
88,K-Nearest Neighbors,Convabuse,0.622341,0.62068,0.0
81,Decision Tree,Dynamically Generated Hate Speech,0.852399,0.851308,0.0
91,K-Nearest Neighbors,MLMA Hate Speech,0.602837,0.508122,0.0
82,Decision Tree,US Elections 2020 Hate Speech,0.703598,0.702632,0.0


In [8]:
# Display the k-means results table (silhouette score and inertia for the "Best" and "Chosen" model of each dataset)
kmeans_clustering_results

Unnamed: 0,Dataset,Best Model Silhouette Score,Best Model Inertia,Chosen Model Silhouette Score,Chosen Model Inertia
0,Convabuse,0.653482,167.6557,0.682418,40.26329
1,Dynamically Generated Hate Speech,0.592235,899.7484,0.656848,129.4781
2,US Elections 2020 Hate Speech,0.525683,7.981054e-31,0.525683,3.34249e-29
3,MLMA Hate Speech,0.115003,1034.922,0.293387,668.5472


In [9]:
# Reshape the k-means results from one row per dataset (with separate "Best"
# and "Chosen" columns) into two (dataset, silhouette score, inertia, label)
# tuples per dataset, so both candidates can sit side by side in one table.
combined_kmeans_results = []

# Columns read from each row, in the order they are unpacked below
kmeans_metric_columns = [
    'Dataset',
    'Best Model Silhouette Score', 'Best Model Inertia',
    'Chosen Model Silhouette Score', 'Chosen Model Inertia',
]

# itertuples(index=False, name=None) yields plain tuples in the column order
# selected above, without building a Series per row as iterrows does.
kmeans_rows = kmeans_clustering_results[kmeans_metric_columns].itertuples(index=False, name=None)

for dataset_name, best_silhouette, best_inertia, chosen_silhouette, chosen_inertia in kmeans_rows:
    # Entry for the "Best" model: its silhouette score and inertia
    combined_kmeans_results.append((dataset_name, best_silhouette, best_inertia, "Best"))

    # Entry for the "Chosen" model: its silhouette score and inertia
    combined_kmeans_results.append((dataset_name, chosen_silhouette, chosen_inertia, "Chosen"))

print(combined_kmeans_results)

[('Convabuse', 0.6534823780821838, 167.655658254129, 'Best'), ('Convabuse', 0.6824183576117808, 40.26328855956639, 'Chosen'), ('Dynamically Generated Hate Speech', 0.5922348082232475, 899.7484133403477, 'Best'), ('Dynamically Generated Hate Speech', 0.6568481613250476, 129.4781226169749, 'Chosen'), ('US Elections 2020 Hate Speech', 0.5256828404026201, 7.981053689540705e-31, 'Best'), ('US Elections 2020 Hate Speech', 0.5256828404026201, 3.3424899370829356e-29, 'Chosen'), ('MLMA Hate Speech', 0.1150029449182097, 1034.922427960983, 'Best'), ('MLMA Hate Speech', 0.2933872551244078, 668.5472346624641, 'Chosen')]


In [10]:
# Build a long-format table from the (dataset, silhouette score, inertia, label) tuples
combined_kmeans_results_df = pd.DataFrame(
    data=combined_kmeans_results,
    columns=['Dataset', 'Silhouette Score', 'Inertia', 'Model'],
)
combined_kmeans_results_df

Unnamed: 0,Dataset,Silhouette Score,Inertia,Model
0,Convabuse,0.653482,167.6557,Best
1,Convabuse,0.682418,40.26329,Chosen
2,Dynamically Generated Hate Speech,0.592235,899.7484,Best
3,Dynamically Generated Hate Speech,0.656848,129.4781,Chosen
4,US Elections 2020 Hate Speech,0.525683,7.981054e-31,Best
5,US Elections 2020 Hate Speech,0.525683,3.34249e-29,Chosen
6,MLMA Hate Speech,0.115003,1034.922,Best
7,MLMA Hate Speech,0.293387,668.5472,Chosen


In [11]:
# For every dataset, keep the row with the highest silhouette score.
# This uses the same groupby/idxmax approach as the regression and
# classification selections.
# - sort=False keeps the datasets in order of first appearance rather than
#   sorting them alphabetically.
# - idxmax returns the first row label when scores tie, so a "Best" row wins
#   over an equally scored "Chosen" row that follows it.
best_silhouette_labels = (
    combined_kmeans_results_df
    .groupby('Dataset', sort=False)['Silhouette Score']
    .idxmax()
)

# Select the winning rows and pin the output columns explicitly
best_kmeans_results_df = combined_kmeans_results_df.loc[
    best_silhouette_labels, ['Dataset', 'Silhouette Score', 'Inertia', 'Model']
]
best_kmeans_results_df

Unnamed: 0,Dataset,Silhouette Score,Inertia,Model
1,Convabuse,0.682418,40.26329,Chosen
3,Dynamically Generated Hate Speech,0.656848,129.4781,Chosen
4,US Elections 2020 Hate Speech,0.525683,7.981054e-31,Best
7,MLMA Hate Speech,0.293387,668.5472,Chosen


In [12]:
# Write each "best" table to its own CSV under results_path.
# index=False leaves the row labels out of the files.
best_tables_by_filename = (
    ("best_classification_models.csv", best_classification_models),
    ("best_regression_models.csv", best_regression_models),
    ("best_kmeans_clustering.csv", best_kmeans_results_df),
)

for csv_name, best_table in best_tables_by_filename:
    best_table.to_csv(f"{results_path}{csv_name}", index=False)