In [None]:
%pip install matplotlib pandas seaborn

In [None]:
import os
import re
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# News category -> directory that holds the *.log result files of that category.
log_dir_mapping = dict(
    crime='./results/crime',
    science='./results/science',
    health='./results/health',
    politics='./results/politics',
    social_media='./results/social_media',
)

# Short dataset name -> feather file read for that dataset.
dataset_mapping = dict([
    # crime
    ('fa-kes', '../data/crime/FA-KES-Dataset.feather'),
    ('snope', '../data/crime/snope.feather'),
    # health
    ('covid_claims', '../data/health/covid_claims.feather'),
    ('covid_fake_news', '../data/health/covid_fake_news_dataset.feather'),
    ('covid_FNIR', '../data/health/covid_FNIR.feather'),
    # politics
    ('fake_news', '../data/politics/fake_news_dataset.feather'),
    ('isot_dataset', '../data/politics/isot_dataset.feather'),
    ('liar_dataset', '../data/politics/liar_dataset.feather'),
    ('pheme', '../data/politics/pheme.feather'),
    ('politifact', '../data/politics/politifact_dataset.feather'),
    # science
    ('climate', '../data/science/climate_dataset.feather'),
    # social media
    ('gossipcop', '../data/social_media/gossipcop.feather'),
    ('isot_social', '../data/social_media/isot_dataset.feather'),
    # multi-purpose file stored directly under ../data
    ('isot_multipurpose', '../data/isot_multipurpose_small.feather'),
])

# News category -> names (keys of dataset_mapping) of the datasets combined for it.
combined_dataset_mappings = dict(
    crime=['fa-kes', 'snope'],
    health=['covid_claims', 'covid_fake_news', 'covid_FNIR'],
    politics=['fake_news', 'isot_dataset', 'liar_dataset', 'pheme', 'politifact'],
    science=['climate', 'isot_multipurpose'],
    social_media=['gossipcop', 'isot_social'],
)

In [None]:
# Metrics for which a logged value of exactly 0.0 causes the whole log file to be dropped.
ZERO_CHECKED_METRICS = ('Precision', 'Recall', 'F-score', 'Accuracy', 'G-mean')
# "Datasets list" value of the form "<prefix>Train: [...], Test: [...]".
TRAIN_TEST_PATTERN = re.compile(r'(.*?)\b(Train: \[.*?\]), (Test: \[.*?\])')


def parse_log_file(log_path):
    """Parse one result log into a flat dict of its "key: value" entries.

    A line contributes an entry when its second ' - '-separated field has the
    form "key: value". A key that appears more than once keeps its last value.

    Two keys get extra treatment:
      * 'Datasets list' -- when the value looks like
        "<prefix>Train: [...], Test: [...]", the prefix is stored under
        'Datasets list' and "Train: [...], Test: [...]" under
        'Datasets list detailed'; otherwise the whole value is the list.
      * 'Class weights' -- re-read from the complete line, so a value that
        itself contains ' - ' is not cut off by the field split.

    Returns:
        The dict of entries, or None when the file holds no entry at all or
        when one of ZERO_CHECKED_METRICS is logged as exactly 0.0.
    """
    log_info = {}
    datasets_list = ""
    datasets_detailed = ""

    with open(log_path, 'r') as file:
        for line in file:
            parts = line.split(' - ')
            if len(parts) < 2:
                continue
            key_value = parts[1].split(': ', maxsplit=1)
            if len(key_value) != 2:
                continue
            key, value = key_value[0].strip(), key_value[1].strip()
            log_info[key] = value

            if key in ZERO_CHECKED_METRICS:
                try:
                    if float(value) == 0.0:
                        return None  # degenerate run: drop the whole file
                except ValueError:
                    # Non-numeric metric text: keep the raw string; the later
                    # pd.to_numeric(..., errors='coerce') turns it into NaN.
                    pass
            elif key == 'Datasets list':
                detailed = TRAIN_TEST_PATTERN.match(value)
                if detailed:
                    datasets_list = detailed.group(1).strip()
                    datasets_detailed = f"{detailed.group(2)}, {detailed.group(3)}"
                elif value:
                    datasets_list = value
            elif key == 'Class weights':
                class_weights_match = re.match(r'.*Class weights: (.*)', line.strip())
                if class_weights_match:
                    log_info['Class weights'] = class_weights_match.group(1)

    if not log_info:
        return None
    log_info['Datasets list'] = datasets_list
    log_info['Datasets list detailed'] = datasets_detailed
    return log_info


log_data = []

for directory in log_dir_mapping.values():
    if not os.path.isdir(directory):
        continue
    # Sorted so the row order of the table does not depend on the file system.
    for log_file in sorted(f for f in os.listdir(directory) if f.endswith('.log')):
        log_info = parse_log_file(os.path.join(directory, log_file))
        if log_info is not None:
            log_data.append(log_info)

# Creating DataFrame
df = pd.DataFrame(log_data)
df.head()


In [None]:
# Write the parsed log table to a CSV file in the working directory, without the row index.
results_csv_path = 'result_analysis.csv'
df.to_csv(results_csv_path, index=False)

In [None]:
# The metric columns hold the text read from the logs; convert them to numbers
# (anything unparseable becomes NaN) and round to four decimals.
metric_columns = ['Accuracy', 'G-mean', 'F-score', 'Recall', 'Precision']
for metric in metric_columns:
    df[metric] = pd.to_numeric(df[metric], errors='coerce').round(4)

# Row label of the most accurate run for every 'Datasets list' value. Rows without
# a usable accuracy are left out first, because a group that contains only NaN
# yields no valid row label for the df.loc lookup below.
best_models = df.dropna(subset=['Accuracy']).groupby('Datasets list')['Accuracy'].idxmax()
best_models_info = df.loc[best_models, ['Datasets list', 'Accuracy', 'G-mean', 'F-score', 'Recall', 'Precision', 'Model saved to', 'Model name']]
best_models_info


In [None]:
# Parallel lists for the plotting cell: position k of every list describes the
# k-th dataset that could be loaded.
dataset_names, num_entries_list = [], []
fake_counts, not_fake_counts = [], []
fake_percentages, not_fake_percentages = [], []

# Size and class balance of every dataset (label 0 is counted as 'Fake',
# label 1 as 'Not Fake'); datasets that cannot be read are reported and skipped.
for dataset_name, file_path in dataset_mapping.items():
    try:
        data = pd.read_feather(file_path)
        labels = data['label']
        label_counts = labels.value_counts()
        label_shares = labels.value_counts(normalize=True) * 100
        stats = (
            len(data),
            label_counts.get(0, 0),
            label_counts.get(1, 0),
            label_shares.get(0, 0),
            label_shares.get(1, 0),
        )
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        print("-" * 30)
    except KeyError:
        print(f"No 'label' column found in {dataset_name}.")
        print("-" * 30)
    else:
        dataset_names.append(dataset_name)
        targets = (num_entries_list, fake_counts, not_fake_counts,
                   fake_percentages, not_fake_percentages)
        for target, stat in zip(targets, stats):
            target.append(stat)


In [None]:
# One bar chart per loaded dataset on a grid that is four panels wide. The number
# of rows follows the number of datasets, so every dataset gets a panel and every
# left-over panel can be hidden.
n_cols = 4
n_rows = max(1, -(-len(dataset_names) // n_cols))  # ceiling division, at least one row
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 6.25 * n_rows), squeeze=False)
axs = axs.flatten()
colors = ['orange', 'blue']  # Assign colors for 'Fake' and 'Not Fake'

panel_stats = zip(dataset_names, num_entries_list, fake_counts, not_fake_counts,
                  fake_percentages, not_fake_percentages)
for ax, (dataset_name, num_entries, fake_count, not_fake_count, fake_percentage, not_fake_percentage) in zip(axs, panel_stats):
    bar = ax.bar(['Fake', 'Not Fake'], [fake_count, not_fake_count], color=colors)
    ax.set_title(f'{dataset_name}\nEntries: {num_entries}')
    ax.set_ylabel('Counts')

    # Annotate bars with percentages
    for rect, percentage in zip(bar, [fake_percentage, not_fake_percentage]):
        height = rect.get_height()
        ax.annotate(f'{percentage:.1f}%', xy=(rect.get_x() + rect.get_width() / 2, height), xytext=(0, 3),
                    textcoords="offset points", ha='center', va='bottom')

# Hide every panel that did not receive a dataset, including the last one of the grid
for unused_ax in axs[len(dataset_names):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Read and combine data for each category. Files that are missing are reported
# and skipped; a category without any readable file is left out entirely.
combined_dataframes = {}
for category, datasets in combined_dataset_mappings.items():
    frames = []
    for dataset in datasets:
        try:
            frames.append(pd.read_feather(dataset_mapping[dataset]))
        except FileNotFoundError:
            print(f"File {dataset_mapping[dataset]} not found.")
    if frames:
        combined_dataframes[category] = pd.concat(frames)

# Plot class distribution for each category
class_order = ['Fake', 'Not Fake']
fig = plt.figure(figsize=(12, 8))
for i, (category, data) in enumerate(combined_dataframes.items()):
    ax = fig.add_subplot(2, 3, i + 1)
    # value_counts orders by frequency, so the series is put into a fixed class
    # order (a missing class becomes 0) before plotting; bars and annotations are
    # then guaranteed to refer to the same class.
    class_distribution = (
        data['label'].value_counts(normalize=True)
        .rename({0: 'Fake', 1: 'Not Fake'})
        .reindex(class_order, fill_value=0.0)
    )
    class_distribution.plot(kind='bar', color=['orange', 'blue'], ax=ax, rot=0)
    ax.set_title(f'Combined Class Distribution - {category}\nTotal Entries: {len(data)}')  # Include total number of entries
    ax.set_xlabel('Class')
    ax.set_ylabel('Percentage')
    ax.set_ylim(0, 1)

    # Annotate bars with percentage values
    for index, value in enumerate(class_distribution):
        ax.text(index, value + 0.05, f'{value:.2%}', ha='center', color='black')

plt.tight_layout()
plt.show()

In [None]:
# Average accuracy per model, one horizontal bar chart per 'Datasets list' value.
# Panels are laid out three per row; the grid has two rows as long as at most six
# values exist and gains rows beyond that, so every value gets its own panel.
categories = df['Datasets list'].unique()
model_names = df['Model name'].unique()

n_cols = 3
n_rows = max(2, -(-len(categories) // n_cols))  # ceiling division, never fewer than two rows
fig = plt.figure(figsize=(20, 4 * n_rows))

for i, category in enumerate(categories):
    category_data = df[df['Datasets list'] == category]
    # Mean accuracy and number of non-missing values per model, lowest mean first
    avg_accuracies = category_data.groupby('Model name')['Accuracy'].agg(['mean', 'count']).sort_values('mean')
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    bars = ax.barh(range(len(avg_accuracies)), avg_accuracies['mean'], align='center')
    ax.set_yticks(range(len(avg_accuracies)))
    ax.set_yticklabels(avg_accuracies.index)
    ax.set_xlabel('Average Accuracy')
    ax.set_title(f'Average Accuracy by Model - {category}')
    ax.margins(y=0)  # no padding above and below the bars
    ax.set_xlim(0, 1.0)  # Set x-axis range from 0 to 1.0

    # Annotate bars with the average accuracy as a percentage and the number of runs
    for bar, accuracy, count in zip(bars, avg_accuracies['mean'], avg_accuracies['count']):
        ax.text(bar.get_width() - 0.05, bar.get_y() + bar.get_height() / 2, f'{accuracy:.1%} ({count} runs)',
                va='center', ha='right', color='white', fontweight='bold')

plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust the spacing between subplots
plt.show()



In [None]:
# Average G-mean per model, one horizontal bar chart per 'Datasets list' value.
# Panels are laid out three per row; the grid has two rows as long as at most six
# values exist and gains rows beyond that, so every value gets its own panel.
categories = df['Datasets list'].unique()
model_names = df['Model name'].unique()

n_cols = 3
n_rows = max(2, -(-len(categories) // n_cols))  # ceiling division, never fewer than two rows
fig = plt.figure(figsize=(20, 4 * n_rows))

for i, category in enumerate(categories):
    category_data = df[df['Datasets list'] == category]
    # Mean G-mean and number of non-missing values per model, lowest mean first
    avg_g_means = category_data.groupby('Model name')['G-mean'].agg(['mean', 'count']).sort_values('mean')
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    bars = ax.barh(range(len(avg_g_means)), avg_g_means['mean'], align='center', color='red')
    ax.set_yticks(range(len(avg_g_means)))
    ax.set_yticklabels(avg_g_means.index)
    ax.set_xlabel('Average G-mean')
    ax.set_title(f'Average G-mean by Model - {category}')
    ax.margins(y=0)  # no padding above and below the bars
    ax.set_xlim(0, 1.0)  # Set x-axis range from 0 to 1.0

    # Annotate bars with the average G-mean as a percentage and the number of runs
    for bar, g_mean, count in zip(bars, avg_g_means['mean'], avg_g_means['count']):
        ax.text(bar.get_width() - 0.05, bar.get_y() + bar.get_height() / 2, f'{g_mean:.1%} ({count} runs)',
                va='center', ha='right', color='black', fontweight='bold')

plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust the spacing between subplots
plt.show()


In [None]:
# Average F-score per model, one horizontal bar chart per 'Datasets list' value.
# Panels are laid out three per row; the grid has two rows as long as at most six
# values exist and gains rows beyond that, so every value gets its own panel.
categories = df['Datasets list'].unique()
model_names = df['Model name'].unique()

n_cols = 3
n_rows = max(2, -(-len(categories) // n_cols))  # ceiling division, never fewer than two rows
fig = plt.figure(figsize=(20, 4 * n_rows))

for i, category in enumerate(categories):
    category_data = df[df['Datasets list'] == category]
    # Mean F-score and number of non-missing values per model, lowest mean first
    avg_f_scores = category_data.groupby('Model name')['F-score'].agg(['mean', 'count']).sort_values('mean')
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    bars = ax.barh(range(len(avg_f_scores)), avg_f_scores['mean'], align='center', color='green')
    ax.set_yticks(range(len(avg_f_scores)))
    ax.set_yticklabels(avg_f_scores.index)
    ax.set_xlabel('Average F-score')
    ax.set_title(f'Average F-score by Model - {category}')
    ax.margins(y=0)  # no padding above and below the bars
    ax.set_xlim(0, 1.0)  # Set x-axis range from 0 to 1.0

    # Annotate bars with the average F-score as a percentage and the number of runs
    for bar, f_score, count in zip(bars, avg_f_scores['mean'], avg_f_scores['count']):
        ax.text(bar.get_width() - 0.05, bar.get_y() + bar.get_height() / 2, f'{f_score:.1%} ({count} runs)',
                va='center', ha='right', color='black', fontweight='bold')

plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust the spacing between subplots
plt.show()

In [None]:
# Best, worst and mean accuracy per 'Datasets list' value, highest mean first.
category_accuracies = (
    df.groupby('Datasets list')['Accuracy']
    .agg(['max', 'min', 'mean'])
    .sort_values('mean', ascending=False)
)

# Three colours taken from the 'viridis' palette, one per bar series.
colors_viridis = sns.color_palette("viridis", 3)

categories = category_accuracies.index
bar_width = 0.3
index = range(len(categories))

plt.figure(figsize=(10, 6))

# Grouped bars: series number k is shifted k bar widths to the right of the group start.
accuracy_series = [('max', 'Best Accuracy'), ('min', 'Worst Accuracy'), ('mean', 'Average Accuracy')]
for slot, (statistic, legend_label) in enumerate(accuracy_series):
    positions = [i + slot * bar_width for i in index]
    plt.bar(positions, category_accuracies[statistic], bar_width,
            color=colors_viridis[slot], label=legend_label)

plt.title('Best, Worst, and Average Accuracy by Category')
plt.xlabel('Categories')
plt.ylabel('Accuracy')
# Tick labels sit under the middle bar of each group.
plt.xticks([i + bar_width for i in index], categories)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Make sure the G-mean column is numeric (unparseable entries become NaN).
df['G-mean'] = pd.to_numeric(df['G-mean'], errors='coerce')

# Best, worst and mean G-mean per 'Datasets list' value, highest mean first.
category_g_mean = (
    df.groupby('Datasets list')['G-mean']
    .agg(['max', 'min', 'mean'])
    .sort_values('mean', ascending=False)
)

# Default-length 'mako' palette; entries 2, 0 and 4 are used for the three series.
colors_g_mean = sns.color_palette("mako")

categories = category_g_mean.index
bar_width = 0.3
index = range(len(categories))

plt.figure(figsize=(10, 6))

# Grouped bars: series number k is shifted k bar widths to the right of the group start.
g_mean_series = [('max', 2, 'Best G-mean'), ('min', 0, 'Worst G-mean'), ('mean', 4, 'Average G-mean')]
for slot, (statistic, palette_slot, legend_label) in enumerate(g_mean_series):
    positions = [i + slot * bar_width for i in index]
    plt.bar(positions, category_g_mean[statistic], bar_width,
            color=colors_g_mean[palette_slot], label=legend_label)

plt.title('Best, Worst, and Average G-mean by Category')
plt.xlabel('Categories')
plt.ylabel('G-mean')
# Tick labels sit under the middle bar of each group.
plt.xticks([i + bar_width for i in index], categories)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Make sure the F-score column is numeric (unparseable entries become NaN).
df['F-score'] = pd.to_numeric(df['F-score'], errors='coerce')

# Best, worst and mean F-score per 'Datasets list' value, highest mean first.
category_f_score = (
    df.groupby('Datasets list')['F-score']
    .agg(['max', 'min', 'mean'])
    .sort_values('mean', ascending=False)
)

# Default-length 'rocket' palette; entries 2, 0 and 4 are used for the three series.
colors_f_score = sns.color_palette("rocket")

categories = category_f_score.index
bar_width = 0.3
index = range(len(categories))

plt.figure(figsize=(10, 6))

# Grouped bars: series number k is shifted k bar widths to the right of the group start.
f_score_series = [('max', 2, 'Best F-score'), ('min', 0, 'Worst F-score'), ('mean', 4, 'Average F-score')]
for slot, (statistic, palette_slot, legend_label) in enumerate(f_score_series):
    positions = [i + slot * bar_width for i in index]
    plt.bar(positions, category_f_score[statistic], bar_width,
            color=colors_f_score[palette_slot], label=legend_label)

plt.title('Best, Worst, and Average F-score by Category')
plt.xlabel('Categories')
plt.ylabel('F-score')
# Tick labels sit under the middle bar of each group.
plt.xticks([i + bar_width for i in index], categories)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# First number with a decimal point or an exponent found in the logged text.
# The exponent is part of the match so that a value such as "3.2e-05" is not cut
# down to "3.2", and a float rendered as "1e-05" is still recognised.
seconds_pattern = r'(\d+(?:\.\d+)?[eE][-+]?\d+|\d+\.\d+)'
for time_column in ['Training time', 'Inference time']:
    # Entries without such a number (including missing ones) become NaN.
    df[time_column] = df[time_column].astype(str).str.extract(seconds_pattern, expand=False)

df['Runtime'] = pd.to_numeric(df['Training time']) + pd.to_numeric(df['Inference time'])
category_runtime = df.groupby('Datasets list')['Runtime'].mean()

# Sort the categories in descending order
category_runtime = category_runtime.sort_values(ascending=False)

# Set color palette
colors = sns.color_palette("Set2")

# Plotting the average runtime per category
plt.figure(figsize=(10, 6))

categories = category_runtime.index
index = range(len(categories))

plt.bar(index, category_runtime, color=colors)

plt.xlabel('Categories')
plt.ylabel('Average Runtime in Seconds')
plt.title('Average Runtime of Models per Category')
plt.xticks(index, categories, rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# First number with a decimal point or an exponent found in the logged text.
# The exponent is part of the match so that a value such as "3.2e-05" is not cut
# down to "3.2", and a float rendered as "1e-05" is still recognised.
seconds_pattern = r'(\d+(?:\.\d+)?[eE][-+]?\d+|\d+\.\d+)'
for time_column in ['Training time', 'Inference time']:
    # Entries without such a number (including missing ones) become NaN.
    extracted = df[time_column].astype(str).str.extract(seconds_pattern, expand=False)
    df[time_column] = pd.to_numeric(extracted)

# Mean time per 'Datasets list' value. Each series is sorted on its own, so the
# category order of the two panels can differ.
category_training_time = df.groupby('Datasets list')['Training time'].mean().sort_values(ascending=False)
category_testing_time = df.groupby('Datasets list')['Inference time'].mean().sort_values(ascending=False)

fig, (ax_training, ax_inference) = plt.subplots(1, 2, figsize=(12, 6))

# Plotting training time
category_training_time.plot(kind='bar', color='skyblue', ax=ax_training, rot=45)
ax_training.set_title('Average Training Time per Category')
ax_training.set_xlabel('Categories')
ax_training.set_ylabel('Average Training Time (seconds)')

# Plotting inference time
category_testing_time.plot(kind='bar', color='salmon', ax=ax_inference, rot=45)
ax_inference.set_title('Average Inference Time per Category')
ax_inference.set_xlabel('Categories')
ax_inference.set_ylabel('Average Inference Time (seconds)')

plt.tight_layout()
plt.show()