# In [None]:
# OFCOM Survey Data Analysis & Visualisation
## 1. Importing necessary Libraries & Modules
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np
import datetime as dt

# Close every matplotlib figure that is currently open before any new plotting starts.
plt.close('all')
## 2. Loading the OFCOM survey data, Regional COVID Cases, Mortality Data and Vaccination Data and UK cases/mortality data 
### Loading the OFCOM data:
# Directory that holds the cleaned OFCOM survey CSV files.
path_for_datasets = '1_data_cleaning_preprocessing/2_cleaned_files/ofcom_survey_data'

directory_path = Path(path_for_datasets)

file_list = [f.name for f in directory_path.iterdir() if f.is_file()]

file_paths = [f'{path_for_datasets}/{file}' for file in file_list if file[-4:] == '.csv']

# Map each file name, minus its last 12 characters (presumably a
# '_cleaned.csv'-style suffix -- confirm), to the dataframe loaded from it.
# The key is derived from the path being read rather than from file_list[i]:
# file_list also contains non-CSV files, so its positions do not line up with
# file_paths and indexing it with the loop counter could label a dataframe
# with the wrong file name.
ofcom_dataframes = {}
for file_path in file_paths:
    dataframe = pd.read_csv(file_path)

    # Parse both date columns (unparseable values become NaT) and keep only
    # the calendar date.
    for date_column in ('start_date', 'end_date'):
        dataframe[date_column] = pd.to_datetime(dataframe[date_column], errors='coerce').dt.date

    ofcom_dataframes[Path(file_path).name[:-12]] = dataframe
### Loading the ukhsa regional information for mortality, vaccination and cases
# Regional UKHSA extracts: mortality, cases and vaccination.
ukhsa_mortality_dataframe = pd.read_csv(
    filepath_or_buffer='1_data_cleaning_preprocessing/2_cleaned_files/ukhsa_mortality_data/uk_regional_covid_mortality_cleaned.csv'
)
ukhsa_cases_dataframe = pd.read_csv(
    filepath_or_buffer='1_data_cleaning_preprocessing/2_cleaned_files/ukhsa_cases_data/ukhsa_cases_data_cleaned.csv'
)
ukhsa_vaccination_dataframe = pd.read_csv(
    filepath_or_buffer='1_data_cleaning_preprocessing/2_cleaned_files/ukhsa_vaccination_data/ukhsa_vaccination_data_cleaned.csv'
)
### Loading the UK COVID API data
covid_api_uk_dataframe = pd.read_csv(
    filepath_or_buffer='1_data_cleaning_preprocessing/1_api_connectors_and_csv_parsers/2_processed_databases/covid_19_api_data/17_03_2020_29_11_2024_GBR.csv'
)
# Keep only the rows whose province is one of the four listed below.
provinces_of_interest = ['United Kingdom', 'England', 'Scotland', 'Wales']
covid_api_uk_dataframe = covid_api_uk_dataframe.loc[
    covid_api_uk_dataframe['province'].isin(provinces_of_interest)
]
# 3. Frequency of getting information and news on the pandemic
## I first get the corresponding dataframe
dataframe = ofcom_dataframes['frequency_of_getting_infonews_about_coronavirus_outbreak_in_last_week']
## I ensure that the dates are appropriately formatted
# Unparseable values become NaT; only the calendar date is kept.
for date_column in ('start_date', 'end_date'):
    dataframe[date_column] = pd.to_datetime(dataframe[date_column], errors='coerce').dt.date
## I want to plot the percentage of respondents over time, so I collect the weighted base recorded for each start date and express every response total as a percentage of it
weighted_bases = dataframe[dataframe['response'] == 'Weighted base'][['start_date', 'total']].reset_index(drop=True)

# start_date -> weighted base total. Built once here instead of inside
# get_total, which is called once per row by Series.map below.
weighted_base_lookup = weighted_bases.set_index('start_date')['total'].to_dict()


def get_total(start_date):
    """Return the weighted base total recorded for ``start_date``, or None if there is none."""
    return weighted_base_lookup.get(start_date)


dataframe['weighted_base'] = dataframe['start_date'].map(get_total)
dataframe['percentage_total_respondents'] = round((dataframe['total'] / dataframe['weighted_base']) * 100, 2)
dataframe = dataframe[['start_date', 'response', 'percentage_total_respondents']]
# Drop the base rows and the NET roll-up so that only individual answers are plotted.
dataframe = dataframe[~dataframe['response'].isin(['Unweighted base', 'NET: At least once a day', 'Weighted base'])]
## I then get the comparative data from the COVID-19 API to plot on the graph
# Earliest and latest survey start dates; they bound the case data used for comparison.
start_date = min(dataframe['start_date'])
end_date = max(dataframe['start_date'])

cases_dataframe = covid_api_uk_dataframe.copy()
cases_dataframe['date'] = pd.to_datetime(cases_dataframe['date'], errors='coerce').dt.date

# Restrict to the survey window, then to the England rows and the two columns that get plotted.
inside_survey_window = (cases_dataframe['date'] >= start_date) & (cases_dataframe['date'] <= end_date)
cases_dataframe = cases_dataframe[inside_survey_window]
is_england = cases_dataframe['province'] == 'England'
cases_dataframe = cases_dataframe.loc[is_england, ['date', 'confirmed_diff']]

# The percentile bounds are taken before the forward fill, so filled values do not influence them.
lower_percentile = cases_dataframe['confirmed_diff'].quantile(0.01)
upper_percentile = cases_dataframe['confirmed_diff'].quantile(0.99)
cases_dataframe = cases_dataframe.ffill()

# Keep only the days whose daily case count lies within the 1st-99th percentile band (inclusive).
within_band = cases_dataframe['confirmed_diff'].between(lower_percentile, upper_percentile)
df_no_outliers = cases_dataframe[within_band]
### Finally, I plot the graph:
# Two-axis figure: survey percentages on the left axis (ax1), daily case counts on the right axis (ax2).
fig, ax1 = plt.subplots(figsize=(15, 10))

response_categories = dataframe['response'].unique()
colours = plt.cm.viridis(np.linspace(0, 1, len(response_categories)))

# One line per response category.
for i, category in enumerate(response_categories):
    category_data = dataframe[dataframe['response'] == category]
    ax1.plot(category_data['start_date'], category_data['percentage_total_respondents'],
             label=category, linestyle='-', color=colours[i])

ax1.set_xlabel('Date')
ax1.set_ylabel('Percentage of Total Survey Respondents')

# One tick per month, labelled like "Mar 2020".
ax1.xaxis.set_major_locator(mdates.MonthLocator())
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

ax1.set_xlim([dt.datetime(2020, 3, 27), dt.datetime(2021, 6, 4)])

ax2 = ax1.twinx()
ax2.plot(df_no_outliers['date'], df_no_outliers['confirmed_diff'], label='COVID Cases', linestyle='--', color='black')
ax2.set_ylabel('Number of COVID Cases Per Day')

# The legends are created before the lockdown spans are drawn, so the spans get no legend entries.
ax1.legend(title='COVID News Search Frequency:', loc='upper left')
ax2.legend(title='Number of reported COVID Cases Per Day', loc='upper right')

ax1.grid(False)
ax2.grid(False)

for spine in ax1.spines.values():
    spine.set_visible(True)
    spine.set_color('black')

ax1.set_facecolor('white')
ax2.set_facecolor('white')

lockdown_periods = [
    ('2020-03-27', '2020-05-10', '1st Lockdown'),
    ('2020-11-05', '2020-12-02', '2nd Lockdown'),
    ('2021-01-06', '2021-03-08', '3rd Lockdown'),
]

# Dedicated loop names are used so the module-level start_date / end_date
# (the survey window computed earlier) are not overwritten with these strings.
for lockdown_start, lockdown_end, label in lockdown_periods:
    ax1.axvspan(
        dt.datetime.strptime(lockdown_start, '%Y-%m-%d'),
        dt.datetime.strptime(lockdown_end, '%Y-%m-%d'),
        color='gray', alpha=0.3, label=label
    )

# plt.text / plt.title act on the current axes, which is the twin axis created
# above, so the y position 51000 is expressed in daily-case units.
plt.text(
    dt.datetime(2021, 4, 17), 51000,
    'Shaded areas\nrepresent lockdown\nperiods',
    fontsize=10,
    color='black',
    ha='center',
    va='center',
    bbox=dict(
        facecolor='white',
        edgecolor='black',
        boxstyle='round,pad=0.5',
        alpha=0.3
    )
)

plt.title('Time Series of COVID News Search Frequency and Number of COVID Cases')

plt.tight_layout()

plt.savefig(
    '3_final_figures/misinformation_ofcom/frequency_of_sourcing_news/misinformation_frequency_of_news_covid_timeseries.png',
    dpi=300)

plt.show()
# 4. Sources used to get information about the coronavirus outbreak
# Pull the "sources used" survey table out of the OFCOM dataframes loaded above.
sources_used_dataframe = ofcom_dataframes['sources_used_to_get_infonews_about_coronavirus_outbreak_in_last_week']


### As I will be plotting many time series for the graph, I made a function to return the weighted percentages:
def calculate_percentage(dataframe):
    """Express each row's ``total`` as a percentage of its weighted base.

    Rows whose ``response`` equals ``'Weighted base'`` supply the denominator
    for every row that shares their ``start_date``.

    Args:
        dataframe: DataFrame with ``start_date``, ``response`` and ``total``
            columns. It is left unmodified.

    Returns:
        A new DataFrame with the columns ``start_date``, ``response`` and
        ``percentage_total_respondents`` (rounded to two decimal places).
        Rows whose ``start_date`` has no ``'Weighted base'`` row have no valid
        denominator (NaN in the usual case).
    """
    is_weighted_base = dataframe['response'] == 'Weighted base'
    weighted_bases = dataframe.loc[is_weighted_base, ['start_date', 'total']]

    # Build the start_date -> weighted base lookup once, not once per row.
    lookup = weighted_bases.set_index('start_date')['total'].to_dict()

    # Work on derived Series so the caller's dataframe gains no extra columns.
    weighted_base = dataframe['start_date'].map(lookup.get)
    percentage = round((dataframe['total'] / weighted_base) * 100, 2)

    result = dataframe[['start_date', 'response']].copy()
    result['percentage_total_respondents'] = percentage
    return result


sources_used_dataframe = calculate_percentage(sources_used_dataframe)
# sources_used_dataframe
# Drop the base rows and the listed NET roll-up. The explicit copy makes the
# filtered frame independent, so the label assignment further down writes to
# this frame itself rather than to a slice of another one.
sources_used_dataframe = sources_used_dataframe[
    ~sources_used_dataframe['response'].isin(['Unweighted base', 'NET: At least once a day', 'Weighted base'])].copy()
### From the response categories (uncomment) I have selected a few broad categories of interest for different plots
# sources_used_dataframe['response'].unique()
local_news = [
    "Local sources across TV, radio and online",
    "Family and friends directly",
    "Community radio",
    "People in your local area/neighbourhood"
]
# NOTE(review): the printed broadsheet entry has no "( printed )" suffix, unlike
# the other print titles -- confirm it matches the survey wording.
newspapers = [
    '“Red-top tabloids” such as The Sun or Daily Mirror ( printed )',
    '“Red-top tabloids” such as The Sun or Mirror ( online )',
    '“Broadsheets” such as The Times or Guardian',
    '“Broadsheets” such as The Times or Guardian ( online )',
    '“Mid-market tabloids” such as The Daily Mail or Daily Express ( printed )',
    '“Mid-market tabloids” such as MailOnline or Express ( online )'
]
#### After playing with the plotting function, I noted that the response changed from Direct from Government website/ email/ text/ post to Direct from UK Government website/ email/ text/ post
official_sources = [
    "Direct from Local health service website/ email/ text/ post",
    "Direct from NHS website/ email/ text/ post",
    "Direct from World Health Organisation (WHO) website/ email/ text/ post",
    "Direct from UK Government website/ email/ text/ post",
    "Direct from Local council website/ email/ text/ post",
    "Official scientists"
]
# Fold the older label into the newer one so both form a single series.
uses_old_government_label = sources_used_dataframe['response'] == "Direct from Government website/ email/ text/ post"
sources_used_dataframe.loc[
    uses_old_government_label, 'response'] = "Direct from UK Government website/ email/ text/ post"
television = [
    'BBC - TV',
    'Channel 4',
    'Channel 5',
    'ITV'
]
social_media = [
    "NET: Social Media",
    "NET: Facebook (Facebook and Facebook Messenger)",
    "NET: WhatsApp (WhatsApp and WhatsApp groups)",
    "Instagram",
    "Twitter",
    "YouTube",
    "Snapchat",
]


## I created a function to plot similar time series graphs, to save time and to decide which plots are most relevant
def plot_time_series(dataframe_1, dataframe_2, categories, filename, plot_title, x_title, y_title, y2_title,
                     legend_1_title, legend_2_title, min_date, max_date):
    """Plot survey percentages against daily COVID cases and save the figure.

    One solid line per entry of ``categories`` is drawn on the left axis and
    a single dashed black line for ``dataframe_2`` on the right (twin) axis.
    Three fixed lockdown periods are shaded in grey.

    Args:
        dataframe_1: frame with ``response``, ``start_date`` and
            ``percentage_total_respondents`` columns.
        dataframe_2: frame with ``date`` and ``confirmed_diff`` columns.
        categories: ``response`` values to plot, one line each.
        filename: output path without extension; ``.png`` is appended.
        plot_title: figure title.
        x_title: x-axis label.
        y_title: left y-axis label.
        y2_title: right y-axis label; also the legend label of the cases line.
        legend_1_title: title of the left-axis legend.
        legend_2_title: title of the right-axis legend.
        min_date: lower x-axis limit.
        max_date: upper x-axis limit.

    Returns:
        None. The figure is written at 300 dpi and every open matplotlib
        figure is closed afterwards.
    """
    fig, ax1 = plt.subplots(figsize=(15, 10))

    # One colour per plotted category. Sizing the palette from `categories`
    # itself guarantees there is a colour for every line, independent of any
    # module-level dataframe.
    colours = plt.cm.viridis_r(np.linspace(0, 1, len(categories)))

    for colour, category in zip(colours, categories):
        category_data = dataframe_1[dataframe_1['response'] == category]
        ax1.plot(category_data['start_date'], category_data['percentage_total_respondents'],
                 label=category, linestyle='-', color=colour)
    ax1.grid(False)

    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color('black')

    ax1.set_facecolor('white')

    ax1.set_xlabel(f'{x_title}')
    ax1.set_ylabel(f'{y_title}')

    ax2 = ax1.twinx()

    ax2.grid(False)

    ax2.set_facecolor('white')

    ax2.plot(dataframe_2['date'], dataframe_2['confirmed_diff'], label=f'{y2_title}', linestyle='--', color='black')
    ax2.set_ylabel(f'{y2_title}')

    # The legends are created before the lockdown spans are drawn, so the spans get no legend entries.
    ax1.legend(title=f'{legend_1_title}', loc='upper left')
    ax2.legend(title=f'{legend_2_title}', loc='upper right')

    # One tick per month, labelled like "Mar 2020".
    ax1.xaxis.set_major_locator(mdates.MonthLocator())
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    ax1.set_xlim([min_date, max_date])

    lockdown_periods = [
        ('2020-03-27', '2020-05-10', '1st Lockdown'),
        ('2020-11-05', '2020-12-02', '2nd Lockdown'),
        ('2021-01-06', '2021-03-08', '3rd Lockdown'),
    ]

    for lockdown_start, lockdown_end, label in lockdown_periods:
        ax1.axvspan(
            dt.datetime.strptime(lockdown_start, '%Y-%m-%d'),
            dt.datetime.strptime(lockdown_end, '%Y-%m-%d'),
            color='gray', alpha=0.3, label=label
        )

    # plt.text / plt.title act on the current axes, which is the twin axis
    # created above, so the y position 51000 is in the units of ``confirmed_diff``.
    plt.text(
        dt.datetime(2021, 4, 17), 51000,
        'Shaded areas\nrepresent lockdown\nperiods',
        fontsize=10,
        color='black',
        ha='center',
        va='center',
        bbox=dict(
            facecolor='white',
            edgecolor='black',
            boxstyle='round,pad=0.5',
            alpha=0.3
        )
    )

    plt.title(f'{plot_title}')
    plt.tight_layout()

    plt.savefig(f'{filename}.png', dpi=300)

    plt.close('all')


#### I set the root for the file path for the news_source_figures
# The trailing slash matters: file names are appended to this string directly.
root_image_path = '3_final_figures/misinformation_ofcom/news_sources_figures/'


## I also create two functions to filter the dataframe and return minimum and maximum date
def filter_dataframe(dataframe, category):
    """Return a new frame with only the rows whose ``response`` is listed in ``category``.

    The input dataframe is left untouched.
    """
    keep_row = dataframe['response'].isin(category)
    return dataframe.loc[keep_row].copy()


def return_min_max_date(dataframe):
    """Return ``(earliest, latest)`` value of the ``start_date`` column."""
    start_dates = dataframe['start_date']
    return start_dates.min(), start_dates.max()


## Plotting the newspaper sources dataframe
newspaper_dataframe = filter_dataframe(sources_used_dataframe, newspapers)
min_date, max_date = return_min_max_date(newspaper_dataframe)
plot_time_series(
    dataframe_1=newspaper_dataframe,
    dataframe_2=df_no_outliers,
    categories=newspapers,
    filename=root_image_path + 'newspaper_sources_time_series',
    plot_title='Time Series of Newspaper Source Usage and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents using Newspaper Source',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## Plotting the official sources data
official_sources_dataframe = filter_dataframe(sources_used_dataframe, official_sources)
min_date, max_date = return_min_max_date(official_sources_dataframe)
plot_time_series(
    dataframe_1=official_sources_dataframe,
    dataframe_2=df_no_outliers,
    categories=official_sources,
    filename=root_image_path + 'official_sources_time_series',
    plot_title='Time Series of Official Source Usage and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents using Official Source',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## Plotting the television sources data
television_sources_dataframe = filter_dataframe(sources_used_dataframe, television)
min_date, max_date = return_min_max_date(television_sources_dataframe)
plot_time_series(
    dataframe_1=television_sources_dataframe,
    dataframe_2=df_no_outliers,
    categories=television,
    filename=root_image_path + 'television_time_series',
    plot_title='Time Series of Television News Source Usage and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents using Television News Source',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## Plotting the social media source usage
social_media_dataframe = filter_dataframe(sources_used_dataframe, social_media)
min_date, max_date = return_min_max_date(social_media_dataframe)
plot_time_series(
    dataframe_1=social_media_dataframe,
    dataframe_2=df_no_outliers,
    categories=social_media,
    filename=root_image_path + 'social_media_time_series',
    plot_title='Time Series of Social Media News Source Usage and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents using Social Media News Source',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## Plotting the local news source usage
local_news_dataframe = filter_dataframe(sources_used_dataframe, local_news)
min_date, max_date = return_min_max_date(local_news_dataframe)
plot_time_series(
    dataframe_1=local_news_dataframe,
    dataframe_2=df_no_outliers,
    categories=local_news,
    filename=root_image_path + 'local_news_time_series',
    plot_title='Time Series of Local News Source Usage and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents (%)',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents using Local News Source',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
# 5. Most important news source
# Pull the "most important source" survey table out of the OFCOM dataframes loaded above.
most_important_source_dataframe = ofcom_dataframes['most_important_source_used_in_last_week']
# Bare expression left over from the notebook (it displayed the frame there); it has no effect in a script.
most_important_source_dataframe


def plot_pie_chart(data, labels, output_file, title, colorscheme='viridis', explode_index=0, figsize=(16, 12)):
    """Draw a labelled pie chart, save it to ``output_file`` and show it.

    Args:
        data: wedge sizes.
        labels: one label per wedge; also determines how many colours are
            sampled from the colormap.
        output_file: path the figure is saved to (300 dpi).
        title: chart title.
        colorscheme: name of the matplotlib colormap to sample colours from.
        explode_index: position of the wedge to pull out of the pie, or
            ``None`` to pull out no wedge.
        figsize: figure size passed to ``plt.figure``.

    Returns:
        None. Every open matplotlib figure is closed afterwards.
    """
    plt.figure(figsize=figsize)

    # Sample up to 0.95 rather than 1.0 of the colormap range.
    colors = plt.colormaps.get_cmap(colorscheme)(np.linspace(0, 0.95, len(labels)))

    # Offset per wedge: 0.1 for the highlighted wedge, 0 for all others.
    # Skipped when there is nothing to highlight, so empty data does not
    # fail on the index assignment.
    explosion = np.zeros(len(data))
    if explode_index is not None and len(data) > 0:
        explosion[explode_index] = 0.1

    _, _, autotexts = plt.pie(
        data,
        labels=labels,
        colors=colors,
        autopct='%1.1f%%',
        startangle=140,
        labeldistance=1.1,
        explode=explosion

    )

    # Percentage labels are drawn in white.
    for autotext in autotexts:
        autotext.set_color('white')

    plt.title(title, fontsize=14)

    plt.tight_layout()

    plt.savefig(output_file, dpi=300)

    plt.show()

    plt.close('all')


## Pie chart of all source frequencies
# Total respondents per individual source, summed over every survey wave;
# NET roll-ups and base rows are excluded.
source_frequency = most_important_source_dataframe.copy()
source_frequency = source_frequency[['response', 'total']]
source_frequency = source_frequency.groupby('response').sum()
source_frequency = source_frequency[~source_frequency.index.str.contains('NET:|Weighted base|Unweighted base')]
source_frequency = source_frequency.sort_values(by='total', ascending=False)

# Keep the 15 largest sources and fold everything else into the 'Other' row.
# The top slice is copied and updated with a single .loc assignment so the
# merged total is actually stored in biggest_news_sources; assigning into a
# row Series taken from a slice is not reliably written back. If 'Other' is
# not among the top 15, the assignment adds it as a new row.
biggest_news_sources = source_frequency[:15].copy()
other = source_frequency[15:]
other_total = other.sum().item()
current_other_row_total = biggest_news_sources['total'].get('Other', 0)
biggest_news_sources.loc['Other', 'total'] = other_total + current_other_row_total
biggest_news_sources = biggest_news_sources.sort_values(by='total', ascending=False)
data = biggest_news_sources['total']
output_filepath = '3_final_figures/misinformation_ofcom/news_sources_figures/all_sources_piechart.png'
title = "Percieved Most Important News Sources During the Pandemic (All Sources)"
# Not used by plot_pie_chart (it builds its own offsets); retained in case later code reads it.
explosion = np.zeros(len(biggest_news_sources['total']))
explosion[0] = 0.1
plot_pie_chart(
    data=data,
    labels=biggest_news_sources.index,
    output_file=output_filepath,
    title=title,
    colorscheme='viridis',
    explode_index=0,
    figsize=(16, 12))
# plt.figure(figsize=(16, 12))

# viridis_colors = plt.cm.viridis(np.linspace(0, 0.95, 16))

# explosion = np.zeros(len(biggest_news_sources['total']))

# explosion[0] = 0.1

# _, _, autotexts = plt.pie(
#     biggest_news_sources['total'], 
#     labels=biggest_news_sources.index, 
#     colors=viridis_colors,
#     autopct='%1.1f%%',
#     startangle=140,
#     labeldistance=1.1,
#     explode = explosion
# )

# for autotext in autotexts:
#     autotext.set_color('white')

# plt.title("Percieved Most Important News Sources During the Pandemic (All Sources)", fontsize=14)

# plt.tight_layout()

# plt.savefig(f'3_final_figures/misinformation_ofcom/news_sources_figures/all_sources_piechart.png', dpi=300)

# plt.show()

# plt.close('all')
## Pie Chart for grouped source frequencies
# NET roll-ups that are left out of the grouped chart.
excluded_net_groups = ['NET: Offline', 'NET: Closed groups', 'NET: WhatsApp (WhatsApp and WhatsApp groups)']

# Total respondents per NET (grouped) source, summed over every survey wave, largest first.
net_totals = most_important_source_dataframe.copy()[['response', 'total']].groupby('response').sum()
net_totals = net_totals[net_totals.index.str.contains('NET:')]
net_totals = net_totals.sort_values(by='total', ascending=False)
net_information_sources_most_important = net_totals[~net_totals.index.isin(excluded_net_groups)]

data = net_information_sources_most_important['total']
output_file = '3_final_figures/misinformation_ofcom/news_sources_figures/net_most_important_piechart.png'
title = "Percieved Most Important News Sources During the Pandemic (Grouped Sources)"
plot_pie_chart(
    data=data,
    labels=net_information_sources_most_important.index,
    output_file=output_file,
    title=title,
    colorscheme='viridis',
    explode_index=0,
    figsize=(16, 12),
)
# plt.figure(figsize=(16, 12))

# viridis_colors = plt.cm.viridis(np.linspace(0, 1, len(net_information_sources_most_important['total'])))

# explosion = np.zeros(len(net_information_sources_most_important['total']))

# explosion[0] = 0.1

# _, _, autotexts = plt.pie(
#     net_information_sources_most_important['total'], 
#     labels=net_information_sources_most_important.index, 
#     colors=viridis_colors,
#     autopct='%1.1f%%',
#     startangle=140,
#     labeldistance=1.1,
#     explode = explosion
# )

# for autotext in autotexts:
#     autotext.set_color('white')

# plt.title("Percieved Most Important News Sources During the Pandemic (Grouped Sources)", fontsize=14)

# plt.tight_layout()

# plt.savefig(f'3_final_figures/misinformation_ofcom/news_sources_figures/net_most_important_piechart.png', dpi=300)

# plt.show()

# plt.close('all')
## Time series graphs of change in grouped information sources over time
# Convert totals to percentages of the weighted base, then drop the base rows and the listed NET roll-up.
most_important_source_dataframe = calculate_percentage(most_important_source_dataframe)
rows_to_exclude = ['Unweighted base', 'NET: At least once a day', 'Weighted base']
most_important_source_dataframe = most_important_source_dataframe.loc[
    ~most_important_source_dataframe['response'].isin(rows_to_exclude)
]
## Local News information sources
local_news_dataframe = filter_dataframe(most_important_source_dataframe, local_news)
min_date, max_date = return_min_max_date(local_news_dataframe)
plot_time_series(
    dataframe_1=local_news_dataframe,
    dataframe_2=df_no_outliers,
    categories=local_news,
    filename=root_image_path + 'most_important_local_news_time_series',
    plot_title='Time Series of Respondants Considering Each Local News Source as The Most Important and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents (%)',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents considering Local News Source Most Important',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## Newspaper information sources
newspapers_dataframe = filter_dataframe(most_important_source_dataframe, newspapers)
min_date, max_date = return_min_max_date(newspapers_dataframe)
plot_time_series(
    dataframe_1=newspapers_dataframe,
    dataframe_2=df_no_outliers,
    categories=newspapers,
    filename=root_image_path + 'most_important_newspaper_time_series',
    plot_title='Time Series of Respondants Considering Newspaper Source as The Most Important and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents (%)',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents considering Newspaper Source Most Important',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## Official information sources
official_sources_dataframe = filter_dataframe(most_important_source_dataframe, official_sources)
min_date, max_date = return_min_max_date(official_sources_dataframe)
plot_time_series(
    dataframe_1=official_sources_dataframe,
    dataframe_2=df_no_outliers,
    categories=official_sources,
    filename=root_image_path + 'most_important_official_source_time_series',
    plot_title='Time Series of Respondants Considering Official Source as The Most Important and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents (%)',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents considering Official Source Most Important',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## Television Information Sources
television_sources_dataframe = filter_dataframe(most_important_source_dataframe, television)
min_date, max_date = return_min_max_date(television_sources_dataframe)
# The television figure gets its own file name so it does not overwrite the
# official-sources figure saved just before it.
plot_time_series(dataframe_1=television_sources_dataframe, dataframe_2=df_no_outliers, categories=television,
                 plot_title='Time Series of Respondants Considering Television Source as The Most Important and COVID Cases',
                 x_title='Date', y_title='Percentage of Total Survey Respondents (%)',
                 y2_title='Number of COVID Cases Per Day',
                 legend_1_title='Percentage of Respondents considering Television Source Most Important',
                 legend_2_title='Number of reported COVID Cases Per Day',
                 filename=f'{root_image_path}most_important_television_source_time_series', min_date=min_date,
                 max_date=max_date)
## Social Media
social_media_sources_dataframe = filter_dataframe(most_important_source_dataframe, social_media)
min_date, max_date = return_min_max_date(social_media_sources_dataframe)
# The title names social media, matching the data, legend and file name of this figure.
plot_time_series(dataframe_1=social_media_sources_dataframe, dataframe_2=df_no_outliers, categories=social_media,
                 plot_title='Time Series of Respondants Considering Social Media Source as The Most Important and COVID Cases',
                 x_title='Date', y_title='Percentage of Total Survey Respondents (%)',
                 y2_title='Number of COVID Cases Per Day',
                 legend_1_title='Percentage of Respondents considering Social Media Source Most Important',
                 legend_2_title='Number of reported COVID Cases Per Day',
                 filename=f'{root_image_path}most_important_social_media_source_time_series', min_date=min_date,
                 max_date=max_date)
## 6. Trust in the news
trust_in_source_dataframe = calculate_percentage(
    ofcom_dataframes['trust_in_the_sources_for_informationnews_about_coronavirus_used_in_last_week']
)
# Base rows, NET roll-ups and summary statistics are not plotted.
non_answer_rows = ['Unweighted base', 'Weighted base', 'NET: Trust', 'NET: Do not trust', 'Mean',
                   'Standard deviation Standard error']
trust_in_source_dataframe = trust_in_source_dataframe.loc[
    ~trust_in_source_dataframe['response'].isin(non_answer_rows)
]
min_date, max_date = return_min_max_date(trust_in_source_dataframe)
categories = trust_in_source_dataframe['response'].unique()
plot_time_series(
    dataframe_1=trust_in_source_dataframe,
    dataframe_2=df_no_outliers,
    categories=categories,
    filename=root_image_path + 'trust_in_news_source_time_series',
    plot_title='Time Series of Trust in News Sources and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents (%)',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents and their Trust Rating',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## 7. Frequency of Exposure to misinformation 
### Exposure to misinformation over the pandemic
exposure_fake_news = calculate_percentage(
    ofcom_dataframes[
        'whether_came_across_informationnews_about_coronavirus_that_you_think_has_been_false_or_misleading_in_last_week']
)
# Base rows are not answers, so they are left out of the plot.
is_base_row = exposure_fake_news['response'].isin(['Unweighted base', 'Weighted base'])
exposure_fake_news = exposure_fake_news[~is_base_row]
categories = exposure_fake_news['response'].unique()
min_date, max_date = return_min_max_date(exposure_fake_news)
plot_time_series(
    dataframe_1=exposure_fake_news,
    dataframe_2=df_no_outliers,
    categories=categories,
    filename=root_image_path + 'exposure_to_misinformation_time_series',
    plot_title='Time Series of Exposure to Misinformation and COVID Cases',
    x_title='Date',
    y_title='Percentage of Total Survey Respondents (%)',
    y2_title='Number of COVID Cases Per Day',
    legend_1_title='Percentage of Respondents Who Have Been Exposed',
    legend_2_title='Number of reported COVID Cases Per Day',
    min_date=min_date,
    max_date=max_date,
)
## 8. How misinformation has been reported
# Bare lookup left over from the notebook (it displayed the table there); in a
# script its only effect is a KeyError if the table was not loaded.
ofcom_dataframes[
    'how_theory_that_the_origin_or_cause_of_coronavirus_is_in_some_way_linked_to_5g_technology_has_been_reported'
]

# Keys of the "how ... has been reported" survey tables to chart.
fake_news = [
    'how_claims_that_the_coronavirus_vaccine_is_a_cover_for_a_plan_to_implant_trackable_microchips_in_people_have_been_reported',
    'how_claims_that_the_coronavirus_vaccine_may_reduce_fertility_have_been_reported',
    'how_theory_that_the_origin_or_cause_of_coronavirus_is_in_some_way_linked_to_5g_technology_has_been_reported',
    'how_claims_about_injecting_disinfectant_have_been_reported',
    'how_claims_about_empty_hospitals_on_social_media_posts_prove_that_coronavirus_has_been_exaggerated_have_been_reported',
    'how_claims_about_the_coronavirus_test_which_shows_if_you_currently_have_the_virus_does_not_work_and_93_of_tests_produce_a_false_positive_have_been_reported',
    'how_claims_stating_that_the_flu_alone_is_killing_more_people_than_coronavirus_have_been_reported',
    'how_claims_about_the_potential_dangers_of_a_coronavirus_vaccine_have_been_reported',
    'how_claims_about_face_maskscoverings_offering_no_protection_or_being_harmful_have_been_reported',
]

# Demographic breakdowns: breakdown name -> dataframe columns belonging to it.
gender_columns = ['male', 'female']
age_columns = ['16-24', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']
class_columns = [
    'class_upper_and_middle',
    'class_lower_middle',
    'class_skilled_working',
    'class_working_class_lowest_grade',
]
region_columns = [
    'scotland',
    'north_east',
    'north_west',
    'yorkshire_&_humberside',
    'west_midlands',
    'east_midlands',
    'wales',
    'eastern',
    'london',
    'south_east',
    'south_west',
    'northern_ireland',
]
categories = {
    'gender': gender_columns,
    'age': age_columns,
    'class': class_columns,
    'region': region_columns,
}
# For every misinformation table and every demographic breakdown, save one
# stacked bar chart: one bar per demographic column, one stacked segment per
# response.
for news in fake_news:
    misinformation_dataframe = ofcom_dataframes[news]

    for category, category_list in categories.items():
        # A table may lack some of the demographic columns; report and skip
        # that chart. Only this column selection is guarded, so a KeyError
        # raised anywhere else is not silently swallowed.
        try:
            copy_df = misinformation_dataframe[category_list + ['response', 'start_date']]
        except KeyError:
            print(f"Error when plotting {news} {category} graph")
            continue

        formatted_labels = [c.title().replace('_', ' ') for c in category_list]
        groups = tuple(category_list)

        # Sum each demographic column per response across all rows, then drop
        # the base rows, which are not answers.
        aggregated_df = copy_df.groupby(['response'])[category_list].sum().reset_index()
        aggregated_df = aggregated_df.loc[~aggregated_df['response'].isin(['Weighted base', 'Unweighted base'])]

        # response -> array of counts, ordered like `groups`.
        response_dict = dict(zip(aggregated_df['response'], aggregated_df[category_list].to_numpy()))

        width = 0.5

        fig, ax = plt.subplots(figsize=(12, 8))
        bottom = np.zeros(len(category_list))

        cmap = plt.get_cmap("viridis_r")

        # Keep the three-colour spacing for up to three responses, and extend
        # the palette when a table has more, so there is always a colour for
        # every stacked segment.
        positions = np.linspace(0, 1, max(3, len(response_dict)))
        colours = [cmap(pos) for pos in positions]

        for colour, (response, response_count) in zip(colours, response_dict.items()):
            ax.bar(groups, response_count, width, label=response, bottom=bottom, color=colour)
            # The next response is stacked on top of the ones drawn so far.
            bottom += response_count

        ax.set_title(f"{news.replace('_', ' ').title()}")

        ax.legend(loc="upper right")

        plt.xticks(ticks=range(len(groups)), labels=formatted_labels, rotation=90)

        plt.ylabel("Number of Respondents")

        plt.xlabel(category.title())

        plt.savefig(f'3_final_figures/misinformation_ofcom/how_false_news_has_been_reported/{news}_{category}.png',
                    dpi=300)

        plt.close('all')

## 9. Exposure to specific false or misleading recommendations/claims
def _tidy_percentage_label(column_name, prefix=''):
    """Turn a ``<name>_percentage`` column name into a tick label.

    Underscores become spaces and the text is title-cased; the trailing
    `` Percentage`` (including its leading space) and, when given, the leading
    ``prefix`` are then removed.
    """
    label = column_name.replace('_', ' ').title()
    suffix = ' Percentage'
    if label.endswith(suffix):
        label = label[:-len(suffix)]
    if prefix and label.startswith(prefix):
        label = label[len(prefix):]
    return label


def _plot_exposure_heatmap(percentages, title, xlabel, output_file, label_prefix='', rotation=None):
    """Draw an annotated heatmap of ``percentages`` and save it to ``output_file``.

    ``label_prefix`` is stripped from the x tick labels; ``rotation`` is applied
    to them only when it is not None. Returns the heatmap's axes.
    """
    plt.figure(figsize=(15, 12))
    ax = sns.heatmap(percentages, annot=True, cmap='viridis_r', fmt='.2f', cbar=True,
                     cbar_kws={'label': 'Percentage Total Respondents Reporting Exposure'})

    new_labels = [_tidy_percentage_label(label.get_text(), label_prefix) for label in ax.get_xticklabels()]
    if rotation is None:
        ax.set_xticklabels(new_labels)
    else:
        ax.set_xticklabels(new_labels, rotation=rotation)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Misleading Claim")

    plt.tight_layout()

    plt.savefig(output_file, dpi=300)

    plt.close('all')
    return ax


exposure_dataframe = ofcom_dataframes[
    'whether_came_across_any_of_these_false_or_misleading_recommendations_about_avoiding_the_coronavirus_in_the_last_week']

# Demographic columns for which a percentage of the weighted base is computed.
columns_to_calculate = [
    'male',
    'female',
    'class_upper_and_middle',
    'class_lower_middle',
    'class_skilled_working',
    'class_working_class_lowest_grade'
]

# Sum every row of the same response together, indexed by response.
aggregated_df = exposure_dataframe[['response', 'total'] + columns_to_calculate].groupby('response').sum()

# Express each response's count as a percentage of that column's weighted base.
for column in columns_to_calculate:
    aggregated_df[f'{column}_percentage'] = round(
        aggregated_df[column] / aggregated_df.loc['Weighted base', column] * 100, 2)

# Base and NET rows are not individual claims, so they are left off the heatmaps.
aggregated_df = aggregated_df.drop(index=['Unweighted base', 'Weighted base', 'NET: Any'])
aggregated_df = aggregated_df.rename(index={
    'Increasing use of natural remedies such as colloidal silver, essential oils, garlic, MMS (chlorine dioxide) or vitamin C': 'Increasing use of natural remedies e.g. colloidal silver'})

# Keep the existing order but move 'Other' and 'None of these' to the bottom.
index_order = [i for i in aggregated_df.index if i not in ['Other', 'None of these']] + ['Other', 'None of these']
aggregated_df = aggregated_df.reindex(index_order)

gender_percentage_misleading_claims = aggregated_df[['male_percentage', 'female_percentage']]
ax = _plot_exposure_heatmap(
    gender_percentage_misleading_claims,
    title="Heatmap of Exposure to Misleading Claims by Gender",
    xlabel="Gender",
    output_file='3_final_figures/misinformation_ofcom/exposure_to_misinformation/exposure_by_gender.png')

class_percentage_misleading_claims = aggregated_df[
    ['class_upper_and_middle_percentage', 'class_lower_middle_percentage', 'class_skilled_working_percentage',
     'class_working_class_lowest_grade_percentage']]
ax = _plot_exposure_heatmap(
    class_percentage_misleading_claims,
    title="Heatmap of Percentage Exposure of Survey Respondents to Misleading Claims by Social Class",
    xlabel="Social Class",
    output_file='3_final_figures/misinformation_ofcom/exposure_to_misinformation/exposure_by_class.png',
    label_prefix='Class ',
    rotation=90)
## 10. Device usage 
# Total respondents per device, largest first.
device_used_to_connect = ofcom_dataframes['device_usage_to_connect_to_internet']
device_used_to_connect = device_used_to_connect[['response', 'total']]
device_used_to_connect = device_used_to_connect.groupby('response').sum()
device_used_to_connect = device_used_to_connect.sort_values(by='total', ascending=False)

# Split into the 13 largest rows, the smallest row (kept as its own slice),
# and everything in between, which is folded into the existing "Other" row.
# The top rows are copied so the update below lands on this frame rather than
# on a slice of `device_used_to_connect`.
biggest_used_devices = device_used_to_connect.iloc[:13].copy()
other = device_used_to_connect.iloc[13:-1]
final_row = device_used_to_connect.iloc[-1:]
other_total = other.sum().item()

other_row_label = 'Other portable/ handheld device (e.g. portable games console/ iPod Touch)'
current_other_row = biggest_used_devices.loc[other_row_label]
current_other_row_total = current_other_row.iloc[0].item()

# Assign through .loc[row, column] on the frame itself; writing to the
# extracted row Series is not guaranteed to update the frame.
biggest_used_devices.loc[other_row_label, 'total'] = other_total + current_other_row_total

biggest_used_devices = pd.concat([biggest_used_devices, final_row], ignore_index=False)
biggest_used_devices = biggest_used_devices.sort_values(by='total', ascending=False)
data = biggest_used_devices['total']
labels = biggest_used_devices.index
# Bare expression kept from the original; it only shows output where the value
# of an expression is echoed (e.g. an interactive session).
biggest_used_devices
title = "Pie Chart Showing the Devices Used by Respondents to Access the Internet."
output_file = "3_final_figures/misinformation_ofcom/device_usage/piechart_devices_used_all_respondents"
plot_pie_chart(data=data, labels=labels, output_file=output_file, title=title, colorscheme='viridis', explode_index=0,
               figsize=(16, 12))
