# OFCOM Survey Data Analysis & Visualisation

## 1. Importing necessary Libraries & Modules

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np
import datetime as dt
import squarify
import geopandas as gpd
import plotly.graph_objects as go
from wordcloud import WordCloud

## 2. Loading the OFCOM survey data, Regional COVID Cases, Mortality Data and Vaccination Data and UK cases/mortality data 
### Loading the OFCOM data:

In [2]:
path_for_datasets = '1_data_cleaning_preprocessing/2_cleaned_files/ofcom_survey_data'

directory_path = Path(path_for_datasets)

file_list = [f.name for f in directory_path.iterdir() if f.is_file()]

# Only CSV files are loaded. The names and the paths are derived from the SAME
# filtered list so a stray non-CSV file in the folder can no longer shift the
# dictionary keys onto the wrong dataframe. Sorting makes the load order
# independent of the order in which the file system lists the directory.
csv_file_names = sorted(name for name in file_list if name.endswith('.csv'))

file_paths = [f'{path_for_datasets}/{name}' for name in csv_file_names]

# Maps each survey question (file name minus its last 12 characters) to its
# dataframe. NOTE(review): the 12 stripped characters are presumably the
# '_cleaned.csv' suffix - confirm against the files on disk.
ofcom_dataframes = {}

for file_name, file_path in zip(csv_file_names, file_paths):

    dataframe = pd.read_csv(file_path)

    # Unparseable dates become NaT (errors='coerce'); the time part is dropped
    # so the columns hold plain date values.
    for date_column in ('start_date', 'end_date'):
        dataframe[date_column] = pd.to_datetime(dataframe[date_column], errors='coerce').dt.date

    ofcom_dataframes[file_name[:-12]] = dataframe

In [3]:
# Re-normalise the survey window columns of `dataframe` (the frame most recently
# assigned by the loading loop): parse to datetimes, coercing bad values to NaT,
# then keep only the date part.
dataframe['start_date'] = pd.to_datetime(dataframe['start_date'], errors='coerce').dt.date
dataframe['end_date'] = pd.to_datetime(dataframe['end_date'], errors='coerce').dt.date

### Loading the UKHSA regional information for mortality, vaccination and cases

In [4]:
ukhsa_mortality_dataframe = pd.read_csv('1_data_cleaning_preprocessing/2_cleaned_files/ukhsa_mortality_data/uk_regional_covid_mortality_cleaned.csv')

In [5]:
ukhsa_cases_dataframe = pd.read_csv('1_data_cleaning_preprocessing/2_cleaned_files/ukhsa_cases_data/ukhsa_cases_data_cleaned.csv')

In [6]:
ukhsa_vaccination_dataframe = pd.read_csv('1_data_cleaning_preprocessing/2_cleaned_files/ukhsa_vaccination_data/ukhsa_vaccination_data_cleaned.csv')

In [7]:
ukhsa_mortality_dataframe_by_date = ukhsa_mortality_dataframe.copy()

In [8]:
ukhsa_mortality_dataframe_by_date = ukhsa_mortality_dataframe_by_date.groupby('date')['metric_value'].sum()

In [9]:
ukhsa_cases_data_by_date = ukhsa_cases_dataframe.copy()

In [10]:
ukhsa_cases_data_by_date= ukhsa_cases_data_by_date.groupby('date')['new_cases_by_specimen_date'].sum()

### Loading the UK COVID API data 

In [11]:
covid_api_uk_dataframe = pd.read_csv('1_data_cleaning_preprocessing/1_api_connectors_and_csv_parsers/2_processed_databases/covid_19_api_data/17_03_2020_29_11_2024_GBR.csv')

In [12]:
covid_api_uk_dataframe = covid_api_uk_dataframe[covid_api_uk_dataframe['province'].isin(['United Kingdom', 'England', 'Scotland', 'Wales'])]

# 3. Frequency of getting information and news on the pandemic
## I first get the corresponding dataframe

In [13]:
dataframe = ofcom_dataframes['frequency_of_getting_infonews_about_coronavirus_outbreak_in_last_week']

## I ensure that the dates are appropriately formatted

In [14]:
# Parse both survey window columns (invalid values become NaT) and keep only the
# date part so they compare cleanly with the date values used further down.
dataframe['start_date'] = pd.to_datetime(dataframe['start_date'], errors='coerce').dt.date
dataframe['end_date'] = pd.to_datetime(dataframe['end_date'], errors='coerce').dt.date

## I want to plot the percentage of respondents over time, so I need a dataframe containing the unique dates and their weighted bases, which I then use to calculate each response's percentage of respondents

In [15]:
weighted_bases = dataframe[dataframe['response'] == 'Weighted base'][['start_date', 'total']].reset_index(drop=True)

In [16]:
# Build the start_date -> weighted base lookup once. Previously the dictionary
# was rebuilt inside get_total for every single row being mapped.
weighted_base_by_date = weighted_bases.set_index('start_date')['total'].to_dict()


def get_total(start_date):
    """Return the weighted base recorded for ``start_date``.

    Returns None when no 'Weighted base' row exists for that date.
    """
    return weighted_base_by_date.get(start_date)


dataframe['weighted_base'] = dataframe['start_date'].map(get_total)

In [17]:
dataframe['percentage_total_respondents'] = round((dataframe['total']/dataframe['weighted_base']) * 100, 2)

In [18]:
dataframe = dataframe[['start_date', 'response', 'percentage_total_respondents']]

In [19]:
dataframe = dataframe[~dataframe['response'].isin(['Unweighted base', 'NET: At least once a day', 'Weighted base'])]

## I then get the comparative data from the COVID-19 API to plot on the graph

In [20]:
start_date = min(dataframe['start_date'])

In [21]:
end_date = max(dataframe['start_date'])

In [22]:
cases_dataframe = covid_api_uk_dataframe.copy()

In [23]:
# Parse the reporting date (invalid values become NaT) and drop the time part so
# it can be compared with the survey start dates.
cases_dataframe['date'] = pd.to_datetime(cases_dataframe['date'], errors='coerce').dt.date

In [24]:
cases_dataframe = cases_dataframe[(cases_dataframe['date'] >= start_date) & (cases_dataframe['date'] <= end_date)]

In [25]:
cases_dataframe = cases_dataframe[cases_dataframe['province'] == 'England']

In [26]:
cases_dataframe = cases_dataframe[['date','confirmed_diff']]

In [27]:
lower_percentile = cases_dataframe['confirmed_diff'].quantile(0.01)
upper_percentile = cases_dataframe['confirmed_diff'].quantile(0.99)

In [28]:
# Forward-fill gaps, then keep only the days whose case count lies between the
# 1st and 99th percentiles computed above (both bounds inclusive).
cases_dataframe = cases_dataframe.ffill()
df_no_outliers = cases_dataframe[
    cases_dataframe['confirmed_diff'].between(lower_percentile, upper_percentile)
]

### Finally, I plot the graph:

In [29]:
fig, ax1 = plt.subplots(figsize=(15, 10))

# One line per survey response, coloured along the viridis scale.
responses = dataframe['response'].unique()
colours = plt.cm.viridis(np.linspace(0, 1, len(responses)))

for i, category in enumerate(responses):
    category_data = dataframe[dataframe['response'] == category]
    ax1.plot(category_data['start_date'], category_data['percentage_total_respondents'], 
             label=category, linestyle='-', color=colours[i])

ax1.set_xlabel('Date')
ax1.set_ylabel('Percentage of Total Survey Respondents')

ax1.xaxis.set_major_locator(mdates.MonthLocator())
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

ax1.set_xlim([dt.datetime(2020, 3, 27), dt.datetime(2021, 6, 4)])

# Second y-axis sharing the same dates: daily case counts.
ax2 = ax1.twinx()
ax2.plot(df_no_outliers['date'], df_no_outliers['confirmed_diff'], label='COVID Cases', linestyle='--', color='black')
ax2.set_ylabel('Number of COVID Cases Per Day')

ax1.legend(title='COVID News Search Frequency:', loc='upper left')
ax2.legend(title='Number of reported COVID Cases Per Day', loc='upper right')

ax1.grid(False)
ax2.grid(False)

for spine in ax1.spines.values():
    spine.set_visible(True)
    spine.set_color('black')
    
ax1.set_facecolor('white')
ax2.set_facecolor('white')

lockdown_periods = [
    ('2020-03-27', '2020-05-10', '1st Lockdown'),
    ('2020-11-05', '2020-12-02', '2nd Lockdown'),
    ('2021-01-06', '2021-03-08', '3rd Lockdown'),
]

# The loop variables are deliberately NOT called start_date / end_date: those
# notebook-level names hold the survey date range used to filter the cases
# data, and rebinding them to strings here broke any re-run of that filter.
for lockdown_start, lockdown_end, label in lockdown_periods:
    ax1.axvspan(
        dt.datetime.strptime(lockdown_start, '%Y-%m-%d'), 
        dt.datetime.strptime(lockdown_end, '%Y-%m-%d'), 
        color='gray', alpha=0.3, label=label
    )

# Explanatory note for the shaded spans; the spans are drawn after the legends
# were created, so they do not appear in either legend.
plt.text(
    dt.datetime(2021, 4, 17), 51000,
    'Shaded areas\nrepresent lockdown\nperiods',
    fontsize=10,
    color='black',
    ha='center',
    va='center',
    bbox=dict(
        facecolor='white',
        edgecolor='black',
        boxstyle='round,pad=0.5', 
        alpha=0.3
    )
)

plt.title('Time Series of COVID News Search Frequency and Number of COVID Cases')

plt.tight_layout()

plt.savefig('3_final_figures/misinformation_ofcom/frequency_of_sourcing_news/misinformation_frequency_of_news_covid_timeseries.png', dpi=300)

plt.close('all')

# 4. Sources used to get information about the coronavirus outbreak

In [26]:
sources_used_dataframe = ofcom_dataframes['sources_used_to_get_infonews_about_coronavirus_outbreak_in_last_week']

### As I will be plotting many time series for the graph, I made a function to return the weighted percentages:

In [27]:
def calculate_percentage(dataframe):
    """Express each response's total as a percentage of that wave's weighted base.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns 'start_date', 'response' and 'total'. The rows whose
        'response' is 'Weighted base' supply the denominator for every row that
        shares their 'start_date'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'start_date', 'response' and
        'percentage_total_respondents' (rounded to 2 decimal places). Rows whose
        start date has no 'Weighted base' row get NaN.

    Notes
    -----
    The input frame is left untouched; earlier versions added 'weighted_base'
    and 'percentage_total_respondents' columns to the caller's frame as a side
    effect. If a start date has several 'Weighted base' rows, the last one is
    used.
    """
    weighted_rows = dataframe[dataframe['response'] == 'Weighted base']

    # Built once per call rather than once per row being looked up.
    base_by_date = weighted_rows.set_index('start_date')['total'].to_dict()

    weighted_base = dataframe['start_date'].map(base_by_date)

    percentages = ((dataframe['total'] / weighted_base) * 100).round(2)

    result = dataframe[['start_date', 'response']].copy()
    result['percentage_total_respondents'] = percentages

    return result

In [28]:
sources_used_dataframe = calculate_percentage(sources_used_dataframe)

In [29]:
# sources_used_dataframe

In [30]:
sources_used_dataframe = sources_used_dataframe[~sources_used_dataframe['response'].isin(['Unweighted base', 'NET: At least once a day', 'Weighted base'])]

### From the response categories (uncomment) I have selected a few broad categories of interest for different plots

In [31]:
# sources_used_dataframe['response'].unique()

In [32]:
local_news = [
    "Family and friends directly",
    "Community radio",
    "People in your local area/neighbourhood"
]

In [33]:
newspapers = [
        '“Red-top tabloids” such as The Sun or Daily Mirror ( printed )',
       '“Red-top tabloids” such as The Sun or Mirror ( online )',
       '“Broadsheets” such as The Times or Guardian',
       '“Broadsheets” such as The Times or Guardian ( online )',
       '“Mid-market tabloids” such as The Daily Mail or Daily Express ( printed )',
       '“Mid-market tabloids” such as MailOnline or Express ( online )'
]

#### After experimenting with the plotting function, I noticed that the response label changed from "Direct from Government website/ email/ text/ post" to "Direct from UK Government website/ email/ text/ post". I therefore combined the two groups

In [34]:
official_sources = [
    "Direct from Local health service website/ email/ text/ post",
    "Direct from NHS website/ email/ text/ post",
    "Direct from World Health Organisation (WHO) website/ email/ text/ post",
    "Direct from UK Government website/ email/ text/ post",
    "Direct from Local council website/ email/ text/ post",
    "Official scientists"
]

In [35]:
# Fold the earlier "Government" wording into the later "UK Government" wording so
# both are plotted as one series. A new frame is built instead of assigning through
# .loc, because sources_used_dataframe is a filtered slice and writing into it
# triggers pandas' SettingWithCopyWarning.
sources_used_dataframe = sources_used_dataframe.assign(
    response=sources_used_dataframe['response'].replace(
        "Direct from Government website/ email/ text/ post",
        "Direct from UK Government website/ email/ text/ post",
    )
)

In [36]:
television = [
    'BBC - TV',
    'Channel 4',
    'Channel 5',
    'ITV'
]

In [37]:
social_media = [
    "NET: Social Media",
    "NET: Facebook (Facebook and Facebook Messenger)",
    "NET: WhatsApp (WhatsApp and WhatsApp groups)",
    "Instagram",
    "Twitter",
    "YouTube",
    "Snapchat",
]

## I created a function to plot similar time series graphs, to save time and to help decide which plots are most relevant

In [38]:
def plot_time_series(dataframe_1, dataframe_2, categories, filename, plot_title, x_title, y_title, y2_title, legend_1_title, legend_2_title, min_date, max_date):
    """Plot survey percentages per response against daily COVID cases and save as PNG.

    Parameters
    ----------
    dataframe_1 : pandas.DataFrame
        Survey data with the columns 'response', 'start_date' and
        'percentage_total_respondents'.
    dataframe_2 : pandas.DataFrame
        Case data with the columns 'date' and 'confirmed_diff'.
    categories : sequence of str
        Response labels to draw, one line each, in this order.
    filename : str
        Output path without extension; '.png' is appended.
    plot_title, x_title, y_title : str
        Figure title and the labels of the x-axis and the left y-axis.
    y2_title : str
        Label of the right (cases) y-axis; also used as the legend entry of the
        cases line.
    legend_1_title, legend_2_title : str
        Titles of the survey legend (upper left) and cases legend (upper right).
    min_date, max_date
        Limits of the x-axis.

    Notes
    -----
    The figure is written to disk and every open matplotlib figure is closed;
    nothing is returned.
    """
    fig, ax1 = plt.subplots(figsize=(15, 10))

    # One colour per plotted category. This used to be sized from the notebook's
    # global `dataframe`, which raised IndexError (or gave a mismatched palette)
    # whenever `categories` had a different number of entries.
    colours = plt.cm.viridis_r(np.linspace(0, 1, len(categories)))

    for colour, category in zip(colours, categories):
        category_data = dataframe_1[dataframe_1['response'] == category]
        ax1.plot(category_data['start_date'], category_data['percentage_total_respondents'],
                 label=category, linestyle='-', color=colour)

    ax1.grid(False)

    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color('black')

    ax1.set_facecolor('white')

    ax1.set_xlabel(f'{x_title}')
    ax1.set_ylabel(f'{y_title}')

    # Monthly ticks labelled like "Mar 2020".
    ax1.xaxis.set_major_locator(mdates.MonthLocator())
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    ax1.set_xlim([min_date, max_date])

    # Second y-axis sharing the same dates: daily case counts.
    ax2 = ax1.twinx()

    ax2.grid(False)

    ax2.set_facecolor('white')

    ax2.plot(dataframe_2['date'], dataframe_2['confirmed_diff'], label=f'{y2_title}', linestyle='--', color='black')
    ax2.set_ylabel(f'{y2_title}')

    ax1.legend(title=f'{legend_1_title}', loc='upper left')
    ax2.legend(title=f'{legend_2_title}', loc='upper right')

    lockdown_periods = [
        ('2020-03-27', '2020-05-10', '1st Lockdown'),
        ('2020-11-05', '2020-12-02', '2nd Lockdown'),
        ('2021-01-06', '2021-03-08', '3rd Lockdown'),
    ]

    # Shade the lockdown periods. The spans are added after the legends were
    # created, so they do not appear in either legend.
    for lockdown_start, lockdown_end, label in lockdown_periods:
        ax1.axvspan(
            dt.datetime.strptime(lockdown_start, '%Y-%m-%d'),
            dt.datetime.strptime(lockdown_end, '%Y-%m-%d'),
            color='gray', alpha=0.3, label=label
        )

    # Fixed-position note explaining the shading; placed with pyplot, i.e. on
    # whichever axes pyplot treats as current.
    # NOTE(review): y=51000 presumably refers to the cases axis - confirm.
    plt.text(
        dt.datetime(2021, 4, 17), 51000,
        'Shaded areas\nrepresent lockdown\nperiods',
        fontsize=10,
        color='black',
        ha='center',
        va='center',
        bbox=dict(
            facecolor='white',
            edgecolor='black',
            boxstyle='round,pad=0.5',
            alpha=0.3
        )
    )

    plt.title(f'{plot_title}')
    plt.tight_layout()

    plt.savefig(f'{filename}.png', dpi=300)

    plt.close('all')

#### I set the root for the file path for the news_source_figures

In [39]:
root_image_path = '3_final_figures/misinformation_ofcom/news_sources_figures/'

## I also create two functions to filter the dataframe and return minimum and maximum date

In [40]:
def filter_dataframe(dataframe, category):
    """Return the rows of a copy of ``dataframe`` whose 'response' is in ``category``.

    ``category`` is a collection of response labels; the input frame is not
    modified.
    """
    wanted_rows = dataframe['response'].isin(category)
    return dataframe.copy()[wanted_rows]

In [41]:
def return_min_max_date(dataframe):
    """Return ``(earliest, latest)`` value of the frame's 'start_date' column."""
    start_dates = dataframe['start_date']
    return start_dates.min(), start_dates.max()

## Plotting the newspaper sources dataframe

In [42]:
newspaper_dataframe = filter_dataframe(sources_used_dataframe, newspapers)

In [43]:
min_date, max_date = return_min_max_date(newspaper_dataframe)

In [44]:
plot_time_series(dataframe_1 = newspaper_dataframe, dataframe_2 = df_no_outliers, categories = newspapers, plot_title = 'Time Series of Newspaper Source Usage and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents using Newspaper Source', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}newspaper_sources_time_series', min_date = min_date, max_date = max_date)

## Plotting the official sources data

In [45]:
official_sources_dataframe = filter_dataframe(sources_used_dataframe, official_sources)

In [46]:
min_date, max_date = return_min_max_date(official_sources_dataframe)

In [47]:
plot_time_series(dataframe_1 = official_sources_dataframe, dataframe_2 = df_no_outliers, categories = official_sources, plot_title = 'Time Series of Official Source Usage and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents using Official Source', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}official_sources_time_series', min_date = min_date, max_date = max_date)

## Plotting the television sources data

In [48]:
television_sources_dataframe = filter_dataframe(sources_used_dataframe, television)

In [49]:
min_date, max_date = return_min_max_date(television_sources_dataframe)

In [50]:
plot_time_series(dataframe_1 = television_sources_dataframe, dataframe_2 = df_no_outliers, categories = television, plot_title = 'Time Series of Television News Source Usage and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents using Television News Source', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}television_time_series', min_date = min_date, max_date = max_date)

## Plotting the social media source usage

In [51]:
social_media_dataframe = filter_dataframe(sources_used_dataframe, social_media)

In [52]:
min_date, max_date = return_min_max_date(social_media_dataframe)

In [53]:
plot_time_series(dataframe_1 = social_media_dataframe, dataframe_2 = df_no_outliers, categories = social_media, plot_title = 'Time Series of Social Media News Source Usage and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents using Social Media News Source', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}social_media_time_series', min_date = min_date, max_date = max_date)

## Plotting the local news source usage


In [54]:
local_news_dataframe = filter_dataframe(sources_used_dataframe, local_news)

In [55]:
min_date, max_date = return_min_max_date(local_news_dataframe)

In [56]:
plot_time_series(dataframe_1 = local_news_dataframe, dataframe_2 = df_no_outliers, categories = local_news, plot_title = 'Time Series of Local News Source Usage and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents (%)', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents using Local News Source', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}local_news_time_series', min_date = min_date, max_date = max_date)

In [30]:
## I created a pie chart plotting function which can be called in a similar manner to save time when plotting these graphs and to add consistency to the layout

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [31]:
def plot_pie_chart(data, labels, output_file, title, colorscheme='viridis', explode_index=0, figsize=(16, 12)):
    """Draw a labelled pie chart with one slice pulled out and save it to disk.

    Parameters
    ----------
    data : sequence of numbers
        Slice sizes.
    labels : sequence of str
        One label per slice; also determines how many colours are sampled.
    output_file : str
        Path the figure is saved to (at 300 dpi).
    title : str
        Figure title.
    colorscheme : str, default 'viridis'
        Name of the matplotlib colormap the slice colours are sampled from.
    explode_index : int or None, default 0
        Position of the slice to offset from the centre. Pass None to keep all
        slices in place.
    figsize : tuple, default (16, 12)
        Figure size passed to matplotlib.

    Notes
    -----
    Every open matplotlib figure is closed afterwards; nothing is returned.
    """
    plt.figure(figsize=figsize)

    # Sample up to 0.95 rather than 1.0 of the colormap range.
    colors = plt.colormaps.get_cmap(colorscheme)(np.linspace(0, 0.95, len(labels)))

    # All slices sit at the centre except the highlighted one. Skipped for empty
    # data, where indexing the offsets array would raise IndexError.
    explosion = np.zeros(len(data))
    if explode_index is not None and len(explosion) > 0:
        explosion[explode_index] = 0.1

    _, _, autotexts = plt.pie(
        data,
        labels=labels,
        colors=colors,
        autopct='%1.1f%%',
        startangle=140,
        labeldistance=1.1,
        explode=explosion
    )

    # Percentage labels inside the slices are drawn in white.
    for autotext in autotexts:
        autotext.set_color('white')

    plt.title(title, fontsize=14)

    plt.tight_layout()

    plt.savefig(output_file, dpi=300)

    plt.close('all')

# 5. Most important news source

In [57]:
most_important_source_dataframe = ofcom_dataframes['most_important_source_used_in_last_week']

## Pie chart of all source frequencies

## I plotted the pie chart for the sources described as most important. I ran into some challenges when plotting because there were so many sources, so I experimented with groupings to find the combination which allowed me to see meaningful data without combining too many of the smaller categories:

In [59]:
source_frequency = most_important_source_dataframe.copy()

In [60]:
source_frequency = source_frequency[['response', 'total']]

In [61]:
source_frequency = source_frequency.groupby('response').sum()

In [62]:
source_frequency = source_frequency[~source_frequency.index.str.contains('NET:|Weighted base|Unweighted base')]

In [63]:
source_frequency = source_frequency.sort_values(by='total', ascending = False)

In [64]:
biggest_news_sources = source_frequency[:15]

In [65]:
other = source_frequency[15:]
other_total = other.sum().item()   

In [66]:
current_other_row = biggest_news_sources.loc['Other']

In [67]:
current_other_row_total = current_other_row.iloc[0].item()

In [68]:
# Add the combined total of the sources outside the top 15 to the existing
# 'Other' row. The value is written into biggest_news_sources itself:
# assigning to the row Series returned by .loc['Other'] does not reliably
# update the parent frame, which left the remaining sources out of the pie
# chart. The copy detaches the frame from source_frequency, of which it is a
# slice, so the write is unambiguous.
biggest_news_sources = biggest_news_sources.copy()
biggest_news_sources.loc['Other', 'total'] = other_total + current_other_row_total

In [69]:
biggest_news_sources = biggest_news_sources.sort_values(by='total', ascending = False)

In [70]:
data = biggest_news_sources['total']

In [71]:
output_filepath = f'3_final_figures/misinformation_ofcom/news_sources_figures/all_sources_piechart.png'

In [72]:
title = "Percieved Most Important News Sources During the Pandemic (All Sources)"

In [73]:
explosion = np.zeros(len(biggest_news_sources['total']))
explosion[0] = 0.1

In [74]:
plot_pie_chart(
    data=data, 
    labels = biggest_news_sources.index, 
    output_file = output_filepath, 
    title = title, 
    colorscheme='viridis', 
    explode_index=0, 
    figsize=(16, 12))

## Pie Chart for grouped source frequencies

In [75]:
net_information_sources_most_important = most_important_source_dataframe.copy()

In [76]:
net_information_sources_most_important = net_information_sources_most_important[['response', 'total']]

In [77]:
net_information_sources_most_important = net_information_sources_most_important.groupby('response').sum()

In [78]:
net_information_sources_most_important = net_information_sources_most_important[net_information_sources_most_important.index.str.contains('NET:')]

In [79]:
net_information_sources_most_important = net_information_sources_most_important.sort_values(by='total', ascending = False)

In [80]:
net_information_sources_most_important = net_information_sources_most_important[~net_information_sources_most_important.index.isin(['NET: Offline', 'NET: Closed groups', 'NET: WhatsApp (WhatsApp and WhatsApp groups)'])]

In [81]:
data = net_information_sources_most_important['total']

In [82]:
output_file = '3_final_figures/misinformation_ofcom/news_sources_figures/net_most_important_piechart.png'

In [83]:
title = "Percieved Most Important News Sources During the Pandemic (Grouped Sources)"

In [84]:
plot_pie_chart(
    data=data, 
    labels = net_information_sources_most_important.index, 
    output_file = output_file, 
    title = title, 
    colorscheme='viridis', 
    explode_index=0, 
    figsize=(16, 12))

plt.close('all')

## Time series graphs of change in grouped information sources over time

In [85]:
most_important_source_dataframe = calculate_percentage(most_important_source_dataframe)

In [86]:
most_important_source_dataframe = most_important_source_dataframe[~most_important_source_dataframe['response'].isin(['Unweighted base', 'NET: At least once a day', 'Weighted base'])]

## Local News information sources

In [87]:
local_news_dataframe = filter_dataframe(most_important_source_dataframe, local_news)

In [88]:
min_date, max_date = return_min_max_date(local_news_dataframe)

In [89]:
plot_time_series(dataframe_1 = local_news_dataframe, dataframe_2 = df_no_outliers, categories = local_news, plot_title = 'Time Series of Respondants Considering Each Local News Source as The Most Important and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents (%)', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents considering Local News Source Most Important', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}most_important_local_news_time_series', min_date = min_date, max_date = max_date)

## Newspaper information sources

In [90]:
newspapers_dataframe = filter_dataframe(most_important_source_dataframe, newspapers)

In [91]:
min_date, max_date = return_min_max_date(newspapers_dataframe)

In [92]:
plot_time_series(dataframe_1 = newspapers_dataframe, dataframe_2 = df_no_outliers, categories = newspapers, plot_title = 'Time Series of Respondants Considering Newspaper Source as The Most Important and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents (%)', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents considering Newspaper Source Most Important', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}most_important_newspaper_time_series', min_date = min_date, max_date = max_date)

## Official information sources

In [93]:
official_sources_dataframe = filter_dataframe(most_important_source_dataframe, official_sources)

In [94]:
min_date, max_date = return_min_max_date(official_sources_dataframe)

In [95]:
plot_time_series(dataframe_1 = official_sources_dataframe, dataframe_2 = df_no_outliers, categories = official_sources, plot_title = 'Time Series of Respondants Considering Official Source as The Most Important and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents (%)', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents considering Official Source Most Important', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}most_important_official_source_time_series', min_date = min_date, max_date = max_date)

## Television Information Sources

In [96]:
television_sources_dataframe = filter_dataframe(most_important_source_dataframe, television)

In [97]:
min_date, max_date = return_min_max_date(television_sources_dataframe)

In [98]:
# Saved under its own file name: this call previously reused the official-source
# file name and therefore overwrote that figure.
plot_time_series(
    dataframe_1 = television_sources_dataframe,
    dataframe_2 = df_no_outliers,
    categories = television,
    plot_title = 'Time Series of Respondants Considering Television Source as The Most Important and COVID Cases',
    x_title = 'Date',
    y_title = 'Percentage of Total Survey Respondents (%)',
    y2_title = 'Number of COVID Cases Per Day',
    legend_1_title = 'Percentage of Respondents considering Television Source Most Important',
    legend_2_title = 'Number of reported COVID Cases Per Day',
    filename = f'{root_image_path}most_important_television_source_time_series',
    min_date = min_date,
    max_date = max_date,
)

## Social Media

In [99]:
social_media_sources_dataframe = filter_dataframe(most_important_source_dataframe, social_media)

In [100]:
min_date, max_date = return_min_max_date(social_media_sources_dataframe)

In [101]:
# The title names social media: it previously carried the television title that
# was copied from the cell above.
plot_time_series(
    dataframe_1 = social_media_sources_dataframe,
    dataframe_2 = df_no_outliers,
    categories = social_media,
    plot_title = 'Time Series of Respondants Considering Social Media Source as The Most Important and COVID Cases',
    x_title = 'Date',
    y_title = 'Percentage of Total Survey Respondents (%)',
    y2_title = 'Number of COVID Cases Per Day',
    legend_1_title = 'Percentage of Respondents considering Social Media Source Most Important',
    legend_2_title = 'Number of reported COVID Cases Per Day',
    filename = f'{root_image_path}most_important_social_media_source_time_series',
    min_date = min_date,
    max_date = max_date,
)

## 6. Trust in the news

In [102]:
trust_in_source_dataframe = ofcom_dataframes['trust_in_the_sources_for_informationnews_about_coronavirus_used_in_last_week']

In [103]:
trust_in_source_dataframe = calculate_percentage(trust_in_source_dataframe)

In [104]:
trust_in_source_dataframe = trust_in_source_dataframe[~trust_in_source_dataframe['response'].isin(['Unweighted base', 'Weighted base', 'NET: Trust', 'NET: Do not trust', 'Mean', 'Standard deviation Standard error'])]

In [105]:
min_date, max_date = return_min_max_date(trust_in_source_dataframe)

In [106]:
categories = trust_in_source_dataframe['response'].unique()

In [107]:
plot_time_series(dataframe_1 = trust_in_source_dataframe, dataframe_2 = df_no_outliers, categories = categories, plot_title = 'Time Series of Trust in News Sources and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents (%)', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents and their Trust Rating', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}trust_in_news_source_time_series', min_date = min_date, max_date = max_date)

## 7. Frequency of Exposure to misinformation 
### Exposure to misinformation over the pandemic

In [108]:
exposure_fake_news = ofcom_dataframes['whether_came_across_informationnews_about_coronavirus_that_you_think_has_been_false_or_misleading_in_last_week']

In [109]:
exposure_fake_news = calculate_percentage(exposure_fake_news)

In [110]:
exposure_fake_news = exposure_fake_news[~exposure_fake_news['response'].isin(['Unweighted base', 'Weighted base'])]

In [111]:
categories = exposure_fake_news['response'].unique()

In [112]:
min_date, max_date = return_min_max_date(exposure_fake_news)

In [113]:
plot_time_series(dataframe_1 = exposure_fake_news, dataframe_2 = df_no_outliers, categories = categories, plot_title = 'Time Series of Exposure to Misinformation and COVID Cases', x_title = 'Date', y_title = 'Percentage of Total Survey Respondents (%)', y2_title = 'Number of COVID Cases Per Day', legend_1_title = 'Percentage of Respondents Who Have Been Exposed', legend_2_title = 'Number of reported COVID Cases Per Day', filename = f'{root_image_path}exposure_to_misinformation_time_series', min_date = min_date, max_date = max_date)

## 8. How misinformation has been reported

### I iterated over several of the OFCOM dataframes to produce stacked bar charts showing how different false claims were reported and how this was perceived by gender, age, region and social class, to see whether any patterns emerge.

In [115]:
fake_news = [ 'how_claims_that_the_coronavirus_vaccine_is_a_cover_for_a_plan_to_implant_trackable_microchips_in_people_have_been_reported',
    'how_claims_that_the_coronavirus_vaccine_may_reduce_fertility_have_been_reported',
    'how_theory_that_the_origin_or_cause_of_coronavirus_is_in_some_way_linked_to_5g_technology_has_been_reported',
    'how_claims_about_injecting_disinfectant_have_been_reported',
    'how_claims_about_empty_hospitals_on_social_media_posts_prove_that_coronavirus_has_been_exaggerated_have_been_reported','how_claims_about_the_coronavirus_test_which_shows_if_you_currently_have_the_virus_does_not_work_and_93_of_tests_produce_a_false_positive_have_been_reported',
    'how_claims_stating_that_the_flu_alone_is_killing_more_people_than_coronavirus_have_been_reported',
    'how_claims_about_the_potential_dangers_of_a_coronavirus_vaccine_have_been_reported',
    'how_claims_about_face_maskscoverings_offering_no_protection_or_being_harmful_have_been_reported'
]

In [116]:
categories = {'gender': ['male', 'female'], 'age': ['16-24', '18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
              'class': ['class_upper_and_middle', 'class_lower_middle', 'class_skilled_working',
                        'class_working_class_lowest_grade'],
              'region': ['scotland', 'north_east', 'north_west', 'yorkshire_&_humberside', 'west_midlands',
                         'east_midlands', 'wales', 'eastern', 'london', 'south_east', 'south_west', 'northern_ireland']}

In [117]:
# For every false claim and every demographic breakdown, draw one stacked bar
# chart: one bar per demographic group, one stacked segment per response.
for news in fake_news:
    misinformation_dataframe = ofcom_dataframes[news]

    for category, category_list in categories.items():

        formatted_labels = [c.title().replace('_', ' ') for c in category_list]

        # Some survey tables lack the columns of a breakdown; those combinations
        # are reported and skipped. Only the column selection is guarded so that
        # unrelated errors are not silently swallowed.
        try:
            columns = category_list + ['response', 'start_date']
            copy_df = misinformation_dataframe[columns].copy()
        except KeyError:
            print(f"Error when plotting {news} {category} graph")
            continue

        # Sum every response over all survey waves, then drop the base rows.
        aggregated_df = copy_df.groupby(['response'])[category_list].sum().reset_index()

        aggregated_df = aggregated_df.loc[~aggregated_df['response'].isin(['Weighted base', 'Unweighted base'])]

        groups = tuple(category_list)

        # response -> array with one summed count per demographic group.
        response_dict = {
            response: counts.to_numpy()
            for response, counts in aggregated_df.set_index('response')[category_list].iterrows()
        }

        width = 0.5

        fig, ax = plt.subplots(figsize=(12, 8))
        bottom = np.zeros(len(category_list))

        cmap = plt.get_cmap("viridis_r")

        # At least three colour stops (the original fixed count), and more when
        # a table has more than three responses - a fixed three raised an
        # IndexError in that case.
        positions = np.linspace(0, 1, max(3, len(response_dict)))

        colours = [cmap(pos) for pos in positions]

        for colour, (response, response_count) in zip(colours, response_dict.items()):
            ax.bar(groups, response_count, width, label=response, bottom=bottom, color=colour)
            # Next segment starts on top of everything drawn so far.
            bottom += response_count

        ax.set_title(f"{news.replace('_', ' ').title()}")

        ax.legend(loc="upper right")

        plt.xticks(ticks=range(len(groups)), labels=formatted_labels, rotation=90)

        plt.ylabel("Number of Respondents")

        plt.xlabel(category.title())

        plt.tight_layout()

        plt.savefig(f'3_final_figures/misinformation_ofcom/how_false_news_has_been_reported/{news}_{category}.png', dpi=300)

        plt.close('all')
        

Error when plotting how_claims_about_injecting_disinfectant_have_been_reported region graph
Error when plotting how_claims_about_the_potential_dangers_of_a_coronavirus_vaccine_have_been_reported region graph


## 9. Exposure to specific false or misleading recommendations/claims

In [118]:
exposure_dataframe = ofcom_dataframes['whether_came_across_any_of_these_false_or_misleading_recommendations_about_avoiding_the_coronavirus_in_the_last_week']

In [119]:
aggregated_df = exposure_dataframe[['response','total', 'male', 'female', 'class_upper_and_middle', 'class_lower_middle', 'class_skilled_working', 'class_working_class_lowest_grade']].groupby('response').sum()

In [120]:
# Demographic columns that get a companion '<column>_percentage' column.
columns_to_calculate = [
    'male', 'female',
    'class_upper_and_middle', 'class_lower_middle',
    'class_skilled_working', 'class_working_class_lowest_grade',
]

# Express every response count as a percentage of that column's weighted base.
for column in columns_to_calculate:
    base = aggregated_df.loc['Weighted base', column]
    share = aggregated_df[column] / base * 100
    aggregated_df[f'{column}_percentage'] = share.round(2)

In [121]:
# Remove the summary rows and shorten the longest claim label.
aggregated_df = (
    aggregated_df
    .drop(index=['Unweighted base', 'Weighted base', 'NET: Any'])
    .rename(index={
        'Increasing use of natural remedies such as colloidal silver, essential oils, garlic, MMS (chlorine dioxide) or vitamin C':
            'Increasing use of natural remedies e.g. colloidal silver'
    })
)

In [122]:
# Keep the specific claims in their current order and push the catch-all rows to the end.
index_order = [label for label in aggregated_df.index if label not in ('Other', 'None of these')]
index_order += ['Other', 'None of these']
aggregated_df = aggregated_df.reindex(index_order)

In [123]:
gender_percentage_misleading_claims = aggregated_df[['male_percentage', 'female_percentage']]

In [124]:
# Heatmap of the percentage of each gender reporting exposure to each misleading claim.
plt.figure(figsize=(15, 12))
ax = sns.heatmap(gender_percentage_misleading_claims, annot=True, cmap='viridis_r', fmt='.2f', cbar=True, cbar_kws={'label': 'Percentage Total Respondents Reporting Exposure'})

x_labels = ax.get_xticklabels()

# Strip the '_percentage' suffix by name rather than by a fixed-length slice,
# so the tick labels carry no trailing space and do not depend on suffix length.
new_labels = []
for label in x_labels:
    text = label.get_text()
    if text.endswith('_percentage'):
        text = text[:-len('_percentage')]
    new_labels.append(text.replace('_', ' ').title())
ax.set_xticklabels(new_labels)

plt.title("Heatmap of Exposure to Misleading Claims by Gender")
plt.xlabel("Gender")
plt.ylabel("Misleading Claim")

plt.tight_layout()

plt.savefig('3_final_figures/misinformation_ofcom/exposure_to_misinformation/exposure_by_gender.png', dpi=300)

plt.close('all')

In [125]:
class_percentage_misleading_claims = aggregated_df[['class_upper_and_middle_percentage', 'class_lower_middle_percentage', 'class_skilled_working_percentage', 'class_working_class_lowest_grade_percentage']]

In [126]:
# Heatmap of the percentage of each social class reporting exposure to each misleading claim.
plt.figure(figsize=(15, 12))
ax = sns.heatmap(class_percentage_misleading_claims, annot=True, cmap='viridis_r', fmt='.2f', cbar=True, cbar_kws={'label': 'Percentage Total Respondents Reporting Exposure'})

x_labels = ax.get_xticklabels()

# Remove the 'class_' prefix and '_percentage' suffix by name rather than by
# fixed-length slices, so the tick labels carry no trailing space.
new_labels = []
for label in x_labels:
    text = label.get_text()
    if text.endswith('_percentage'):
        text = text[:-len('_percentage')]
    if text.startswith('class_'):
        text = text[len('class_'):]
    new_labels.append(text.replace('_', ' ').title())
ax.set_xticklabels(new_labels, rotation=90)

plt.title("Heatmap of Percentage Exposure of Survey Respondents to Misleading Claims by Social Class")
plt.xlabel("Social Class")
plt.ylabel("Misleading Claim")

plt.tight_layout()

plt.savefig('3_final_figures/misinformation_ofcom/exposure_to_misinformation/exposure_by_class.png', dpi=300)

plt.close('all')

## 10. Device usage 

In [127]:
device_used_to_connect = ofcom_dataframes['device_usage_to_connect_to_internet']

In [128]:
device_used_to_connect = device_used_to_connect[['response', 'total']]

In [129]:
device_used_to_connect = device_used_to_connect.groupby('response').sum()

In [130]:
device_used_to_connect = device_used_to_connect.sort_values(by='total', ascending=False)

In [131]:
biggest_used_devices = device_used_to_connect[:13]

In [132]:
other = device_used_to_connect[13:-1]

In [133]:
final_row = device_used_to_connect[-1:]

In [134]:
other_total = other.sum().item()

In [135]:
current_other_row = biggest_used_devices.loc['Other portable/ handheld device (e.g. portable games console/ iPod Touch)']

In [136]:
# Fold the combined total of the small device categories into the existing
# "Other portable/handheld device" row of the top-devices table.
other_total = other.sum().item()

current_other_row_total = current_other_row.iloc[0].item()

# Write the new total into the DataFrame itself. Assigning into the row Series
# returned by `.loc[label]` is not guaranteed to reach the frame (it is a copy
# under pandas copy-on-write), which would leave the pie chart without the
# folded-in devices. The explicit copy detaches the table from the frame it
# was sliced from, so the source data stays untouched.
biggest_used_devices = biggest_used_devices.copy()
biggest_used_devices.loc[current_other_row.name, 'total'] = other_total + current_other_row_total

In [137]:
biggest_used_devices = pd.concat([biggest_used_devices, final_row], ignore_index=False)

In [138]:
biggest_used_devices = biggest_used_devices.sort_values(by='total', ascending=False)

In [139]:
data = biggest_used_devices['total']

In [140]:
labels = biggest_used_devices.index

In [141]:
title = "Pie Chart Showing the Devices Used by Respondents to Access the Internet."

In [142]:
output_file = "3_final_figures/misinformation_ofcom/device_usage/piechart_devices_used_all_respondents"

In [143]:
plot_pie_chart(data= data, labels=labels, output_file = output_file, title = title, colorscheme='viridis', explode_index=0, figsize=(16, 12))

## 11. Frequency of sources used

In [293]:
sources_used_dataframe = ofcom_dataframes['sources_used_to_get_infonews_about_coronavirus_outbreak_in_last_week']

In [294]:
all_sources_dataframe = sources_used_dataframe.copy()

In [295]:
all_sources_dataframe = all_sources_dataframe[['response', 'total']]

In [296]:
all_sources_dataframe = all_sources_dataframe.groupby('response').sum()

In [297]:
# NOTE(review): this repeats the grouping from the previous cell. 'response' is
# already the index at this point, so each group should hold a single row and
# the totals should come out unchanged - confirm and consider removing this cell.
all_sources_dataframe = all_sources_dataframe.groupby('response').sum()

In [298]:
all_sources_dataframe = all_sources_dataframe.sort_values(by='total', ascending=False)

In [299]:
all_sources_dataframe = all_sources_dataframe[
    ~all_sources_dataframe.index.str.contains('NET:|Weighted base|Unweighted base')]

In [308]:
# Treemap of reported news-source usage: one viridis colour per source, with
# rectangle area proportional to the source's total.
source_count = len(all_sources_dataframe)
colors = plt.cm.viridis(np.linspace(0, 1, source_count))

plt.figure(figsize=(20, 15))

squarify.plot(sizes=all_sources_dataframe['total'], color=colors)

plt.title("Treemap of Source of News Reported Usage Frequencies", fontsize=14)
plt.axis('off')

# The rectangles are drawn unlabelled, so build proxy patches in the same
# colour order and name the sources in a legend beside the plot.
handles = [plt.Rectangle((0, 0), 1, 1, color=colour) for colour in colors]
plt.legend(
    handles,
    all_sources_dataframe.index,
    title="Media Types",
    loc='center left',
    bbox_to_anchor=(1, 0.5),
    fontsize=10,
)

plt.tight_layout()

plt.savefig('3_final_figures/misinformation_ofcom/news_sources_figures/news_sources_used_treemap.png', dpi=300)

plt.close('all')

In [311]:
# Word cloud of news sources, sized by how often each source was reported.
# After the groupby the source names live in the index rather than in a
# 'response' column, so expose them as a column before pairing them with totals.
source_totals = all_sources_dataframe
if 'response' not in source_totals.columns:
    source_totals = source_totals.rename_axis('response').reset_index()

response_counts = dict(zip(source_totals['response'], source_totals['total']))

wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis_r').generate_from_frequencies(response_counts)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')

plt.axis('off')

plt.title('Frequency for Reported News Source Usage during the COVID-19 Pandemic')

plt.tight_layout()

plt.savefig("3_final_figures/misinformation_ofcom/news_sources_figures/news_sources_word_cloud.png", dpi=300)

plt.close()

## 12. Vaccination

### OFCOM DATA

In [152]:
vaccination_survey_dataframe = ofcom_dataframes['to_what_extent_do_you_agree_or_disagree_with_the_following_statements_i_would_agree_to_be_vaccinated_against_the_coronavirus_if_there_was_a_vaccine_approved_by_the']

In [153]:
values_to_remove = [
    "NET: Agree",
    "NET: Disagree",
    "Standard deviation",
    "Standard error",
    "Unweighted base",
    "Weighted base",
    "Standard deviation Standard error",
    "Mean"
]

In [154]:
vaccination_survey_dataframe['question'][0]

'to_what_extent_do_you_agree_or_disagree_with_the_following_statements?_i_would_agree_to_be_vaccinated_against_the_coronavirus_if_there_was_a_vaccine_approved_by_the'

In [155]:
vaccination_survey_dataframe = vaccination_survey_dataframe[~vaccination_survey_dataframe['response'].isin(values_to_remove)]

In [156]:
vaccination_survey_dataframe = vaccination_survey_dataframe[['response', 'eastern', 'north_east', 'north_west', 'yorkshire_&_humberside', 'west_midlands', 'east_midlands', 'london', 'south_east', 'south_west']]

In [157]:
location_columns = ['north_east', 'eastern', 'north_west', 'yorkshire_&_humberside','west_midlands', 'east_midlands', 'london', 'south_east', 'south_west']

In [158]:
vaccination_survey_dataframe[location_columns] = vaccination_survey_dataframe[location_columns].astype(int)

In [159]:
vaccination_survey_dataframe = vaccination_survey_dataframe.groupby(['response'], as_index=False)[location_columns].sum()

In [160]:
# Derive a numeric weight for each response from the two characters that sit
# just before the final character of the response label. Labels whose slice is
# not numeric are coerced to NaN and then given a weight of 0.
# NOTE(review): presumably the labels end in a signed score such as "(+2)" or
# "(-1)" - confirm against the cleaned survey file.
vaccination_survey_dataframe['response_weighting'] = pd.to_numeric(vaccination_survey_dataframe['response'].str[-3:-1], errors='coerce'
).fillna(0).astype(int)

In [161]:
# Scale each region's response counts by the response weight, storing the
# result in a 'weighted_survey_<region>' column.
for location in location_columns:
    weighted_column = f'weighted_survey_{location}'
    region_counts = vaccination_survey_dataframe[location]
    vaccination_survey_dataframe[weighted_column] = region_counts.mul(vaccination_survey_dataframe['response_weighting'])

In [162]:
weighted_survey_series= vaccination_survey_dataframe[['weighted_survey_north_east',
       'weighted_survey_north_west', 'weighted_survey_yorkshire_&_humberside',
       'weighted_survey_west_midlands', 'weighted_survey_east_midlands',
       'weighted_survey_london', 'weighted_survey_south_east',
       'weighted_survey_south_west', 'weighted_survey_eastern']]

In [163]:
sums = weighted_survey_series.sum()

In [164]:
weighted_survey_dataframe = sums.to_frame()

In [165]:
weighted_survey_dataframe.rename(columns={0: 'vaccine_sentiment'}, inplace=True)

In [166]:
weighted_survey_dataframe.index = weighted_survey_dataframe.index.str.replace('weighted_survey_', '')

In [167]:
weighted_survey_dataframe.rename(index={'eastern': 'east_of_england', 'yorkshire_&_humberside': 'yorkshire_and_the_humber'}, inplace=True)
weighted_survey_dataframe.index.name = 'area_name'

### UKHSA Vaccine Data

In [168]:
# Parse the date column, then keep only the rows dated within December 2020
# (both bounds inclusive).
ukhsa_vaccination_dataframe['date'] = pd.to_datetime(ukhsa_vaccination_dataframe['date'])

in_december_2020 = ukhsa_vaccination_dataframe['date'].between('2020-12-01', '2020-12-31')
december_2020_vaccination_dataframe = ukhsa_vaccination_dataframe[in_december_2020]

In [169]:
december_region_vaccination = december_2020_vaccination_dataframe[['date', 'area_name', 'newPeopleVaccinatedFirstDoseByVaccinationDate', 'VaccineRegisterPopulationByVaccinationDate']]

In [170]:
# Collapse the December rows to one row per region by summing both columns.
# NOTE(review): the register population is summed across every date as well as
# the first-dose counts. If the population is repeated on each date, the summed
# denominator is inflated by the number of dates and the uptake percentage
# computed from it is deflated - confirm whether max/mean is intended instead.
december_region_vaccination = december_region_vaccination.groupby(['area_name'], as_index=False)[['newPeopleVaccinatedFirstDoseByVaccinationDate', 'VaccineRegisterPopulationByVaccinationDate']].sum()

In [171]:
december_region_vaccination['percentage_vaccine_uptake'] = (december_region_vaccination['newPeopleVaccinatedFirstDoseByVaccinationDate']/december_region_vaccination['VaccineRegisterPopulationByVaccinationDate'])*100

In [172]:
december_region_vaccination = december_region_vaccination[['area_name', 'percentage_vaccine_uptake']]

In [173]:
december_region_vaccination = december_region_vaccination.set_index('area_name')

### It would be useful to represent the data on a map so that you can visually see the uptake of vaccinations and the overall feelings towards vaccination in a similar time period

### In order to plot the data geographically I need to first read the shape file using the geopandas library. I found the necessary region shapefiles on the [Geoportal Statistics website](https://geoportal.statistics.gov.uk/datasets/cca6931ac5e54dcfba12ebbee4d9ae60_0/explore?location=52.083344%2C0.353373%2C5.73)

In [174]:
shapefile_path = '2_shape_files/Regions Dec 2020 EN BUC (1)/RGN_DEC_2020_EN_BUC.shp'

In [175]:
gdf = gpd.read_file(shapefile_path)

In [176]:
gdf['area_name'] = gdf['RGN20NM'].str.lower().str.replace(' ', '_')

In [177]:
gdf = gdf.set_index('area_name')

In [178]:
merged_dataframe = pd.merge(gdf, weighted_survey_dataframe, left_index=True, right_index=True)

### Willingness to be vaccinated plot

In [179]:
# Choropleth of the weighted vaccine-sentiment score per region.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

map_plot = merged_dataframe.plot(
    ax=ax, 
    column=merged_dataframe['vaccine_sentiment'], 
    cmap='viridis_r', 
    linewidth=0.25, 
    edgecolor='0.5', 
    legend=True, 
    legend_kwds={'label': "Agreeability"}
)

# Map coordinates carry no meaning for the reader, so hide the axis ticks.
ax.set_xticks([])
ax.set_yticks([])

# The legend colour bar is taken to be the last axes on the figure.
cbar = map_plot.get_figure().get_axes()[-1]

# Replace the numeric colour-bar ticks with three text labels, placed 25 units
# inside the observed minimum and maximum scores.
# NOTE(review): the ticks are positioned relative to the observed range, so
# 'Strongly Disagree' / 'Strongly Agree' mark the lowest / highest scoring
# regions rather than fixed points on the survey scale.
cbar.set_yticks(np.linspace(merged_dataframe['vaccine_sentiment'].min()+25, merged_dataframe['vaccine_sentiment'].max()-25, 3))
cbar.set_yticklabels(['Strongly\nDisagree', 'Neither\nAgree\nNor\nDisagree', 'Strongly\nAgree'])

map_plot.set_title('Willingness to be Vaccinated by Region September - November 2020')

plt.tight_layout()

fig.savefig("3_final_figures/misinformation_ofcom/vaccination/willingness_for_vaccination.png", dpi=300)

plt.close('all')

### Uptake of vaccination by region

In [180]:
merged_dataframe = pd.merge(gdf, december_region_vaccination, left_index=True, right_index=True)

In [181]:
# Choropleth of the December 2020 first-dose uptake percentage per region.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

map_plot = merged_dataframe.plot(
    ax=ax, 
    column=merged_dataframe['percentage_vaccine_uptake'], 
    cmap='viridis_r', 
    linewidth=0.25, 
    edgecolor='0.5', 
    legend=True, 
    legend_kwds={'label': "Vaccination Uptake"}
)

# Map coordinates carry no meaning for the reader, so hide the axis ticks.
ax.set_xticks([])
ax.set_yticks([])

# The legend colour bar is taken to be the last axes on the figure.
cbar = map_plot.get_figure().get_axes()[-1]

# Replace the numeric colour-bar ticks with Low / Medium / High, placed 0.0025
# inside the observed minimum and maximum, so the labels describe uptake
# relative to the other regions rather than absolute thresholds.
cbar.set_yticks(np.linspace(merged_dataframe['percentage_vaccine_uptake'].min()+ 0.0025, merged_dataframe['percentage_vaccine_uptake'].max() - 0.0025, 3))
cbar.set_yticklabels(['Low', 'Medium', 'High'])
map_plot.set_title('Vaccine Uptake by Region December 2020')

plt.tight_layout()

fig.savefig("3_final_figures/misinformation_ofcom/vaccination/vaccine_uptake_by_region.png", dpi=300)

plt.close('all')

## 13. Sources of misinformation

In [245]:
sources = ofcom_dataframes['source_of_claims_about_about_face_maskscoverings_offering_no_protection_or_being_harmful__as_true']

In [246]:
list_sources = [
    'source_of_claims_about_about_face_maskscoverings_offering_no_protection_or_being_harmful__as_true',
    'source_of_claims_that_the_coronavirus_vaccine_is_a_cover_for_a_plan_to_implant_trackable_microchips_in_people_reported_as_true',
    'source_of_claims_about_empty_hospitals_on_social_media_posts_prove_that_coronavirus_has_been_exaggerated_reported_as_true',
    'source_of_theory_linking_coronavirus_to_5g_technology_reported_as_true',
    'source_of_claims_stating_that_the_flu_alone_is_killing_more_people_than_coronavirus_as_true',
    'source_of_claims_about_about_injecting_disinfectant_as_true',
    'source_of_claims_about_the_potential_dangers_of_a_coronavirus_vaccine_reported_as_true'
]

In [247]:
# Stack the survey tables for every "source of claim" question into one frame.
# Each table is looked up by its own name from `list_sources`, and the frames
# are concatenated in a single call rather than grown one at a time in a loop.
sources_dataframe = pd.concat(
    [ofcom_dataframes[source] for source in list_sources],
    axis=0,
)

In [248]:
sources_dataframe = sources_dataframe[['response', 'total']]

In [249]:
sources_dataframe = sources_dataframe.groupby('response')['total'].sum()

In [254]:
sources_dataframe = sources_dataframe.reset_index()
sources_dataframe.index.name = 'response'

In [255]:
# Drop the summary rows (NET totals and the two base rows) in a single pass.
summary_row_pattern = '|'.join(['NET:', 'Unweighted base', 'Weighted base'])
is_summary_row = sources_dataframe['response'].str.contains(summary_row_pattern)
sources_dataframe = sources_dataframe[~is_summary_row]

In [256]:
# Word cloud of misinformation sources, sized by each source's summed total.
response_counts = {
    response: total
    for response, total in zip(sources_dataframe['response'], sources_dataframe['total'])
}

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis_r',
).generate_from_frequencies(response_counts)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Frequency for Reported Sources of Misinformation during the COVID-19 Pandemic')
plt.tight_layout()

plt.savefig("3_final_figures/misinformation_ofcom/sources_of_misinformation/sources_of_misinformation_word_cloud.png", dpi=300)
plt.close()

In [257]:
concern = ofcom_dataframes['to_what_extent_are_you_concerned_or_not_concerned_about_the_following_statements__the_amount_of_false_or_misleading_information_you_may_be_getting_about_coronavirus']

In [310]:
concern

Unnamed: 0,start_date,end_date,question,response,total,male,female,16-24,18-24,25-34,...,yorkshire_&_humberside,west_midlands,east_midlands,wales,eastern,london,south_east,south_west,northern_ireland,net:_england
0,2020-06-19,2020-06-21,to_what_extent_are_you_concerned_or_not_concer...,Standard deviation Standard error,1.24,1.28,1.17,1.25,1.25,1.22,...,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0
1,2020-06-19,2020-06-21,to_what_extent_are_you_concerned_or_not_concer...,Mean,3.03,2.88,3.18,3.01,2.98,3.05,...,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0
2,2020-06-19,2020-06-21,to_what_extent_are_you_concerned_or_not_concer...,NET: Bottom 2 box,716,419.00,297,108.00,99.00,119,...,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0
3,2020-06-19,2020-06-21,to_what_extent_are_you_concerned_or_not_concer...,1 - Not at all concerned,302,193.00,109,41.00,38.00,47,...,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0
4,2020-06-19,2020-06-21,to_what_extent_are_you_concerned_or_not_concer...,2,413,226.00,187,67.00,61.00,71,...,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,2021-06-04,2021-06-06,to_what_extent_are_you_concerned_or_not_concer...,3,671,286.00,385,101.00,86.00,117,...,53.0,57,59,30.0,71.0,73,91.0,64.0,22.0,569
117,2021-06-04,2021-06-06,to_what_extent_are_you_concerned_or_not_concer...,4,402,201.00,202,48.00,41.00,81,...,27.0,40,23,19.0,37.0,69,48.0,36.0,17.0,345
118,2021-06-04,2021-06-06,to_what_extent_are_you_concerned_or_not_concer...,5 - Very concerned,258,131.00,128,42.00,40.00,43,...,26.0,17,14,10.0,18.0,55,28.0,17.0,7.0,219
119,2021-06-04,2021-06-06,to_what_extent_are_you_concerned_or_not_concer...,Weighted base,2135,1044.00,1091,295.00,254.00,356,...,174.0,186,153,101.0,201.0,282,292.0,182.0,64.0,1792
