# Lexicon-based sentiment analysis - UK Muslims (2024-02-15)

_by A. Maurits van der Veen_  

_Modification history:_  
_2020-09-30 - Initial extraction from various application-specific notebooks_  
_2020-10-08 - Compile into coherent notebook_  
_2020-11-05 - Updates & cleaning_  
_2020-11-09 - Updates for pathnames & saving visualizations_  
_2020-11-20 - Updates for displaying (selections of) full rows_  
_2024-02-15 - Minor updates & explanatory text for upload as part of replication data_  

This file contains the code to analyze the sentiment of the coverage of Muslims in the UK press, as reported in:

`van der Veen, A. Maurits, and Erik Bleich. "The advantages of lexicon-based sentiment analysis in an age of machine learning."`

Specifically, it recreates figures 1 and 2 in that paper.

### 0. Set-up

In [None]:
# Folder that holds the replication input files and receives the saved figures.
# Keep the trailing slash: file names are appended to this string with `+`.
projectfolder = '/Users/xxx/Replication/'


In [None]:
import sys
# import os

from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Report only the interpreter version number, i.e. the first whitespace-separated
# token of sys.version (print sys.version itself for the full build information)
print('You are using python version {}.'.format(sys.version.split(maxsplit=1)[0]))

### 1. Load corpus


In [None]:
# Locate the two input files inside the project folder
metadatafile = projectfolder + 'Muslims_meta.csv'
valencefile = projectfolder + 'Muslims_vals_cal.csv'

# Read the valence scores, then attach them to the article metadata.
# The left merge on the 'id' column keeps every metadata row, whether or not
# a matching valence row exists; the default integer row index is retained.
valencedf = pd.read_csv(valencefile)
df = pd.merge(pd.read_csv(metadatafile), valencedf, how='left', on='id')


In [None]:
# List the name of every variable (column) in the merged dataset
df.columns.tolist()


In [None]:
# Cross-tabulate publications against the 'tabloid' flag, with row and column
# totals, to check article quantities and how 'tabloid' is defined
pd.crosstab(index=df['Publication'], columns=df['tabloid'],
            margins=True, margins_name='Total')


### 2. Prepare data for plotting


In [None]:
# Convert to internal date format, so that 'Date' can later serve as the time
# index for monthly grouping and date-range slicing.
# NOTE(review): no explicit format is passed, so pandas infers the layout of the
# date strings -- confirm this matches how dates are written in the metadata file.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Binarize valence: -1 for articles whose average valence is negative, +1 otherwise
# (an average of exactly zero is coded +1).
# Articles without a valence score -- possible because the valence file is
# left-merged onto the metadata -- stay missing (NaN) instead of being coded +1:
# a bare `NaN < 0` test is False, which would silently count them as positive
# and bias the averages of the binarized measure upward.
df['avg_valence_bin'] = np.where(df['avg_valence'].isna(), np.nan,
                                 np.where(df['avg_valence'] < 0, -1.0, 1.0))

In [None]:
# String variables cannot be aggregated => drop
# (these columns are removed from the data before monthly means are computed)
stringvars = ['corpusid', 'Publication', 'Day of week']

In [None]:
# Reduce to monthly averages: group by month, then take means and add a count column

# Group by calendar month, with 'Date' as the time index. A MonthEnd offset object
# is passed instead of the 'M' string alias, which recent pandas releases deprecate
# in favor of 'ME'; the offset object means the same thing on every version.
df_bymonth = df.drop(stringvars, axis=1).set_index('Date').groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))

# Collapse by taking averages. The built-in .mean() computes what aggregate(np.mean)
# did, without the FutureWarning recent pandas raises for NumPy callables.
df_bymonthX = df_bymonth.mean()

# Add a count variable indicating the number of articles that month. size() is indexed
# by the same monthly 'Date' bins as the means, so the assignment aligns row by row.
df_bymonthX['nrarticles'] = df_bymonth.size()

In [None]:
# 5-month exponential moving average, applied to every column of the monthly table.
# With adjust=False pandas uses the recursive form
#   y[t] = (1 - alpha) * y[t-1] + alpha * x[t],   alpha = 2 / (span + 1)
# so span=5 gives alpha = 1/3.
df_bymonth_ewm5  = df_bymonthX.ewm(span=5, adjust=False).mean()


### 3. Generate charts


In [None]:
# Typeface and default text size used for all figures
font = dict(family='Times New Roman', size=16)
matplotlib.rc('font', **font)

In [None]:
# Figure 1: Decade surrounding 9/11, average monthly valence, comparing binarized vs. fine-grained valence measures

# Specify date range
start_date = '1996-01-01'
end_date = '2006-12-31'
# Length of the date range in (fractional) days
nrdays = (pd.to_datetime(end_date) - pd.to_datetime(start_date)) / pd.Timedelta(days=1)

nrdays *= 1.1  # add for left-right margin of graph
days2add = nrdays * 0.05  # to skip left-hand margin

# Initialize figure
fig, ax = plt.subplots(figsize=(15,9))

# Specify dataframe to use, and restrict it once to the date range being plotted
df2use = df_bymonth_ewm5
window = df2use.loc[start_date:end_date]

# Plot average valence: binarized first, then the original fine-grained measure
for column, seriescolor, serieslabel in (('avg_valence_bin', 'C0', 'Binary valence'),
                                         ('avg_valence', 'C1', 'Valence')):
    ax.plot(window.index, window[column], color=seriescolor, lw=1, label=serieslabel)


# Add short horizontal lines for mean values during particular periods
# (note: these use the unsmoothed monthly table, not the exponentially weighted average!)

sections = [('2001-10-01', '2001-12-31'),
            ('2002-01-01', '2002-12-31')]

for sectionstart, sectionend in sections:
    # Plain mean of the monthly values falling inside this period, per measure
    sectionmeans = df_bymonthX.loc[sectionstart:sectionend, ['avg_valence', 'avg_valence_bin']].mean()
    xmin, xmax = pd.to_datetime(sectionstart), pd.to_datetime(sectionend)
    ax.hlines(sectionmeans['avg_valence'], xmin, xmax, color='C1', lw=3)
    ax.hlines(sectionmeans['avg_valence_bin'], xmin, xmax, color='C0', lw=3)

# Add vertical lines at key events (Sep. 11)

eventcol = 'gray'
eventstyle = 'dashed'

# Each entry: (date, label, line color, line style, line width)
xposition = [(pd.to_datetime('2001-09-01'), '  9/11/2001', eventcol, eventstyle, 1),
             # (pd.to_datetime('2005-07-07'), '  7/7/2005', eventcol, eventstyle, 1),
            ]
for xc, event, linecolor, style, width in xposition:
    ax.axvline(x=xc, color=linecolor, linestyle=style, lw=width)
    # Place the label near the line: express the event date as a fraction of the axis
    # width, treating the x-axis as the date range plus a 5% margin on either side.
    nrdays_elapsed = (xc - pd.to_datetime(start_date)) / pd.Timedelta(days=1)
    ax.annotate(text=event, xy=((nrdays_elapsed + days2add) / nrdays, 0.99),
                xycoords='axes fraction', verticalalignment='top',
                horizontalalignment='center', rotation=270)

# Label y axis
ax.set_ylabel('Article valence', fontsize='large')

# Add axis & legend info

# Legend data for plotted line(s)
handles, labels = ax.get_legend_handles_labels()
# Add text legend: an invisible (zero-width) line whose label serves as a note
handles.append(matplotlib.lines.Line2D([0], [0], color='gray', lw=0, label='(Horizontal lines are multi-month averages)'))

# Update the legend
plt.legend(handles=handles, loc='center right', fontsize='large')

# Save and display
plt.tight_layout()
plt.savefig(projectfolder + "Fig1-test.pdf", bbox_inches="tight", dpi=300)
plt.show()


In [None]:
# Figure 2: Valences for entire period, distinguishing tabloids and broadsheets

# Specify date range
start_date = '1996-05-01'
end_date = '2016-12-31'
# Length of the date range in (fractional) days
nrdays = (pd.to_datetime(end_date) - pd.to_datetime(start_date)) / pd.Timedelta(days=1)

nrdays *= 1.1  # add for left-right margin of graph
days2add = nrdays * 0.05  # to skip left-hand margin

# Initialize figure
fig, ax = plt.subplots(figsize=(15,9))

# To plot sub-sections of the corpus, we need to split those out first,
# and then repeat the aggregation and averaging steps to get a comparable subcorpus to plot.
targetvariable = 'tabloid'
targetvalues = (0, 1)
targetlabels = ('Broadsheet', 'Tabloid')
targetcolors = ('C0', 'C1')

for val, subcorpus in df.groupby(targetvariable):
    if val not in targetvalues:
        continue  # not one of the sub-corpora to plot

    # Find label and color by the value's position in targetvalues rather than
    # using the value itself as an index: that only works for integer 0/1 and
    # fails if the column is stored as float or bool.
    pos = targetvalues.index(val)

    # Group by calendar month (MonthEnd offset object: same meaning as the
    # 'M' alias, which recent pandas releases deprecate)
    subcorpus_bymonth = subcorpus.drop(stringvars, axis=1).set_index('Date').groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))

    # Collapse by taking averages and adding a count variable indicating number of articles that month
    subcorpus_bymonthX = subcorpus_bymonth.mean()
    subcorpus_bymonthX['nrarticles'] = subcorpus_bymonth.size()

    # Generate exponential moving average (make sure to use same span as for full corpus!)
    subcorpus_bymonth_ewm5 = subcorpus_bymonthX.ewm(span=5, adjust=False).mean()

    # Now plot the lines for this subcorpus (as distinct from full corpus):
    # solid for fine-grained valence, dashed (and left out of the legend) for binarized
    window = subcorpus_bymonth_ewm5.loc[start_date:end_date]
    ax.plot(window.index, window['avg_valence'],
            color=targetcolors[pos],
            lw=3,
            label='{} valence'.format(targetlabels[pos]))
    ax.plot(window.index, window['avg_valence_bin'],
            color=targetcolors[pos],
            lw=2, linestyle='dashed',
            label='_nolegend_')

# Add vertical lines at key events (Sep. 11)
eventcol = 'gray'
eventstyle = 'dashed'

# Each entry: (date, label, line color, line style, line width)
xposition = [(pd.to_datetime('2001-09-01'), '  9/11/2001', eventcol, eventstyle, 1),
             # (pd.to_datetime('2005-07-07'), '  7/7/2005', eventcol, eventstyle, 1),
            ]
for xc, event, linecolor, style, width in xposition:
    ax.axvline(x=xc, color=linecolor, linestyle=style, lw=width)
    # Place the label near the line: express the event date as a fraction of the axis
    # width, treating the x-axis as the date range plus a 5% margin on either side.
    nrdays_elapsed = (xc - pd.to_datetime(start_date)) / pd.Timedelta(days=1)
    ax.annotate(text=event, xy=((nrdays_elapsed + days2add) / nrdays, 0.99),
                xycoords='axes fraction', verticalalignment='top',
                horizontalalignment='center', rotation=270)

# Label y axis
ax.set_ylabel('Article valence', fontsize='large')

# Add axis & legend info

# Legend data for plotted line(s)
handles, labels = ax.get_legend_handles_labels()

# Add text legend: an invisible (zero-width) line whose label serves as a note
handles.append(matplotlib.lines.Line2D([0], [0], color='gray', lw=0, label='(Dashed lines are binarized valence)'))

# Update the legend
plt.legend(handles=handles, loc='upper right', fontsize='large')

# Save & display
plt.tight_layout()
plt.savefig(projectfolder + "Fig2-test.pdf", bbox_inches="tight", dpi=300)
plt.show()

