In [10]:
import pandas as pd
from pathlib import Path
import os
import sys
import csv
import math
import re
import numpy as np

#Path
os.chdir(r"C:\Users\lenovo\Documents\BSE\SEM_2\text_mining\as_4")

#Read in file

corpus_data = pd.read_csv("all_english.csv", delimiter=',', encoding='utf-8', converters = {"tweet_hashtags": lambda x: x.strip("[]").replace("'","").split(", ")})

corpus_data.head()
#%% 
# Auxiliary functions
def cleanTweets(s):
    #Line breaks
    s = s.replace(r'<lb>', "\n")
    s = re.sub(r'<br */*>', "\n", s)
    # Tabs
    s = s.replace(r'<tab>', "\i")
    #Symbols
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("&amp;", "&")
    # urls
    s = re.sub(r'\(https*://[^\)]*\)', "[url]", s)
    s = re.sub(r'https*://[^\s]*', "[url]", s)
    # Replace double instances of quotations with single instance
    s = re.sub(r'"+', '"', s)
    # custom removals
    s = re.sub(r'@[A-Za-z0-9_]+', "@usermention", s) # remove mentions
    #s = re.sub(r'#[A-Za-z0-9_]+', "#hashtag", s) # remove hashtags
    s = re.sub(r':[^:]+','[emoji]',s) # remove demojized text
    return str(s)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
uk_data = corpus_data[corpus_data["group_country"]=="United Kingdom"]
uk_cons = uk_data[uk_data["party_name"]=="Conservative"]
uk_labour = uk_data[uk_data["party_name"]=="Labour"]
for df in uk_cons, uk_labour:
    df['demojized_text'] = [cleanTweets(text) for text in df['demojized_text']]
print(len(uk_cons))
print(len(uk_labour))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['demojized_text'] = [cleanTweets(text) for text in df['demojized_text']]


32438
40764


In [15]:
#%%
from nltk.corpus import stopwords
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
np.seterr(divide='ignore', invalid='ignore')

start_time = datetime.now()

# Add custom stop words
add_stopwords = ['usermention','emoji','url'] # can add hashtag here
stop_words = text.ENGLISH_STOP_WORDS.union(add_stopwords)

# Vectorise word counts - only construct tri-grams
vec = CountVectorizer(ngram_range = (4,4), stop_words=stop_words, min_df=15, max_df=0.6)
#Fit vectoriser and convert to dense matrix
uk_vector_cons = vec.fit_transform(uk_cons.demojized_text).todense()
uk_vector_lab = vec.fit_transform(uk_labour.demojized_text).todense()
# Term frequencies
tf_cons = np.array(uk_vector_cons)
tf_lab = np.array(uk_vector_lab)# frequencies of each token in a numpy array
totaltf_cons = tf_cons.sum(axis=0) # sum of all frequencies for a particular token for all corpus (column)
totaltf_lab = tf_lab.sum(axis=0)
print("matrix has size", uk_vector_cons.shape)
print("matrix has size", uk_vector_lab.shape)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

matrix has size (32438, 199)
matrix has size (40764, 465)
Duration: 0:00:06.153827


In [18]:
# %%
# Frequencies of all terms
all_terms_cons = dict(zip(vec.get_feature_names_out(), totaltf_cons))
all_terms_lab = dict(zip(vec.get_feature_names_out(), totaltf_lab))
from heapq import nlargest
# DICTIONARY - top 100 terms (you can vary this)
N = 100
top100_terms_cons = nlargest(N, all_terms_cons , key = all_terms_cons.get)
top100_terms_lab = nlargest(N, all_terms_lab , key = all_terms_lab.get)
#print(top100_terms_cons)
#print(top100_terms_lab)

# %%
# Add column for filtering with text split
#demojized_text_split = [i.split() for i in list(uk_data['demojized_text'])]
top100_terms_split_cons = [i.split() for i in top100_terms_cons]
top100_terms_split_lab = [i.split() for i in top100_terms_lab]
print(top100_terms_split_cons)
print(top100_terms_split_lab)

[['attorney', 'general', 'legal', 'advice'], ['clinically', 'unjustifiable', 'father', 'autistic'], ['consecutive', 'year', 'central', 'government'], ['govt', 'really', 'embarrassing', 'shambles'], ['bid', 'deliver', 'meaningful', 'changes'], ['brexit', 'deal', 'knows', 'lose'], ['check', 'labour', 'personal', 'manifesto'], ['deal', 'brexit', 'nthis', 'acceptable'], ['getting', 'longer', 'public', 'health'], ['issues', 'care', 'll', 'tell'], ['issues', 'challenge', 'boris', 'johnson'], ['attempt', 'make', 'bad', 'deal'], ['deal', 'nshe', 'running', 'scared'], ['chaos', 'unable', 'deliver', 'brexit'], ['damaging', 'impact', 'brexit', 'deal'], ['adverts', 'featured', 'misleading', 'factually'], ['bad', 'isn', 'willing', 'parliament'], ['boris', 'johnson', 'caught', 'lying'], ['botched', 'brexit', 'deal', 'worst'], ['bring', 'forward', 'meaningful', 'vote'], ['care', 'll', 'tell', 've'], ['country', 'puts', 'jobs', 'economy'], ['cuts', 'police', 'simple', 'truth'], ['daily', 'monthly', 'o

That is basically returning two nested lists with the most frequent 4-grams. When going through the list it is clearly visible that most of the 4-grams could be defined as polital spin. Essentially as an approach to make the before used collabsed dictionary more robust, we will substract the words used by the conservatives from the labour term, to get a more labour unique dictionary.

In [20]:
## flatten 4-grams and substracting conservatives terms from labour terms
flat_list_cons = [item for sublist in top100_terms_split_cons for item in sublist]
flat_list_lib = [item for sublist in top100_terms_split_lab for item in sublist]
output = [x for x in flat_list_lib if not x in flat_list_cons or flat_list_cons.remove(x)]
output = list(set(output))
output

['nsource',
 'say',
 'uk',
 'forced',
 'questions',
 'saying',
 'people',
 'working',
 'said',
 'pneumonia',
 'face',
 'ge2019',
 'neil',
 'rough',
 'grabbed',
 'theresa',
 'farage',
 'just',
 'barr',
 'denial',
 'hospital',
 'nwake',
 'unacceptable',
 'britain',
 'prove',
 'things',
 'tried',
 'phone',
 'sell',
 'commit',
 'come',
 'thursday',
 'jack',
 'years',
 'view',
 'boris',
 'entry',
 'save',
 'coats',
 'tories',
 'suspected',
 'nit',
 'prepared',
 'issues',
 'williment',
 'ready',
 'year',
 'statsofshame',
 'saveournhs',
 'votelabour',
 '12',
 'sleeping',
 'picture',
 'untrustworthy',
 'nhs',
 'wait',
 'likes',
 'floor',
 'nthe',
 'control',
 'na',
 'pmqs',
 'nvote',
 'oven',
 'doesn',
 'end',
 'pocket',
 'provide',
 'state',
 'austerity',
 'home',
 'labour',
 '14',
 'johnson',
 'seven',
 'free',
 'challenge',
 'pm',
 'mr',
 'nigel',
 'generalelection2019',
 'late',
 'running',
 'deemed',
 'poverty',
 'old',
 'votelabourtoday',
 'parlipuboty',
 'video',
 'interview',
 'nandrew

In [21]:
with open('labour_dictionary.txt', 'w') as f:
    for item in output:
        f.write("%s\n" % item)

### Create a simple measure for spin
Basically label data containing spin according to their media attachment. If a video or photo is attached to a tweet from the official party accounts, these tweets can be in general classified as spin. Of course there will be some exceptions, but the idea is to measure differences using this labelling strategy in the regression to previous results

In [23]:
uk_md = pd.read_csv("uk_tweets.csv")
uk_md = uk_md[['id', 'created_at', 'entities.urls', 'attachments.media', 'author.public_metrics.followers_count']]

uk_md['created_at'] = pd.to_datetime(uk_md['created_at'], format="%Y-%m-%dT%H:%M:%S.000Z")

uk_data_spin = pd.merge(uk_data, uk_md, on = 'id')

In [36]:
#uk_data_spin[uk_data_spin['attachments.media'] != None]
#uk_data_spin['label'] = [1 if i.contains('video|photo', na=False) else 0 for i in uk_data_spin['attachments.media']]
uk_data_spin['label'] = uk_data_spin['attachments.media'].str.contains('video|photo', na=False).astype(int)
uk_data_spin['label'].value_counts()

0    33138
1     7538
Name: label, dtype: int64