# Corona-Tweet Analysis
### Samuel Heinz, Alexander von Stegmann

## 1 Preparation

### 1.1 Import Libraries and load training set and data set

In [132]:
import pandas as pd
import re
import emoji
import random
import nltk
from string import punctuation
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from collections import Counter
from nltk import NaiveBayesClassifier
from tqdm.auto import tqdm

In [108]:
td = pd.read_csv('training_data.csv', sep='|', lineterminator='\n')

# prepare full corpus
LABELS = {'acknowledged', 'opposed', 'neutral', 'not related'}
full_corpus = [
    (text, label)
    for label in LABELS
    for text in td.text.loc[td[label] == 1].values.tolist()
]

lbl_dist = Counter(label for _, label in full_corpus)
print(lbl_dist)

Counter({'neutral': 398, 'acknowledged': 276, 'not related': 180, 'opposed': 146})


### 1.2 Preprocessing

##### Normalization

In [109]:
NORMALIZE_RULES = {
    "it's": "it is",
    "it’s": "it is",
    "n't": " not",
    "n’t": " not",
    "'m": " am",
    "’m": " am",
    "'ve": " have",
    "’ve": " have",
    "'re": " are",
    "’re": " are",
    "'ll": " will",
    "’ll": " will",
    "&amp;": "and",
    "&gt;": ""
}

def normalize(text):
    for old, new in NORMALIZE_RULES.items():
        text = text.replace(old, new)
    return text

##### Remove Punctuation

In [110]:
PUNCTUATION = set(punctuation)

def remove_punctuation(text):
    # text = re.sub('(?<=\d),(?=\d)', '', text)
    text = re.sub('([,\d]*\d,\d[,\d]*)', 'num', text)
    text = re.sub('(\d{,3}[.]*\d{2,}%)', 'perc', text)
    for p in PUNCTUATION:
        text = text.replace(p, " ")
    return text

##### Define Stopwords

In [111]:
STOPWORDS = set(stopwords.words('english'))

##### Lemmatizing

In [112]:
LEMMA = WordNetLemmatizer()

def convert_pos_tag(tag):

    converted = wordnet.NOUN

    if tag.startswith('J'):
        converted = wordnet.ADJ
    elif tag.startswith('V'):
        converted = wordnet.VERB
    elif tag.startswith('R'):
        converted = wordnet.ADV

    return converted

##### Stemming

In [113]:
STEMMER = SnowballStemmer(language='english')

def stem(tokens):
    return [STEMMER.stem(token) for token in tokens]

##### Tokenization

In [114]:
def tokenize(text):
    """
    Takes a text (or tweet) and applies following rules. Afterwards the text is split into tokens that
    are lemmatized and stemmed
    :param text: full str of a text
    :return: list of tokens
    """
    # 1.1 Lower case everything
    lower = text.lower()
    # 1.2 Normalize & clear punctuation
    normalized = normalize(lower)
    no_punct = remove_punctuation(normalized)
    # 1.3 "de-emojize"
    demojized = emoji.demojize(no_punct, delimiters=[' ',' '])
    # 1.4 Create tokens and drop Stopwords
    tokens = [token
              for token in demojized.split()
              if token not in STOPWORDS]

    # 2. POS-Tagging
    with_pos = nltk.pos_tag(tokens)

    # 3.1 Conversion of pos tags for lemmatizer
    with_converted_pos = [(token, convert_pos_tag(tag)) for token, tag in with_pos]

    # 3.2 Lemmatize
    lemmatized_tokens = [LEMMA.lemmatize(token, pos=tag) for token, tag in with_converted_pos]

    # 4. Stemming
    stemmed_tokens = [STEMMER.stem(token) for token in lemmatized_tokens]

    return stemmed_tokens

In [115]:
howclean = td.text.values.tolist()

In [116]:
howclean2 = [tokenize(text) for text in howclean]

In [117]:
for i in range(0,25):
    print(f"Input : {howclean[i]}")
    print(f"Tokens : {howclean2[i]} \n")

Input : Organizers of today's #SickOutBC are not ruling out another protest before #Christmas.   Langley mom Miranda Tracy (@mjt8080) says many parents --and teachers-- do not trust public health claims #COVID19 transmission rates among children are low.    #bced #bcpoli @NEWS1130
Tokens : ['organ', 'today', 'sickoutbc', 'rule', 'anoth', 'protest', 'christma', 'langley', 'mom', 'miranda', 'traci', 'mjt8080', 'say', 'mani', 'parent', 'teacher', 'trust', 'public', 'health', 'claim', 'covid19', 'transmiss', 'rate', 'among', 'child', 'low', 'bced', 'bcpoli', 'news1130'] 

Input : So questionnn the people administering the vaccine are they getting it they self??? #vaccine  #COVID19
Tokens : ['questionnn', 'peopl', 'administ', 'vaccin', 'get', 'self', 'vaccin', 'covid19'] 

Input : President #Trump to sign decree that prioritises American citizens getting access to #coronavirus vaccine before other nations.
Tokens : ['presid', 'trump', 'sign', 'decre', 'prioritis', 'american', 'citizen', 'ge

#### 1.3 Feature Extraction

In [118]:
def extract_features(tokens, vocabulary):
    features = {}

    token_count = Counter(tokens)

    for vocab_token in vocabulary:
        features[f'amount({vocab_token})'] = token_count[vocab_token]
    return features

#### 1.4  Dividing Training Set

In [119]:
def train_test_split(corpus, amount=0.8):
    split_index = int(len(corpus) * amount)

    shuffled = random.sample(corpus, len(corpus))

    train = shuffled[:split_index]
    test = shuffled[split_index:]

    return train, test

In [120]:
def build_vocabulary(corpus, num_words=300):
    full_count = Counter(token for tokens, _ in corpus for token in tokens)
    return sorted([token for token, _ in full_count.most_common(num_words)])

### 2 Evaluation

#### 2.1 Split the data set

In [158]:
train_set = []
test_set = []
for label in LABELS:
    train_tweets, test_tweets = train_test_split([tweet for tweet in full_corpus if tweet[1] == label], 0.9)
    train_set.extend(train_tweets)
    test_set.extend(test_tweets)

print(f"The train set has {len(train_set)} items, the test set {len(test_set)}")

Counter(label for _ , label in train_set), Counter(label for _ , label in test_set)

The train set has 899 items, the test set 101


(Counter({'not related': 162,
          'neutral': 358,
          'acknowledged': 248,
          'opposed': 131}),
 Counter({'not related': 18,
          'neutral': 40,
          'acknowledged': 28,
          'opposed': 15}))

#### 2.2 Preprocessing

In [None]:
train_set_preprocessed = [(tokenize(tweets), labels) for tweets, labels in tqdm(train_set, total=len(train_set), desc='Preprocessing')]
test_set_preprocessed = [(tokenize(tweets), labels) for tweets, labels in tqdm(test_set, total=len(test_set), desc='Preprocessing')]

#### 2.3 Build vocabulary (only nltk)

In [20]:
TRAIN_VOCABULARY = build_vocabulary(train_set_preprocessed, 300)
TEST_VOCABULARY = build_vocabulary(test_set_preprocessed, 300)

#### 2.4 Feature extraction

In [None]:
train_data = [(extract_features(tokens, TRAIN_VOCABULARY), label) for tokens, label in tqdm(train_set_preprocessed, total=len(train_set_preprocessed), desc='Preprocessing')]
test_data = [(extract_features(tokens, TEST_VOCABULARY), label) for tokens, label in tqdm(test_set_preprocessed, total=len(test_set_preprocessed), desc='Preprocessing')]

#### 2.5 Train classifier

In [22]:
#nltk
nb = NaiveBayesClassifier.train(train_data)

In [170]:
#nltk
nb.show_most_informative_features(10)

Most Informative Features
            amount(life) = 1              not re : neutra =     13.9 : 1.0
       amount(wearamask) = 1              acknow : neutra =     13.0 : 1.0
          amount(govern) = 1              oppose : acknow =     11.9 : 1.0
            amount(wear) = 1              acknow : neutra =     11.3 : 1.0
 amount(realdonaldtrump) = 1              acknow : neutra =     11.1 : 1.0
           amount(elect) = 1              not re : neutra =     11.0 : 1.0
            amount(even) = 1              not re : neutra =     11.0 : 1.0
            amount(mask) = 1              acknow : neutra =      9.6 : 1.0
             amount(new) = 1              neutra : oppose =      9.5 : 1.0
            amount(want) = 1              oppose : neutra =      8.2 : 1.0


In [24]:
#nltk
nltk.classify.accuracy(nb, test_data)

0.5495049504950495

In [25]:
#nltk
predictions = nb.classify_many([feature for feature, _ in test_data])

In [26]:
#nltk
gold = [label for _, label in test_data]

In [27]:
# Micro Average

tp, fp, fn = 0, 0, 0

for predicted, correct in zip(predictions, gold):
    for label in LABELS:
        if correct == label:
            if predicted == label:
                tp += 1
            else:
                fn += 1
        else:
            if predicted == label:
                fp += 1
            # We don't care about TN for precision/recall

micro_precision = tp / (tp + fp)
micro_recall = tp / (tp + fn)
micro_fscore = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall)

print(f"""
Micro-Precision: {micro_precision:.2f}
Micro-Recall   : {micro_recall:.2f}
Micro-FScore   : {micro_fscore:.2f}
""")


Micro-Precision: 0.55
Micro-Recall   : 0.55
Micro-FScore   : 0.55



In [28]:
# Macro Average

precisions, recalls, fscores = {}, {}, {} # as dictionary so, we store it by _label_

for label in LABELS:
    tp, fp, fn = 0, 0, 0
    for predicted, correct in zip(predictions, gold):
        if correct == label:
            if predicted == label:
                tp += 1
            else:
                fn += 1
        else:
            if predicted == label:
                fp += 1
    p = tp / (tp + fp)
    r = tp / (tp + fn)
    f = (2 * p * r) / (p + r)

    precisions[label] = p
    recalls[label] = r
    fscores[label] = f


print(f"Precision per Label:")
print('\n'.join(['\t' + f'{label:<14}: {value:.2f}' for label, value in precisions.items()]))
print()

print(f"Recall per Label:")
print('\n'.join(['\t' + f'{label:<14}: {value:.2f}' for label, value in recalls.items()]))
print()

print(f"F-Score per Label:")
print('\n'.join(['\t' + f'{label:<14}: {value:.2f}' for label, value in fscores.items()]))
print()

macro_precision = sum(precisions.values()) / len(precisions)
macro_recall = sum(recalls.values()) / len(recalls)
macro_fscore = sum(fscores.values()) / len(fscores)


print(f"""
Macro-Precision: {macro_precision:.2f}
Macro-Recall   : {macro_recall:.2f}
Macro-FScore   : {macro_fscore:.2f}
""")

Precision per Label:
	not related   : 0.45
	neutral       : 0.73
	acknowledged  : 0.47
	opposed       : 0.44

Recall per Label:
	not related   : 0.58
	neutral       : 0.64
	acknowledged  : 0.48
	opposed       : 0.40

F-Score per Label:
	not related   : 0.51
	neutral       : 0.68
	acknowledged  : 0.47
	opposed       : 0.42


Macro-Precision: 0.52
Macro-Recall   : 0.53
Macro-FScore   : 0.52



Testing

In [429]:
"""
example = td.text[7]
print(f"Input : '{example}'")
print(f"Tokens: {tokenization(example)}")
"""

'\nexample = td.text[7]\nprint(f"Input : \'{example}\'")\nprint(f"Tokens: {tokenization(example)}")\n'

In [430]:
"""
example = "#COVID19 why are those that work closely with COVID patients (a highly contagious vaccine) a priority if they had been doing work since the beginning? Wouldn’t it be better to use for those with higher health risks? Isn’t the current system they are using be working?"
tokens = tokenization(example)
features = extract_features(tokens)
output = nb.classify(features)

print(f"The classifier predicts: {output}")
"""

'\nexample = "#COVID19 why are those that work closely with COVID patients (a highly contagious vaccine) a priority if they had been doing work since the beginning? Wouldn’t it be better to use for those with higher health risks? Isn’t the current system they are using be working?"\ntokens = tokenization(example)\nfeatures = extract_features(tokens)\noutput = nb.classify(features)\n\nprint(f"The classifier predicts: {output}")\n'