In [1]:
import pandas as pd
import numpy as np
import data_preparation as prep
import baseline_models as bas
#star import: presumably the source of the unqualified helpers used below (read_covid, read_non_covid, read_reference, count_labels) - confirm against helper_functions
from helper_functions import *
import warnings
#NOTE(review): this silences every warning for the whole notebook, including pandas' chained-assignment warnings
warnings.filterwarnings("ignore")

In [2]:
#set random seed and load data
#NOTE(review): seed is not referenced by any other cell visible in this notebook - confirm that the train/val/test split actually uses it
seed = 1337
pop_covid = read_covid()
pop_non_covid = read_non_covid()
#rename the fifth column by assigning a fresh column list; writing into columns.values mutates the
#Index buffer behind pandas' back and can leave label lookups such as df['ID_Posting'] stale
non_covid_columns = pop_non_covid.columns.tolist()
non_covid_columns[4] = 'ID_Posting'
pop_non_covid.columns = non_covid_columns
pop_reference = read_reference()
reference_columns = pop_reference.columns.tolist()
reference_columns[4] = 'ID_Posting'
pop_reference.columns = reference_columns
#remove repetitive comments found in the sampling process
#(score cut-offs differ per population: < 7 for the reference set, < 9 for the non-covid set)
pop_reference = pop_reference.loc[pop_reference['Score']<7]
pop_non_covid = pop_non_covid.loc[pop_non_covid['Score']<9]

In [3]:
#load the hand-labelled gold sample; the file has no header row and its first column is used as the index
gold_sample = pd.read_csv('../data/gold/gold_sample.csv', header=None, index_col=0)
gold_sample.columns = ['ID_Posting', 'Comment', 'Label']

#keep only the rows of each population whose posting id occurs in the gold sample
gold_ids = list(gold_sample['ID_Posting'])
dict_gold_covid = pop_covid.loc[pop_covid['ID_Posting'].isin(gold_ids)]
dict_gold_non_covid = pop_non_covid.loc[pop_non_covid['ID_Posting'].isin(gold_ids)]
dict_gold_reference = pop_reference.loc[pop_reference['ID_Posting'].isin(gold_ids)]

#stack the three filtered populations into one frame
dict_gold_full = pd.concat([dict_gold_covid, dict_gold_non_covid, dict_gold_reference])

In [4]:
#split the gold_sample into train, validation and test sets and save them for later use
train800, val200, test200 =  prep.strat_train_val_test(gold_sample)
#report the label counts of every split; the validation split used to be skipped because the test split was counted twice
count_labels(train800)
count_labels(val200)
count_labels(test200)

#persist the splits without the index column so later notebooks can reload them
train800.to_csv('../data/gold/train800.csv', index = False)
val200.to_csv('../data/gold/val200.csv', index = False)
test200.to_csv('../data/gold/test200.csv', index = False)

800
200
There are 198 populist comments.
There are 49 populist comments.
There are 49 populist comments.
There are 149 populist comments.
There are 148 populist comments.


## Hawkins and Castanho Silva 2016

### Preprocessing

In [17]:
#create the document-term matrix
#train and test comments are preprocessed together so both halves share one matrix,
#which is then cut back into its train and test rows by position
process800 = (
    pd.concat([train800, test200])
    .reset_index()
    .assign(Processed=lambda frame: prep.preprocess_hawkins(frame['Comment']))
    .loc[:, ['ID_Posting', 'Comment', 'Label', 'Processed']]
)
matrix800 = prep.doc_term_matrix(process800['Processed'])
#the first len(train800) rows belong to the training split, the remainder to the test split
train_matrix800, test_matrix800 = matrix800[:len(train800)], matrix800[len(train800):]

In [None]:
#search for the optimal parameter setting of the elastic net model
score800, params800 = bas.param_tuning_elastic(
    train_matrix800,
    train800['Label'],
    np.logspace(-5, 2, num=8),  #eight log-spaced values from 1e-5 to 1e2; presumably the alpha grid - confirm
    [0.2, 0.4, 0.6, 0.8],  #presumably the l1_ratio grid - confirm
    5,  #presumably the number of cross-validation folds - confirm
)
print(score800)

In [22]:
#build the elastic net with the tuned alpha / l1_ratio and fit it on the training rows in one step
best_model800 = bas.fit_model(
    bas.elastic_net(params800['alpha'], params800['l1_ratio']),
    train_matrix800,
    train800['Label'],
)

In [24]:
#predict on the test part of the document-term matrix and compare against the gold test labels
pred800 = bas.predict(best_model800, test_matrix800)
bas.eval_metrics(test200['Label'],pred800)

The model reaches a recall of:0.3877551020408163
The model reaches a precision of:0.5428571428571428
The model reaches a F1-Score of:0.4523809523809524
The model reaches an accuracy of:0.77


## Gründl 2020

In [25]:
#create two data sets with populism labels; one where a single dictionary term leads to a populism label and one where two terms are needed
#assign through one .loc[mask, column] call: the chained form df['Score'][mask] = ... writes via an
#intermediate Series, is not guaranteed to reach the frame and is a no-op under pandas copy-on-write
dict_one = dict_gold_full.copy()
dict_one.loc[dict_one['Score'] > 0, 'Score'] = 1
dict_two = dict_gold_full.copy()
#order matters: first zero out everything below two hits, then collapse the remaining scores to 1
dict_two.loc[dict_two['Score'] < 2, 'Score'] = 0
dict_two.loc[dict_two['Score'] > 1, 'Score'] = 1

In [26]:
#attach the binarised dictionary scores to the test comments via their posting id (inner join)
gruendl_eval_one200 = pd.merge(test200, dict_one, how='inner', on='ID_Posting')
gruendl_eval_two200 = pd.merge(test200, dict_two, how='inner', on='ID_Posting')

In [None]:
#evaluate the gruendl dictionary with a score of one
#gold labels come from the merged test200 rows, predictions from the binarised 'Score' column of dict_one
bas.eval_metrics(gruendl_eval_one200['Label'],gruendl_eval_one200['Score'])

The model reaches a recall of:0.88
The model reaches a precision of:0.44
The model reaches a F1-Score of:0.5866666666666667
The model reaches an accuracy of:0.69


In [49]:
#number of comments flagged as populist under the one-term and the two-term rule, one value per line
print(
    sum(gruendl_eval_one200['Score']),
    sum(gruendl_eval_two200['Score']),
    sep='\n',
)

109
107


In [None]:
#evaluate the gruendl dictionary with a score of two
#gold labels come from the merged test200 rows, predictions from the binarised 'Score' column of dict_two
bas.eval_metrics(gruendl_eval_two200['Label'],gruendl_eval_two200['Score'])

The model reaches a recall of:0.88
The model reaches a precision of:0.4489795918367347
The model reaches a F1-Score of:0.5945945945945946
The model reaches an accuracy of:0.7


## Rooduijn and Pauwels 2011

In [46]:
#use the german version of the  RP dictionary to create populism labels with a dictionary score of two
rood_dict_ger = ['elit', 'konsens', 'undemokratisch', 'referend', 'korrupt', 'propagand', 'politiker', 'täusch', 'betrüg', 'betrug', 'verrat', 'scham', 'schäm', 'skandal', 'wahrheit', 'unfair', 'unehrlich', 'establishm', 'herrsch', 'lüge']
def check_dict2(df, terms=None, min_count=2):
    """Label comments as populist when they contain enough dictionary hits.

    Every comment is lower-cased and the non-overlapping substring
    occurrences of each dictionary term are summed up.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Comment'. It is not modified.
    terms : iterable of str, optional
        Lower-case dictionary stems to search for. Defaults to ``rood_dict_ger``.
    min_count : int, optional
        Minimum number of hits required for the label 1. Defaults to 2.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an integer column 'Count' (number of hits) and
        an integer column 'Label' (1 if Count >= min_count, else 0). An
        existing 'Label' column is overwritten in the copy.
    """
    if terms is None:
        terms = rood_dict_ger
    dict_score = df.copy()
    #count positionally instead of writing through .at[index, ...], which misbehaves on duplicate index labels
    dict_score['Count'] = [
        sum(comment.lower().count(term) for term in terms)
        for comment in dict_score['Comment']
    ]
    #one whole-column assignment replaces the chained dict_score['Label'].loc[...] = ... writes,
    #which are not guaranteed to reach the frame and are a no-op under pandas copy-on-write
    dict_score['Label'] = (dict_score['Count'] >= min_count).astype(int)
    return dict_score
#score the test comments with the two-hit rule; check_dict2 works on a copy, so test200 keeps its gold labels
rood_score_two200 = check_dict2(test200)

In [48]:
#evaluation for the dictionary score of two
#gold labels are taken from test200, predictions from the 'Label' column overwritten by check_dict2 (same row order as test200)
bas.eval_metrics(test200['Label'], rood_score_two200['Label'].astype(int))

The model reaches a recall of:0.2653061224489796
The model reaches a precision of:0.5416666666666666
The model reaches a F1-Score of:0.35616438356164387
The model reaches an accuracy of:0.765


In [47]:
#create labels for a dictionary score of one
#NOTE(review): bas.check_dict is defined in baseline_models; its term list and threshold are not visible here - confirm it uses the same german dictionary
rood_score_ger200 = bas.check_dict(test200)
#gold labels are taken from test200, predictions from the 'Label' column of the scored frame
bas.eval_metrics(test200['Label'], rood_score_ger200['Label'].astype(int))

The model reaches a recall of:0.5102040816326531
The model reaches a precision of:0.5319148936170213
The model reaches a F1-Score of:0.5208333333333334
The model reaches an accuracy of:0.77
