##### Pass 5 objective: Perform CGT on the remaining unclassified SMRs, use the GTxM Classifier, HO1 and HO2 to code new GTD

In [1]:
import re
import numpy as np
import pandas as pd
from krippendorff_alpha import *

### CGT Step 1: Pattern Detection

In [2]:
# Import libraries
from pprint import pprint
# Gensim for topic modeling functions
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# libraries to tokenize, clean up and calculate word counts
import nltk
from nltk.corpus import words
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')
nltk.download('omw-1.4')
wordlist = nltk.corpus.words.words()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
wordlist = [stemmer.stem(lemmatizer.lemmatize(word)) for word in wordlist]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [31]:
# Import Dataset
df = pd.read_csv('data/MasterTokens_10to260SupTw_Not_GTr.csv', dtype='str')
df_GTD_Rec = pd.read_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_UpTodate.csv', dtype='str')
df_Reject_Rec = pd.read_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_UpTodate.csv', dtype='str')
len(df), len(df_GTD_Rec), len(df_Reject_Rec)

(1512, 1354, 350)

In [32]:
# get the CGT RecIDs by ensuring GTD and Rejected are not included
df_CGT_Rec = pd.merge(df['RecID'], df_GTD_Rec, on='RecID', how='left')
len(df_CGT_Rec)

1512

In [33]:
df_CGT_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1057291398880391170,,
1,1124056098925944832,Entertainment,2.0


In [34]:
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Target.isna()]
len(df_CGT_Rec)

972

In [35]:
# remove Rejected
df_CGT_Rec_NoReject = pd.merge(df_CGT_Rec, df_Reject_Rec, on='RecID', how="outer", indicator=True
              ).query('_merge=="left_only"')

In [36]:
len(df_CGT_Rec_NoReject)

681

In [9]:
df_CGT_Rec_NoReject.head(2)

Unnamed: 0,RecID,Label,Target,_merge
0,1057291398880391170,,,left_only
2,1165822705037217792,,,left_only


In [10]:
df_CGT = pd.merge(df, df_CGT_Rec_NoReject['RecID'], on='RecID', how='inner')

In [11]:
len(df_CGT)

681

In [12]:
df_CGT.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,155,1057291398880391170,Could fireworks be restricted at Scottish homes?,This is the effect fireworks can have on a dog...,45.0,banfireworks fireworks,neilmackay gamesshed didriksoderlind bbcradios...,juli timmi uk last night daili bob marley ub j...,effect firework dog juli hors goat lot firewor...,stand comfort reli built hear held purchas des...,outsid long care fairli ahead exactli seemingl...,gener licens wide gener last daili big loud so...,This is the effect fireworks can have on a dog...,This is the effect fireworks can have on a dog...
1,374,1165822705037217792,Cars Are Death Machines. Self-Driving Tech Won...,"Please RT if you, or someone you know, has bee...",17.0,,aarieff realdonaldtrump ttmitch,yard hoboken nj washington yanke stadium long ...,car experi mobil panel daughter car yard aspha...,hit hit thrown end broken land save pass place...,badli recent straight right nearli nearli slow...,littl upcom catastroph oncom danger high small...,"Please RT if you, or someone you know, has bee...","Please RT if you, or someone you know, has bee..."


In [13]:
ExpName = "Pass5_CGTNounAdv"
df_CGT_corpus = df_CGT['smrNouns'].fillna(value='') + df_CGT['smrAdverbs'].fillna(value='')
data = df_CGT_corpus.str.split()
data_words = data.values.tolist()
print('Token list created successfully.')

Token list created successfully.


In [14]:
df_CGT_docs = df_CGT['RecID']
id2word = corpora.Dictionary(data_words)
print('Word dictionary created successfully.')

Word dictionary created successfully.


In [15]:
# Term Document Frequency
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]
print('Term-Doc-Frequency created successfully.')

Term-Doc-Frequency created successfully.


In [16]:
len(corpus)

681

In [17]:
#Build LDA model
print('Building LDA model...')
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,  
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keyword in the 12 topics
print('LDA model created successfully.')
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

Building LDA model...
LDA model created successfully.
[(0,
  '0.033*"peopl" + 0.013*"trump" + 0.013*"tax" + 0.011*"coffe" + 0.011*"berni" '
  '+ 0.010*"space" + 0.008*"money" + 0.007*"campaign" + 0.006*"job" + '
  '0.006*"dongcot"'),
 (1,
  '0.011*"fy" + 0.010*"book" + 0.007*"peopl" + 0.006*"guy" + 0.006*"work" + '
  '0.006*"dog" + 0.005*"christma" + 0.005*"igcom" + 0.005*"anim" + '
  '0.005*"itgco"'),
 (2,
  '0.045*"china" + 0.021*"sport" + 0.020*"shoot" + 0.014*"deadspin" + '
  '0.014*"chines" + 0.010*"hong" + 0.009*"wuhan" + 0.009*"kong" + 0.008*"nba" '
  '+ 0.008*"freedom"'),
 (3,
  '0.012*"liverpool" + 0.011*"bridg" + 0.010*"youth" + 0.010*"pun" + '
  '0.009*"latinx" + 0.008*"toast" + 0.007*"vendor" + 0.006*"champion" + '
  '0.006*"batman" + 0.005*"yahoo"'),
 (4,
  '0.004*"bellami" + 0.001*"faeogi" + 0.000*"humpback" + 0.000*"fkin" + '
  '0.000*"sadcertainli" + 0.000*"motionless" + 0.000*"whale" + '
  '0.000*"greenhith" + 0.000*"hike" + 0.000*"hitch"'),
 (5,
  '0.021*"peopl" + 0.0

In [27]:
print('Generating performance scores for '+ExpName)
# Compute Perplexity
perplex_lda = lda_model.log_perplexity(corpus)
print('\nPerplexity: {:.2f}'.format(perplex_lda))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: {:.2f}'.format(coherence_lda))

Generating performance scores for Pass5_CGTNounAdv

Perplexity: -8.98

Coherence Score: 0.39


In [18]:
# Got error, had to downgrade pandas to version 1.5.3 see: https://github.com/bmabey/pyLDAvis/issues/247
# Decided to set versions for all modules, use: Py38_Modules_w_versions.ipynb
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds', sort_topics=False) #)#, mds='tsne'
pyLDAvis.save_html(vis, 'lda_tsne_'+ExpName+'.html')

  default_term_info = default_term_info.sort_values(


In [19]:
# generate doc-topics lists
doc_topic = lda_model.get_document_topics(corpus, minimum_probability=0.1)

In [20]:
i = 0
docs=[]
topics=[]
scores=[]
for doc in doc_topic:
    doc_id = df_CGT_docs.iloc[i]
    i = i+1
    #if i>5: break
    for topic_id, score in doc:
        #print(doc_id, topic_id, score)
        scores.append(score)
        topics.append(topic_id+1)
        docs.append(doc_id)

In [21]:
df_doc_topic = pd.DataFrame({'RecID': docs, 'TopicID': topics, 'TopicScore': scores})

In [22]:
df_doc_topic

Unnamed: 0,RecID,TopicID,TopicScore
0,1057291398880391170,2,0.948927
1,1165822705037217792,2,0.866018
2,1174449406172491776,2,0.936766
3,1177796393349603328,2,0.282757
4,1177796393349603328,6,0.324031
...,...,...,...
1753,1222691732912869376,2,0.270219
1754,1222691732912869376,6,0.117748
1755,1223039083112517632,1,0.386498
1756,1223039083112517632,2,0.106632


In [23]:
df_doc_topic.groupby(by='TopicID').count()

Unnamed: 0_level_0,RecID,TopicScore
TopicID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,260,260
2,552,552
3,30,30
4,7,7
5,1,1
6,557,557
7,4,4
8,33,33
9,272,272
10,42,42


In [24]:
df_doc_topic.to_csv('data/GTxM_Pass5/CGT/lda_doc_topic_all_'+ExpName+'.csv')

In [25]:
df_doc_topic_nodup = df_doc_topic.sort_values(['TopicScore'], ascending=(False)).drop_duplicates(['RecID'])
len(df_doc_topic_nodup)

681

In [26]:
df_doc_topic_nodup.to_csv('data/GTxM_Pass5/CGT/lda_doc_dominant_topic_'+ExpName+'.csv', index=False)

In [27]:
topics_ndarray = df_doc_topic_nodup.TopicID.unique()
topic_list =sorted(topics_ndarray)
listTopicID = list(set(df_doc_topic_nodup['TopicID'].tolist())) # get the unique list of Topic IDs
listTopicID

[1, 2, 3, 4, 6, 8, 9, 10]

In [28]:
# Select top 20%, upto 20 items max of each Topic
df_doc_topic_top20pcent = pd.DataFrame(columns=['RecID', 'TopicID', 'TopicScore'])
i=0
for i in range(len(listTopicID)):
    df_temp = df_doc_topic_nodup[(df_doc_topic_nodup.TopicID == listTopicID[i])]
    topic_items = len(df_temp)
    #if len(df_temp) > 9:
    topic20pc_items = round(len(df_temp)/5)
    if topic20pc_items > 20: #max 20 items
        topic20pc_items = 20
    df_temp = df_temp.head(topic20pc_items)
    df_doc_topic_top20pcent = pd.concat([df_doc_topic_top20pcent, df_temp])
    print(i, listTopicID[i], topic_items, topic20pc_items)

0 1 75 15
1 2 265 20
2 3 4 1
3 4 2 0
4 6 257 20
5 8 7 1
6 9 51 10
7 10 20 4


In [29]:
df_doc_topic_top20pcent

Unnamed: 0,RecID,TopicID,TopicScore
481,1189651968979025920,1,0.894049
153,1182030910671204353,1,0.828340
31,1179740913771405313,1,0.779122
740,1196553520201248768,1,0.734689
287,1184269805219733504,1,0.724936
...,...,...,...
785,1197994951771951105,9,0.703444
1275,1213840375380447233,10,0.764998
245,1183637409608552448,10,0.661296
818,1198921926371995648,10,0.655669


In [30]:
df_doc_topic_top20pcent.to_csv('data/GTxM_Pass5/CGT/lda_doc_topic_top20pct_'+ExpName+'.csv', index=False)

In [31]:
df_doc_topic_top20pcent.groupby(by='TopicID').count()

Unnamed: 0_level_0,RecID,TopicScore
TopicID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,15,15
2,20,20
3,1,1
6,20,20
8,1,1
9,10,10
10,4,4


### CGT Step 2: Pattern Refinement

In [4]:
# Load Pass 5 JatoClassified after deep reading
df_lda_dread = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoDR_Pass5/genTopics/lda_doc_topic_top20pct_Pass5_CGTNounAdv.csv', dtype='str')
df_jato_dread = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoDR_Pass5/data/JatoClassified_Pass5DR.csv', dtype='str')
len(df_lda_dread), len(df_jato_dread)

(71, 2385)

In [5]:
df_dread = pd.merge(df_lda_dread,df_jato_dread, on='RecID')

In [6]:
df_dread.head(2)

Unnamed: 0,RecID,TopicID,TopicScore,rowid,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,1189651968979025920,1,0.89404905,1546,2023:07:20 12:21:31,Political,Debate,,,,,Mix of Business and Politics,,Unknown
1,1182030910671204353,1,0.8283404,835,2023:07:20 11:58:37,Business,Complaint,,,,,Mix of Politics and Business,,Unknown


In [7]:
df_dread.drop(['TopicID','TopicScore','rowid','SavedDataTime','JatoCat','JatoSubCat','JatoCat2','JatoSubCat2','JatoCat3','JatoSubCat3','NewRecClass','GTCodes'], axis=1, inplace=True)
len(df_dread)

71

In [8]:
df_dread.groupby(['NewsPubCat']).size()

NewsPubCat
Entertainment      3
Environmental      4
Health             2
Human Rights       4
Law and Order      3
Obituary           3
Social Stories     7
Sports             2
Travel             2
UK Politics        8
USA Politics       6
Unknown           21
World Politics     6
dtype: int64

In [9]:
df_dread.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [10]:
df_dread.head(2)

Unnamed: 0,RecID,Label
0,1189651968979025920,Unknown
1,1182030910671204353,Unknown


In [11]:
# Use the labels list from Pass 4 since there is no change in Pass 5
Labels_Targets = pd.read_csv('data/GTxM_Pass4/Labels_Targets_CGTPass4.csv')

In [12]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [13]:
df_dread_Target = pd.merge(df_dread,Labels_Targets, on='Label', how='left')

In [14]:
df_dread_Target.head(2)

Unnamed: 0,RecID,Label,Target
0,1189651968979025920,Unknown,0
1,1182030910671204353,Unknown,0


In [15]:
len(df_dread_Target)

71

In [33]:
# In pass 4, we removed the 7 "Unknown" labels at this stage, since we didn't want to generate unknown labels with SBERT.
# HOWEVER, in Pass 5, there are 21 "Unknown" labels i.e ~ 30% of the deep reading.
# It is best to send them to SBERT and decide what to do with the results later.

# df_dread_Target = df_dread_Target[df_dread_Target.Label!='Unknown']
# len(df_dread_Target)

85

In [34]:
# NOT NECESSARY -- work it out in the SBERT PROGRAM.
# df_recTweets = pd.read_csv('data/MasterTokens.csv', usecols=['RecID','RecDoc'], dtype='str')

In [16]:
# save to GTxM Pass 4 folder and upload to Google Colab or Kaggle
df_dread_Target.to_csv('data/GTxM_Pass5/DeepRead_Pass5.csv', index=False)

### CGT Step 3: Pattern Confirmation (See Kaggle Notebook for code with SBERT)

#### Build Confirm_InReplyTo_Labeled

In [2]:
df_CoreCGT = pd.read_csv('data/CGTexpandedSMR_Data.csv', usecols=['TID','InReplyTo'], dtype=str)

In [3]:
df_CoreGTr = pd.read_csv('data/GroundTruthBERT.csv', usecols=['TID','InReplyTo'], dtype=str)

In [4]:
df_Corpus = pd.concat([df_CoreCGT,df_CoreGTr])
len(df_Corpus)

322199

In [5]:
df_Corpus.head()

Unnamed: 0,TID,InReplyTo
0,222818213392678912,
1,1207942378688040961,2.2281821339267888e+17
2,1207982506152222720,2.2281821339267888e+17
3,1207883227601494016,2.2281821339267888e+17
4,1207766213255991299,2.2281821339267888e+17


In [6]:
df_sbert_pass5 = pd.read_csv('data/GTxM_Pass5/CGTsbert_kaggle_pass5.csv', dtype=str)
len(df_sbert_pass5)

175046

In [7]:
df_sbert_pass5.head()

Unnamed: 0.1,Unnamed: 0,RID,QID,cQID,TID,Tweet,Score
0,0,317,1189651968979025920,,1189651968979025920,"It might be a nice troll of FB, but @jack's de...",1.0
1,1,14736,1189651968979025920,,1183254726294560768,Are you dumb? FB revenues would not even take ...,0.7048031687736511
2,2,11907,1189651968979025920,,1182042909396201472,@DeschutesDems So we can not believe any of @f...,0.7016364336013794
3,3,24321,1189651968979025920,,1186391700404494336,Ppl need to get over their illusions about FB....,0.6893789172172546
4,4,129096,1189651968979025920,,1189646292491784192,"Wow! Integrity for a change. Thank you! Now, h...",0.6864067912101746


In [8]:
df_dread_label = pd.read_csv('data/GTxM_Pass5/CGT_DeepRead_Pass5.csv', usecols=['RecID','Label','Target'], dtype=str)
len(df_dread_label)

71

In [9]:
df_dread_label.head()

Unnamed: 0,RecID,Label,Target
0,1189651968979025920,Unknown,0
1,1182030910671204353,Unknown,0
2,1179740913771405313,Unknown,0
3,1196553520201248768,Health,4
4,1184269805219733504,USA Politics,15


In [10]:
# rename RecID to QID for merging
df_dread_label.rename(columns={'RecID': 'QID'}, inplace=True)

In [11]:
df_sbert_labels = pd.merge(df_sbert_pass5, df_dread_label, on='QID', how='left')

In [12]:
df_sbert_labels.head()

Unnamed: 0.1,Unnamed: 0,RID,QID,cQID,TID,Tweet,Score,Label,Target
0,0,317,1189651968979025920,,1189651968979025920,"It might be a nice troll of FB, but @jack's de...",1.0,Unknown,0
1,1,14736,1189651968979025920,,1183254726294560768,Are you dumb? FB revenues would not even take ...,0.7048031687736511,Unknown,0
2,2,11907,1189651968979025920,,1182042909396201472,@DeschutesDems So we can not believe any of @f...,0.7016364336013794,Unknown,0
3,3,24321,1189651968979025920,,1186391700404494336,Ppl need to get over their illusions about FB....,0.6893789172172546,Unknown,0
4,4,129096,1189651968979025920,,1189646292491784192,"Wow! Integrity for a change. Thank you! Now, h...",0.6864067912101746,Unknown,0


In [13]:
df_sbert_labels_inreplyto = pd.merge(df_sbert_labels, df_Corpus, on='TID', how='left')
len(df_sbert_labels_inreplyto)

175105

In [14]:
df_sbert_labels_inreplyto.head(2)

Unnamed: 0.1,Unnamed: 0,RID,QID,cQID,TID,Tweet,Score,Label,Target,InReplyTo
0,0,317,1189651968979025920,,1189651968979025920,"It might be a nice troll of FB, but @jack's de...",1.0,Unknown,0,
1,1,14736,1189651968979025920,,1183254726294560768,Are you dumb? FB revenues would not even take ...,0.7048031687736511,Unknown,0,1.1830198808676803e+18


In [15]:
# Remove the headings imported with the SBERT csv
df_sbert_labels_inreplyto = df_sbert_labels_inreplyto[df_sbert_labels_inreplyto.RID !='RID']

In [16]:
# Remove InReplyTo IDs that are in GTxM_Pass4_Reject_UpTodate, GTxM_Pass4_GTD_UpTodate and CGT_DeepRead_Pass5
# Start with GTxM_Pass4_Reject_UpTodate
df_pass4_reject = pd.read_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_UpTodate.csv', dtype=str)
df_pass4_reject.rename(columns={'RecID': 'InReplyTo'}, inplace=True)
len(df_pass4_reject)

350

In [17]:
df_sbert_labels_inreplyto1 = pd.merge(df_sbert_labels_inreplyto,df_pass4_reject, on='InReplyTo', how='outer', indicator=True)
df_sbert_labels_inreplyto1 = df_sbert_labels_inreplyto1[df_sbert_labels_inreplyto1['_merge']=='left_only']
len(df_sbert_labels_inreplyto1)

175040

In [18]:
# Remove InReplyTo IDs that are GTxM_Pass4_GTD_UpTodate
df_pass4_gtd = pd.read_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_UpTodate.csv', usecols=['RecID'], dtype=str)
df_pass4_gtd.rename(columns={'RecID': 'InReplyTo'}, inplace=True)
len(df_pass4_gtd)

1354

In [19]:
# cleanup the _merge column before new filter
df_sbert_labels_inreplyto1.drop(['_merge'], axis=1, inplace=True)
df_sbert_labels_inreplyto2 = pd.merge(df_sbert_labels_inreplyto1,df_pass4_gtd, on='InReplyTo', how='outer', indicator=True)
df_sbert_labels_inreplyto2 = df_sbert_labels_inreplyto2[df_sbert_labels_inreplyto2['_merge']=='left_only']
len(df_sbert_labels_inreplyto2)

36742

In [20]:
# rename Deep Read QID in InReplyTo before merging
df_dread_label.rename(columns={'QID': 'InReplyTo'}, inplace=True)

In [21]:
# cleanup the _merge column
df_sbert_labels_inreplyto2.drop(['_merge'], axis=1, inplace=True)
df_sbert_labels_inreplyto_final = pd.merge(df_sbert_labels_inreplyto2,df_dread_label['InReplyTo'], on='InReplyTo', how='outer', indicator=True)
df_sbert_labels_inreplyto_final = df_sbert_labels_inreplyto_final[df_sbert_labels_inreplyto_final['_merge']=='left_only']
len(df_sbert_labels_inreplyto_final)

30146

In [22]:
df_sbert_labels_inreplyto_final.to_csv('df_sbert_labels_inreplyto_final.csv')

#### Filter based on Scores

In [23]:
df_sbert_labels_inreplyto_final['Score'] = df_sbert_labels_inreplyto_final['Score'].astype(float)

In [24]:
# df_sbert_labels_inreplyto_less = df_sbert_labels_inreplyto_final.query('Score < 0')
# len(df_sbert_labels_inreplyto_less)

In [25]:
df_sbert_labels_inreplyto_more = df_sbert_labels_inreplyto_final.query('Score >= 0.3')
len(df_sbert_labels_inreplyto_more)

29631

In [26]:
df_sbert_confirm = pd.DataFrame(df_sbert_labels_inreplyto_more.groupby(['InReplyTo','Label','Target']).size())

In [27]:
df_sbert_confirm.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
InReplyTo,Label,Target,Unnamed: 3_level_1
1177796393349603328,Law and Order,7,1
1177796393349603328,World Politics,12,7
1178766691272216578,Health,4,1
1178766691272216578,Law and Order,7,1
1178766691272216578,UK Politics,14,13


In [28]:
# temporarily dump the dataframe to csv, to flatten the structure (I couldn't find a better way, for now)
df_sbert_confirm.to_csv('temp.csv')

In [29]:
df_sbert_confirm = pd.read_csv('temp.csv', dtype='str')
df_sbert_confirm.head()

Unnamed: 0,InReplyTo,Label,Target,0
0,1177796393349603328,Law and Order,7,1
1,1177796393349603328,World Politics,12,7
2,1178766691272216578,Health,4,1
3,1178766691272216578,Law and Order,7,1
4,1178766691272216578,UK Politics,14,13


In [30]:
df_sbert_confirm.rename(columns={'0': 'QueryCount'}, inplace=True)
len(df_sbert_confirm)

3204

In [31]:
df_sbert_confirm['QueryCount'] = df_sbert_confirm['QueryCount'].astype(int)

In [32]:
df_sbert_confirm = df_sbert_confirm[df_sbert_confirm.QueryCount>9]
len(df_sbert_confirm)

503

In [33]:
df_sbert_confirm = df_sbert_confirm.sort_values(['InReplyTo','QueryCount'], ascending=[False, True])

In [34]:
# df_sbert_unknown = df_sbert_confirm[df_sbert_confirm['Label']=='Unknown']
# len(df_sbert_unknown)

In [35]:
# df_sbert_confirm_final = df_sbert_confirm[df_sbert_confirm['Label']!='Unknown']
# len(df_sbert_confirm_final)

In [36]:
df_sbert_confirm = df_sbert_confirm.drop_duplicates(['InReplyTo'])
len(df_sbert_confirm)

247

In [37]:
df_sbert_confirm.head()

Unnamed: 0,InReplyTo,Label,Target,QueryCount
3202,1223039083112517632,Unknown,0,14
3190,1222603409284567042,Unknown,0,10
3183,1222603086558126080,UK Politics,14,22
3169,1222474209374408705,UK Politics,14,13
3146,1222243393847169024,Unknown,0,18


In [38]:
df_sbert_confirm.groupby(['Label']).size()

Label
Entertainment      6
Environmental     21
Health             2
Human Rights       9
Law and Order     17
Obituary           9
Social Stories    31
Sports             6
Travel            12
UK Politics       21
USA Politics      18
Unknown           77
World Politics    18
dtype: int64

#### Remove Unclassified Medley, save CGT Labeled Data

In [39]:
# df_sbert_unknown.loc[(df_sbert_unknown.Label == 'Unknown'), 'Label'] = 'Unclassified Medley'
# df_sbert_unknown.loc[(df_sbert_unknown.Target == 0), 'Target'] = 16

In [40]:
df_sbert_confirm.drop(['QueryCount'], axis=1, inplace=True)

In [41]:
len(df_dread_label)

71

In [42]:
df_dread_label.head(2)

Unnamed: 0,InReplyTo,Label,Target
0,1189651968979025920,Unknown,0
1,1182030910671204353,Unknown,0


In [43]:
df_cgt_labeled = pd.concat([df_sbert_confirm,df_dread_label], axis=0)
len(df_cgt_labeled)

318

In [44]:
df_cgt_labeled.rename(columns={'InReplyTo': 'RecID'}, inplace=True)

In [45]:
# rename "Unknown" to "Unclassified Medley"
df_cgt_labeled.loc[(df_cgt_labeled.Label == 'Unknown'), 'Label'] = 'Unclassified Medley'
df_cgt_labeled.loc[(df_cgt_labeled.Target == 0), 'Target'] = 16

In [46]:
df_unclass_medley = df_cgt_labeled[df_cgt_labeled['Label']=='Unclassified Medley']
len(df_unclass_medley)

98

In [47]:
df_cgt_labeled_final = df_cgt_labeled[df_cgt_labeled['Label']!='Unclassified Medley']
len(df_cgt_labeled_final)

220

In [48]:
#Change USA and UK politics to Politics
df_cgt_labeled_final.loc[(df_cgt_labeled_final.Label == 'USA Politics'), 'Label'] = 'Politics'
df_cgt_labeled_final.loc[(df_cgt_labeled_final.Label == 'UK Politics'), 'Label'] = 'Politics'
df_cgt_labeled_final.loc[(df_cgt_labeled_final.Target == '14'), 'Target'] = '6'
df_cgt_labeled_final.loc[(df_cgt_labeled_final.Target == '15'), 'Target'] = '6'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cgt_labeled_final.loc[(df_cgt_labeled_final.Label == 'USA Politics'), 'Label'] = 'Politics'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cgt_labeled_final.loc[(df_cgt_labeled_final.Label == 'UK Politics'), 'Label'] = 'Politics'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cgt_labeled_final.loc[(df_cgt_labeled_final.Target == '14'), 'Target'] = '6'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: ht

In [49]:
df_cgt_labeled_final.groupby(['Label']).size()

Label
Entertainment      9
Environmental     25
Health             4
Human Rights      13
Law and Order     20
Obituary          12
Politics          53
Social Stories    38
Sports             8
Travel            14
World Politics    24
dtype: int64

In [50]:
df_unclass_medley.to_csv('data/GTxM_Pass5/CGT_Pass5_Unclassified_Medley.csv', index=False)

In [51]:
# save data
df_cgt_labeled_final.to_csv('data/GTxM_Pass5/GTxM_CGT_Labeled_Pass5.csv', index=False)

In [52]:
# save base Intercoder Prediction
df_cgt_labeled_final.to_csv('results/GTxM_Pass5/GTxM_Intercoder_Pred_Pass5.csv', index=False)

### CCL Sub-Framework

### Cleanup for Jato

In [169]:
# For use with Jato

def removeSpChar4Jato(text):
  #old text = re.sub(r'\W', ' ', text) #replace ALL non-word characters, including emojis with space
  # remove all non ASCII characters 
  text.replace('\n', ' ')
  text.replace('\r', ' ')
  # credit: https://stackoverflow.com/questions/2758921/regular-expression-that-finds-and-replaces-non-ascii-characters-with-python
  text = re.sub(r"[\u0080-\uFFFF]", " ", text) #see ASCII list: https://www.asciitable.com/
  text = " ".join(text.split()) # replace multiple spaces with a single space
  return(text)

def cleanupText4Jato(text):
  #replace this strange chars in the text with space
  text = text.replace("GCO"," ")
  text = text.replace("GY=n+"," ")
  text = text.replace("GCY"," ")
  text = text.replace("fcafc+"," ")
  text = text.replace("fAE+"," ")
  text = text.replace("#fAE"," ")
  text = text.replace("fnae"," ")
  text = text.replace("fye"," ")

  #Replace common abbreviations and slangs
  text = text.replace(" i m "," i am ")
  text = text.replace(" i ve "," i have ")
  text = text.replace(" i ll "," i will ")
  text = text.replace(" i d "," i had ")
  text = text.replace(" that s "," that is ")
  text = text.replace(" isn t "," is not ")
  text = text.replace(" it s "," it is ")
  text = text.replace(" she s "," she is ")
  text = text.replace(" he s "," he is ")
  text = text.replace(" u "," you ")
  text = text.replace(" ur "," your ")
  text = text.replace(" b4 "," before ")
  text = text.replace(" wasnt "," was not ")
  text = text.replace(" wasn t "," was not ")
  text = text.replace(" cant "," can not ")
  text = text.replace(" can t "," can not ")
  text = text.replace(" couldnt "," could not ")
  text = text.replace(" couldn t "," could not ")
  text = text.replace(" wouldnt "," would not ")
  text = text.replace(" wouldn t "," would not ")
  text = text.replace(" dont "," do not ")
  text = text.replace(" don t "," do not ")
  text = text.replace(" didnt "," did not ")
  text = text.replace(" didn t "," did not ")
  text = text.replace(" let s "," let us ")
  text = text.replace(" i'm "," i am ")
  text = text.replace(" i've "," i have ")
  text = text.replace(" i'll "," i will ")
  text = text.replace(" i'd "," i had ")
  text = text.replace(" that's "," that is ")
  text = text.replace(" isn't "," is not ")
  text = text.replace(" it's "," it is ")
  text = text.replace(" she's "," she is ")
  text = text.replace(" he's "," he is ")
  text = text.replace(" u "," you ")
  text = text.replace(" ur "," your ")
  text = text.replace(" b4 "," before ")
  text = text.replace(" wasn't "," was not ")
  text = text.replace(" can't "," can not ")
  text = text.replace(" couldn't "," could not ")
  text = text.replace(" wouldn't "," would not ")
  text = text.replace(" don't "," do not ")
  text = text.replace(" didn't "," did not ")
  text = text.replace(" let's "," let us ")
  text = text.replace(" luv "," love ")
  text = text.replace(" true "," truth ")
  text = text.replace(" ppl "," people ")
  text = text.replace(" fb "," facebook ")
  text = text.replace(" b day "," birthday ")
  text = text.replace(" bday "," birthday ")
  if (len(text.strip())  == 0):
      text = ' ' #replace None with a single space
  return(text)


### Prepare JatoMaster for HO1

In [10]:
data = pd.read_csv('data/MasterTokens.csv', encoding='ISO-8859-1', dtype='str')
len(data)

4684

In [15]:
# Load data
df_cgt_labeled_final = pd.read_csv('data/GTxM_Pass5/GTxM_CGT_Labeled_Pass5.csv', dtype='str')
len(df_cgt_labeled_final)

220

In [16]:
df_jatoMaster_Pass5_HO1 = pd.merge(data, df_cgt_labeled_final, on='RecID', how='inner')

In [18]:
df_jatoMaster_Pass5_HO1['PubTitle'] = df_jatoMaster_Pass5_HO1['PubTitle'].apply(removeSpChar4Jato)
df_jatoMaster_Pass5_HO1['RecDoc'] = df_jatoMaster_Pass5_HO1['RecDoc'].apply(removeSpChar4Jato)
df_jatoMaster_Pass5_HO1['smrTopText'] = df_jatoMaster_Pass5_HO1['smrTopText'].apply(removeSpChar4Jato)
df_jatoMaster_Pass5_HO1['smrSummary'] = df_jatoMaster_Pass5_HO1['smrSummary'].apply(removeSpChar4Jato)

In [19]:
len(df_jatoMaster_Pass5_HO1)

220

In [20]:
df_jatoMaster_Pass5_HO1.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO1/data/JatoMaster.csv')

#### Generate JatoClassified

In [21]:
df_jato_HO1_pass0 = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/Jato/data/JatoOp1Labels.csv', usecols=['RecID','Label'], dtype='str')
len(df_jato_HO1_pass0)

2042

In [22]:
df_jato_HO1_pass0.head(2)

Unnamed: 0,RecID,Label
0,222818213392678912,Politics
1,826262311560216578,Politics


In [23]:
df_jato_Clf_blank = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO1/data/JatoClassified_BlankCat.csv', dtype='str')
len(df_jato_Clf_blank)

2385

In [24]:
df_jato_Clf_blank.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown


In [54]:
df_jato_HO1_pass5 = pd.merge(df_jato_Clf_blank, df_jato_HO1_pass0, on='RecID', how='left')
df_jato_HO1_pass5.drop(['NewsPubCat'], axis=1, inplace=True)
df_jato_HO1_pass5.rename(columns={'Label': 'NewsPubCat'}, inplace=True)
df_jato_HO1_pass5.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Politics
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Politics


In [None]:
df_jato_HO1_pass5.groupby(['NewsPubCat']).size()

NewsPubCat
Business          87
Entertainment    228
Environmental     78
Human Rights     338
Obituary         138
Politics         872
Sports           143
Stories           84
Technology        74
dtype: int64

In [None]:
# Change Stories, Technology and Blank
df_jato_HO1_pass5.loc[(df_jato_HO1_pass5.NewsPubCat == 'Stories'), 'NewsPubCat'] = 'Social Stories'
df_jato_HO1_pass5.loc[(df_jato_HO1_pass5.NewsPubCat == 'Technology'), 'NewsPubCat'] = 'Business'
df_jato_HO1_pass5.loc[(df_jato_HO1_pass5.NewsPubCat.isna()), 'NewsPubCat'] = 'Unknown'

In [None]:
df_jato_HO1_pass5.groupby(['NewsPubCat']).size()

NewsPubCat
Business          161
Entertainment     228
Environmental      78
Human Rights      338
Obituary          138
Politics          872
Social Stories     84
Sports            143
Unknown           343
dtype: int64

In [None]:
df_jato_HO1_pass5.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO1/data/JatoClassified.csv', index=False)

#### Fix problem with JatoClassified for Pass 5

In [72]:
df_jato_fix_pass5 = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO1/data/JatoClassified_HO1_Pass5.csv', dtype='str')
len(df_jato_fix_pass5)

2385

In [73]:
# Change Stories, Technology, Arts, Climate, Books, Animal Rights and Blank
df_jato_fix_pass5.loc[(df_jato_fix_pass5.NewsPubCat == 'Stories'), 'NewRecClass'] = 'Fix this'
df_jato_fix_pass5.loc[(df_jato_fix_pass5.NewsPubCat == 'Technology'), 'NewRecClass'] = 'Fix this'
df_jato_fix_pass5.loc[(df_jato_fix_pass5.NewsPubCat == 'Arts'), 'NewRecClass'] = 'Fix this'
df_jato_fix_pass5.loc[(df_jato_fix_pass5.NewsPubCat == 'Books'), 'NewRecClass'] = 'Fix this'
df_jato_fix_pass5.loc[(df_jato_fix_pass5.NewsPubCat == 'Climate'), 'NewRecClass'] = 'Fix this'
df_jato_fix_pass5.loc[(df_jato_fix_pass5.NewsPubCat == 'Animal Rights'), 'NewRecClass'] = 'Fix this'
df_jato_fix_pass5.loc[(df_jato_fix_pass5.NewsPubCat.isna()), 'NewRecClass'] = 'Fix this'

In [74]:
df_jato_fix_pass5.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6.0,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Politics
1,60.0,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Politics


In [75]:
df_jato_fix_pass5.groupby(['NewsPubCat']).size()

NewsPubCat
Animal Rights                11
Arts                          5
Books                        16
Business                     90
Celebrity                    35
Climate                      33
Economy                       3
Entertainment               235
Environmental                75
Fashion                       6
Food                         19
Health                       43
Human Rights                318
Immigration                   1
Law and Order                24
Obituary                    136
Politics                    855
Sexual Harrassment            1
Sexual Harrassment/Crime      7
Social Stories                8
Sports                      140
Stories                      71
Technology                   70
Terrorism                    37
Travel                       25
Unclassified Medley          62
Unknown                      11
Violence                     40
World Politics                8
dtype: int64

In [76]:
df_jato_fix_pass5.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO1/data/JatoClassified.csv', index=False)

### Intercoder Reliability

#### Load Predictions

In [2]:
dfPass5 = pd.read_csv("results/GTxM_Pass5/GTxM_Intercoder_Pred_Pass5.csv", dtype='str')
len(dfPass5)

220

In [3]:
dfPass5.head()

Unnamed: 0,RecID,Label,Target,XLNet_CGTPred,XLNet_CGT_NoSSPred,BERT_CGTPred,BERT_CGT_NoSSPred,T5_CGTPred,T5_CGT_NoSSPred
0,1222603086558126080,Politics,6,6,6,6,6,6,6
1,1222474209374408705,Politics,6,6,6,6,6,6,6
2,1221936812786167810,Environmental,3,10,3,3,3,3,5
3,1221377542587928576,Politics,6,6,6,6,6,6,6
4,1220823578825887745,Politics,6,-1,-1,-1,-1,-1,-1


#### Get the agreement between XLNet, BERT and T5

In [4]:
def AgreeTargetWithA3(A1Target, A2Target, A3Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (A1Target == A3Target):
        AgreedTarget = A3Target
    if (AgreedTarget == '-1') and (A2Target == A3Target):
        AgreedTarget = A3Target
    if (AgreedTarget == '-1') and (A1Target == A2Target):
        AgreedTarget = A2Target
    return AgreedTarget

##### Calculate krippendorff's alpha for the agreement between XLNet, BERT and T5

In [5]:
# dfPass5['AgreedTarget'] = dfPass5.apply(lambda x: AgreeTargetWithA3(x['T5_CGT_NoSSPred'], x['BERT_CGT_NoSSPred'], x['XLNet_CGT_NoSSPred']), axis=1)
dfPass5['AgreedTarget'] = dfPass5.apply(lambda x: AgreeTargetWithA3(x['T5_CGTPred'], x['BERT_CGTPred'], x['XLNet_CGTPred']), axis=1)

In [6]:
dfPass5_GClf = dfPass5[dfPass5.AgreedTarget != '-1']
len(dfPass5_GClf)

135

In [7]:
# convert to horizontal array as expected by Krippendorff Alpha
XLNetPred = np.stack(dfPass5_GClf['XLNet_CGTPred'].astype("string"))
BERTPred = np.stack(dfPass5_GClf['BERT_CGTPred'].astype("string"))
T5Pred = np.stack(dfPass5_GClf['T5_CGTPred'].astype("string"))
AgreedTarget = np.stack(dfPass5_GClf['AgreedTarget'].astype("string"))

In [49]:
missing = '-1'
#arr = np.array((TargetPred,AgreedTarget))
alpha1 = krippendorff_alpha(np.array((T5Pred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((BERTPred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((XLNetPred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3

(0.8904347826086957, 0.8495278708498325, 0.9078503030022668)

In [8]:
missing = '-1'
alpha = krippendorff_alpha(np.array((T5Pred,BERTPred,XLNetPred)), nominal_metric, missing_items=missing)
alpha

0.7653946794592237

#### Get Krippendorff's Alpha between HO1 and GTxM Classifier and SBERT

In [9]:
def AgreeTargetWithHO1(A1Target, A2Target, HO1Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (A1Target == HO1Target):
        AgreedTarget = HO1Target
    if (AgreedTarget == '-1') and (A2Target == HO1Target):
        AgreedTarget = HO1Target
    return AgreedTarget

In [10]:
dfPass5.head()

Unnamed: 0,RecID,Label,Target,XLNet_CGTPred,XLNet_CGT_NoSSPred,BERT_CGTPred,BERT_CGT_NoSSPred,T5_CGTPred,T5_CGT_NoSSPred,AgreedTarget
0,1222603086558126080,Politics,6,6,6,6,6,6,6,6
1,1222474209374408705,Politics,6,6,6,6,6,6,6,6
2,1221936812786167810,Environmental,3,10,3,3,3,3,5,3
3,1221377542587928576,Politics,6,6,6,6,6,6,6,6
4,1220823578825887745,Politics,6,-1,-1,-1,-1,-1,-1,-1


In [11]:
dfPass5.drop(['XLNet_CGTPred','BERT_CGTPred','T5_CGTPred','XLNet_CGT_NoSSPred','BERT_CGT_NoSSPred','T5_CGT_NoSSPred'], axis=1, inplace=True)

In [12]:
dfPass5.rename(columns={'AgreedTarget': 'GClfTarget'}, inplace=True)

In [13]:
dfPass5.head(2)

Unnamed: 0,RecID,Label,Target,GClfTarget
0,1222603086558126080,Politics,6,6
1,1222474209374408705,Politics,6,6


In [14]:
df_jato_HO1_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO1/data/JatoClassified_HO1_Pass5.csv', usecols=['RecID','NewsPubCat'], dtype='str')

In [15]:
df_jato_HO1_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Politics
1,826262311560216578,Politics


In [16]:
Labels_Targets = pd.read_csv('data/GTxM_Pass5/Labels_Targets_CGTPass5.csv', dtype='str')

In [17]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [18]:
df_jato_HO1_Labels.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [19]:
df_HO1_Labels_Targets = pd.merge(df_jato_HO1_Labels,Labels_Targets, on='Label', how='left')

In [20]:
df_HO1_Labels_Targets

Unnamed: 0,RecID,Label,Target
0,222818213392678912,Politics,6
1,826262311560216578,Politics,6
2,833502973204459520,Entertainment,2
3,835347243020451840,Human Rights,5
4,867832469181128704,Entertainment,2
...,...,...,...
2380,1223262356496699394,Entertainment,2
2381,1223301592549556224,Politics,6
2382,1223302445889150976,Politics,6
2383,1223365339494453248,Politics,6


In [21]:
df_HO1_Labels_Targets.rename(columns={'Label': 'HO1Label', 'Target': 'HO1Target'}, inplace=True)

In [22]:
df_HO1_GClf = pd.merge(dfPass5, df_HO1_Labels_Targets, on='RecID', how='left')

In [23]:
df_HO1_GClf

Unnamed: 0,RecID,Label,Target,GClfTarget,HO1Label,HO1Target
0,1222603086558126080,Politics,6,6,Health,4
1,1222474209374408705,Politics,6,6,Politics,6
2,1221936812786167810,Environmental,3,3,Unknown,0
3,1221377542587928576,Politics,6,6,Politics,6
4,1220823578825887745,Politics,6,-1,Politics,6
...,...,...,...,...,...,...
215,1192717207895072769,Environmental,3,3,Environmental,3
216,1189678712490360833,Sports,11,10,Sports,11
217,1185507039008698368,Human Rights,5,-1,Human Rights,5
218,1197994951771951105,Human Rights,5,5,Human Rights,5


In [24]:
df_HO1_GClf.rename(columns={'Target': 'SBERTTarget', 'Label': 'SBERTLabel'}, inplace=True)

In [25]:
df_HO1_GClf.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target
0,1222603086558126080,Politics,6,6,Health,4
1,1222474209374408705,Politics,6,6,Politics,6


In [67]:
df_HO1_GClf.to_csv('df_HO1_GClf.csv')

In [26]:
# set HO1Target's Unknown to -1 for Krippendroff's Alpha calculation
df_HO1_GClf.loc[(df_HO1_GClf.HO1Target == '0'), 'HO1Target'] = '-1'

In [27]:
df_HO1_GClf['HO1Target'].fillna(value='-1', inplace=True)

In [28]:
df_HO1_GClf['HO1AgreedTarget'] = df_HO1_GClf.apply(lambda x: AgreeTargetWithHO1(x['SBERTTarget'], x['GClfTarget'], x['HO1Target']), axis=1)

In [29]:
HO1Pred = np.stack(df_HO1_GClf['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO1_GClf['GClfTarget'].astype("string"))
SBERTPred = np.stack(df_HO1_GClf['SBERTTarget'].astype("string"))
HO1AgreedTarget = np.stack(df_HO1_GClf['HO1AgreedTarget'].astype("string"))

In [30]:
missing = '-1'
alpha1 = krippendorff_alpha(np.array((HO1Pred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((GClfPred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((SBERTPred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3

(0.4246432857714362, 0.38641532400140943, 0.2660869133719559)

In [33]:
missing = '-1'
alpha = krippendorff_alpha(np.array((HO1Pred,GClfPred,SBERTPred)), nominal_metric, missing_items=missing)
alpha

0.2640310821824514

In [34]:
df_HO1_GClf_GTD = df_HO1_GClf[df_HO1_GClf.HO1AgreedTarget != '-1']
len(df_HO1_GClf_GTD)

99

In [74]:
df_Pass5_HO1_GTD = df_HO1_GClf_GTD[['RecID','HO1Label','HO1Target']]

In [75]:
df_Pass5_HO1_GTD.rename(columns={'HO1Target': 'Target', 'HO1Label': 'Label'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Pass5_HO1_GTD.rename(columns={'HO1Target': 'Target', 'HO1Label': 'Label'}, inplace=True)


In [76]:
df_Pass5_HO1_GTD.head()

Unnamed: 0,RecID,Label,Target
1,1222474209374408705,Politics,6
3,1221377542587928576,Politics,6
4,1220823578825887745,Politics,6
6,1220562224185405445,Politics,6
9,1220088156625588225,Obituary,9


In [77]:
len(df_Pass5_HO1_GTD)

99

In [78]:
#df_HO2HO1_GTD = df_HO2HO1_Labels[df_HO2HO1_Labels.HO2AgreedTarget != '-1']
#len(df_HO2HO1_GTD)

In [79]:
df_HO1_To_HO2 = df_HO1_GClf[df_HO1_GClf.HO1AgreedTarget == '-1']
len(df_HO1_To_HO2)

121

In [82]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1222603086558126080,Politics,6,6,Health,4,-1
1,1221936812786167810,Environmental,3,3,Unknown,-1,-1


In [83]:
# Use the RecIDs in this file to extract JatoMaster from MasterTokens in SQL-SERVER and save into data/GTxM_Pass2/
df_HO1_To_HO2.to_csv('data/GTxM_Pass5/df_HO1_To_HO2.csv', index=False)

#### Acquire new GTD from HO1 and GClf agreement

In [81]:
df_Pass5_HO1_GTD.to_csv('data/GTxM_Pass5/GTxM_Pass5_GTD_HO1.csv', index=False)

#### Generate JatoMaster for HO2 Labeling

In [82]:
df_master_tokens = pd.read_csv('data/MasterTokens.csv', dtype='str')

In [83]:
df_jato_HO2_tokens = pd.merge(df_master_tokens,df_HO1_To_HO2['RecID'], on='RecID')
len(df_jato_HO2_tokens)

121

In [84]:
df_jato_HO2_tokens.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,155,1057291398880391170,Could fireworks be restricted at Scottish homes?,This is the effect fireworks can have on a dog...,45.0,banfireworks fireworks,neilmackay gamesshed didriksoderlind bbcradios...,juli timmi uk last night daili bob marley ub j...,effect firework dog juli hors goat lot firewor...,stand comfort reli built hear held purchas des...,outsid long care fairli ahead exactli seemingl...,gener licens wide gener last daili big loud so...,This is the effect fireworks can have on a dog...,This is the effect fireworks can have on a dog...
1,374,1165822705037217792,Cars Are Death Machines. Self-Driving Tech Won...,"Please RT if you, or someone you know, has bee...",17.0,,aarieff realdonaldtrump ttmitch,yard hoboken nj washington yanke stadium long ...,car experi mobil panel daughter car yard aspha...,hit hit thrown end broken land save pass place...,badli recent straight right nearli nearli slow...,littl upcom catastroph oncom danger high small...,"Please RT if you, or someone you know, has bee...","Please RT if you, or someone you know, has bee..."


In [85]:
df_jato_HO2_tokens.drop(['smrAdverbs','smrAdjectives'], axis=1, inplace=True)

In [88]:
df_jato_HO2_tokens['PubTitle'] = df_jato_HO2_tokens['PubTitle'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['RecDoc'] = df_jato_HO2_tokens['RecDoc'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['smrTopText'] = df_jato_HO2_tokens['smrTopText'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['smrSummary'] = df_jato_HO2_tokens['smrSummary'].apply(removeSpChar4Jato)

In [89]:
df_jato_HO2_tokens['PubTitle'] = df_jato_HO2_tokens['PubTitle'].apply(cleanupText4Jato)
df_jato_HO2_tokens['RecDoc'] = df_jato_HO2_tokens['RecDoc'].apply(cleanupText4Jato)
df_jato_HO2_tokens['smrTopText'] = df_jato_HO2_tokens['smrTopText'].apply(cleanupText4Jato)
df_jato_HO2_tokens['smrSummary'] = df_jato_HO2_tokens['smrSummary'].apply(cleanupText4Jato)

In [103]:
# save the file to D:\KOPro\PhD\Implementation\SourceCode\JatoPass2\data -- 
df_jato_HO2_tokens.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO2/data/JatoMaster.csv', index=False)

#### Generate JatoClassified for HO2

In [91]:
df_JatoClf_Blank = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass2_HO2/data/JatoClassified_BlankCat.csv')

In [92]:
df_JatoClf_Blank.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown


In [93]:
len(df_JatoClf_Blank)

2385

In [94]:
df_JatoCl_Pass0_in_4 = pd.read_csv('data/GTxM_Pass0/JatoClassified_HO2_Pass0_in_Pass4_task.csv')

In [95]:
df_JatoCl_Pass0_in_4.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,557,1179660469759438849,2021:02:05 11:57:33,Political,Debate,,,,,,,Politics
1,669,1181047749678161920,2021:02:05 11:57:33,Violent Material,War Crimes,Political,Debate,,,,,Law and Order


In [96]:
len(df_JatoCl_Pass0_in_4)

60

In [97]:
df_JatoCl_Pass4_dedup = pd.concat([df_JatoClf_Blank,df_JatoCl_Pass0_in_4], axis=0)

In [98]:
len(df_JatoCl_Pass4_dedup)

2445

In [99]:
df_JatoCl_Pass4_dedup.drop_duplicates(subset='RecID', keep=False, inplace=True)

In [100]:
len(df_JatoCl_Pass4_dedup)

2325

In [101]:
# re-concatenate the datasets to add the new subset only
df_JatoCl_Pass4 = pd.concat([df_JatoCl_Pass4_dedup,df_JatoCl_Pass0_in_4], axis=0)

In [102]:
df_JatoCl_Pass4

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown
2,63,833502973204459520,2023:03:16 20:23:22,Lifestyle,Music,,,,,,,Unknown
3,64,835347243020451840,2021:01:18 15:17:18,Abusive Material,Hate Speech,,,,,,,Unknown
4,69,867832469181128704,2021:02:03 13:42:38,Lifestyle,Music,,,,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
55,4143,1218564148172394496,2021:02:12 13:06:39,News,Weather,,,,,weather,,Social Stories
56,4165,1218849631016230913,2021:02:12 22:14:28,Violent Material,Violence,Conversational,,,,,,Social Stories
57,4214,1219299748248682496,2021:02:13 10:45:33,Political,Debate,,,,,,,Unknown
58,4373,1220439910378889216,2021:02:13 10:45:33,Conversational,Negative,,,,,,,Social Stories


In [104]:
# save to pass 5
df_JatoCl_Pass4.to_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass5_HO2/data/JatoClassified.csv', index=False)

### Intercoder Reliability btw HO2, HO1 and GTxM Classifier

#### Get HO2 labels After running Jato

In [3]:
# use the RecIDs 
df_HO1_To_HO2 = pd.read_csv('data/GTxM_Pass5/df_HO1_To_HO2.csv', dtype='str')

In [4]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1222603086558126080,Politics,6,6,Health,4,-1
1,1221936812786167810,Environmental,3,3,Unknown,-1,-1


In [5]:
len(df_HO1_To_HO2)

121

In [6]:
df_jato_HO2_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO2/data/JatoClassified_HO2_Pass5.csv', usecols=['RecID','NewsPubCat'], dtype='str')
df_jato_HO2_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Unknown
1,826262311560216578,Unknown


In [7]:
Labels_Targets = pd.read_csv('data/GTxM_Pass5/Labels_Targets_CGTPass5.csv', dtype='str')

In [8]:
Labels_Targets

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment
2,3,Environmental
3,4,Health
4,5,Human Rights
5,6,Politics
6,7,Law and Order
7,0,Unknown
8,9,Obituary
9,10,Social Stories


In [9]:
df_jato_HO2_Labels.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [10]:
df_HO2_Labels_Targets = pd.merge(df_jato_HO2_Labels,Labels_Targets, on='Label', how='left')

In [11]:
df_HO2_Labels_Targets

Unnamed: 0,RecID,Label,Target
0,222818213392678912,Unknown,0
1,826262311560216578,Unknown,0
2,833502973204459520,Unknown,0
3,835347243020451840,Unknown,0
4,867832469181128704,Unknown,0
...,...,...,...
2380,1218564148172394496,Social Stories,10
2381,1218849631016230913,Social Stories,10
2382,1219299748248682496,Unknown,0
2383,1220439910378889216,Social Stories,10


In [12]:
df_HO2_Labels_Targets.rename(columns={'Label': 'HO2Label', 'Target': 'HO2Target'}, inplace=True)

In [13]:
df_HO2HO1_Labels = pd.merge(df_HO2_Labels_Targets,df_HO1_To_HO2, on='RecID')
len(df_HO2HO1_Labels)

121

In [14]:
df_HO2HO1_Labels.head()

Unnamed: 0,RecID,HO2Label,HO2Target,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1057291398880391170,Environmental,3,Social Stories,10,3,Unclassified Medley,-1,-1
1,1165822705037217792,Health,4,Social Stories,10,10,Unclassified Medley,-1,-1
2,1174449406172491776,Entertainment,2,Social Stories,10,6,Entertainment,2,-1
3,1179556773750927360,Unclassified Medley,16,World Politics,12,-1,Human Rights,5,-1
4,1180064036509175808,Politics,6,Social Stories,10,6,Human Rights,5,-1


In [15]:
df_HO2HO1_Labels.drop(['HO1AgreedTarget'], axis=1, inplace=True)

#### Compute Krippendorff's Alpha for HO2, HO3, GClf

In [16]:
def AgreeTargetWithHO2(A1Target, A2Target, HO1Target, HO2Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (HO1Target == HO2Target):
        AgreedTarget = HO2Target
    if (AgreedTarget == '-1') and (A2Target == HO2Target):
        AgreedTarget = HO2Target
    if (AgreedTarget == '-1') and (A2Target == HO2Target):
        AgreedTarget = HO2Target
    return AgreedTarget

In [17]:
# set HO1Target's Unknown to -1 for Krippendroff's Alpha calculation
df_HO2HO1_Labels.loc[(df_HO2HO1_Labels.HO2Target == '0'), 'HO2Target'] = '-1'

In [18]:
df_HO2HO1_Labels['HO2AgreedTarget'] = df_HO2HO1_Labels.apply(lambda x: AgreeTargetWithHO2(x['GClfTarget'], x['SBERTTarget'], x['HO1Target'], x['HO2Target']), axis=1)

In [19]:
# convert to horizontal array as expected by Krippendorff Alpha
HO2Pred = np.stack(df_HO2HO1_Labels['HO2Target'].astype("string"))
HO1Pred = np.stack(df_HO2HO1_Labels['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO2HO1_Labels['GClfTarget'].astype("string"))
SBERTPred = np.stack(df_HO2HO1_Labels['SBERTTarget'].astype("string"))
HO2AgreedTarget = np.stack(df_HO2HO1_Labels['HO2AgreedTarget'].astype("string"))

In [20]:
missing = '-1'
alpha1 = krippendorff_alpha(np.array((HO2Pred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((HO1Pred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((GClfPred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha4 = krippendorff_alpha(np.array((SBERTPred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3, alpha4

(0.09199632014719417,
 0.3342030714179216,
 0.029473550194156206,
 -0.008579200669596165)

In [21]:
len(df_HO2HO1_Labels)

121

In [22]:
df_HO2HO1_Labels.to_csv('df_HO2HO1_Labels.csv')

In [24]:
alpha_HO1HO2 = krippendorff_alpha(np.array((HO2Pred,HO1Pred)), nominal_metric, missing_items=missing)
alpha_HO1HO2

-0.08141072740631872

In [23]:
alpha = krippendorff_alpha(np.array((HO2Pred,HO1Pred,SBERTPred)), nominal_metric, missing_items=missing)
alpha

-0.02920155010419334

#### Acquire new GTD from HO2 agreement

In [38]:
df_HO2HO1_Labels.to_csv('df_HO2HO1_Labels.csv')

In [37]:
df_HO2HO1_GTD = df_HO2HO1_Labels[df_HO2HO1_Labels.HO2AgreedTarget != '-1']
len(df_HO2HO1_GTD)

34

In [95]:
df_Pass4_HO2_GTD = df_HO2HO1_GTD[['RecID','HO2Label','HO2Target']]
df_Pass4_HO2_GTD.rename(columns={'HO2Target': 'Target', 'HO2Label': 'Label'}, inplace=True)
df_Pass4_HO2_GTD.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Pass4_HO2_GTD.rename(columns={'HO2Target': 'Target', 'HO2Label': 'Label'}, inplace=True)


Unnamed: 0,RecID,Label,Target
2,1180604548596916224,Law and Order,7
4,1181281583078223872,World Politics,12


In [96]:
df_Pass4_GTD = pd.concat([df_Pass4_HO1_GTD, df_Pass4_HO2_GTD], axis=0)
len(df_Pass4_GTD)

367

In [97]:
df_Pass4_GTD.groupby(['Target','Label']).size()

Target  Label         
10      Social Stories     27
11      Sports              9
12      World Politics     70
2       Entertainment      23
3       Environmental      38
4       Health             13
5       Human Rights       29
6       Politics          152
7       Law and Order       5
9       Obituary            1
dtype: int64

In [98]:
df_Pass4_GTD_WPolitics = df_Pass4_GTD[df_Pass4_GTD.Label == 'World Politics']
len(df_Pass4_GTD_WPolitics)

70

In [99]:
df_Pass4_GTD_WPolitics.to_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_WPolitics.csv', index=False)

In [101]:
df_Pass4_GTD_New = df_Pass4_GTD[df_Pass4_GTD.Label != 'World Politics']
len(df_Pass4_GTD_New)

297

In [102]:
df_Pass4_GTD_New.to_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_New.csv', index=False)

In [103]:
df_Pass3_GTD_UpTodate = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv', dtype='str')
len(df_Pass3_GTD_UpTodate)

1057

In [104]:
df_Pass4_GTD_UpTodate = pd.concat([df_Pass3_GTD_UpTodate, df_Pass4_GTD_New], axis=0)
len(df_Pass4_GTD_UpTodate)

1354

In [105]:
df_Pass4_GTD_UpTodate.groupby(['Target','Label']).size()

Target  Label         
1       Business           75
10      Social Stories     59
11      Sports             84
2       Entertainment     176
3       Environmental      55
4       Health             17
5       Human Rights      115
6       Politics          608
7       Law and Order      17
9       Obituary          148
dtype: int64

In [106]:
df_Pass4_GTD_UpTodate.to_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_UpTodate.csv', index=False)

In [107]:
df_Pass4_Reject = df_HO2HO1_Labels[df_HO2HO1_Labels.HO2AgreedTarget == '-1']
len(df_Pass4_Reject)

58

In [108]:
df_Pass4_Reject = df_Pass4_Reject[['RecID']]

In [119]:
# Add World Politics to Pass 4's Reject
df_Pass4_Reject_New = pd.concat([df_Pass4_Reject, (df_Pass4_GTD_WPolitics['RecID'])], axis=0)
len(df_Pass4_Reject_New)

128

In [115]:
df_Pass4_Reject_New.to_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_New.csv', index=False)

In [116]:
df_Pass3_Reject_UpTodate = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_Reject_UpTodate.csv', dtype='str')
len(df_Pass3_Reject_UpTodate)

223

In [117]:
df_Pass4_Reject_UpTodate = pd.concat([df_Pass3_Reject_UpTodate, df_Pass4_Reject_New], axis=0)
len(df_Pass4_Reject_UpTodate)

351

In [118]:
df_Pass4_Reject_UpTodate.to_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_UpTodate.csv', index=False)

## Intercoder without "Social Stories"

#### Load Predictions

In [104]:
dfPass5 = pd.read_csv("results/GTxM_Pass5/GTxM_Intercoder_Pred_Pass5.csv", dtype='str')
len(dfPass5)

220

In [105]:
dfPass5.head()

Unnamed: 0,RecID,Label,Target,XLNet_CGTPred,XLNet_CGT_NoSSPred,BERT_CGTPred,BERT_CGT_NoSSPred,T5_CGTPred,T5_CGT_NoSSPred
0,1222603086558126080,Politics,6,6,6,6,6,6,6
1,1222474209374408705,Politics,6,6,6,6,6,6,6
2,1221936812786167810,Environmental,3,10,3,3,3,3,5
3,1221377542587928576,Politics,6,6,6,6,6,6,6
4,1220823578825887745,Politics,6,-1,-1,-1,-1,-1,-1


#### Get the agreement between XLNet, BERT and T5

In [106]:
def AgreeTargetWithA3(A1Target, A2Target, A3Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (A1Target == A3Target):
        AgreedTarget = A3Target
    if (AgreedTarget == '-1') and (A2Target == A3Target):
        AgreedTarget = A3Target
    if (AgreedTarget == '-1') and (A1Target == A2Target):
        AgreedTarget = A2Target
    return AgreedTarget

##### Calculate krippendorff's alpha for the agreement between XLNet, BERT and T5

In [107]:
dfPass5['AgreedTarget'] = dfPass5.apply(lambda x: AgreeTargetWithA3(x['T5_CGT_NoSSPred'], x['BERT_CGT_NoSSPred'], x['XLNet_CGT_NoSSPred']), axis=1)

In [108]:
dfPass5_GClf = dfPass5[dfPass5.AgreedTarget != '-1']
len(dfPass5_GClf)

107

In [109]:
# convert to horizontal array as expected by Krippendorff Alpha
XLNetPred = np.stack(dfPass5_GClf['XLNet_CGTPred'].astype("string"))
BERTPred = np.stack(dfPass5_GClf['BERT_CGTPred'].astype("string"))
T5Pred = np.stack(dfPass5_GClf['T5_CGTPred'].astype("string"))
AgreedTarget = np.stack(dfPass5_GClf['AgreedTarget'].astype("string"))

In [110]:
missing = '-1'
#arr = np.array((TargetPred,AgreedTarget))
alpha1 = krippendorff_alpha(np.array((T5Pred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((BERTPred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((XLNetPred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3

(0.8705756754537242, 0.7847611202635913, 0.7390153520381154)

#### Get Krippendorff's Alpha between HO1 and GTxM Classifier and SBERT

In [111]:
def AgreeTargetWithHO1(A1Target, A2Target, HO1Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (A1Target == HO1Target):
        AgreedTarget = HO1Target
    if (AgreedTarget == '-1') and (A2Target == HO1Target):
        AgreedTarget = HO1Target
    return AgreedTarget

In [112]:
dfPass5.head()

Unnamed: 0,RecID,Label,Target,XLNet_CGTPred,XLNet_CGT_NoSSPred,BERT_CGTPred,BERT_CGT_NoSSPred,T5_CGTPred,T5_CGT_NoSSPred,AgreedTarget
0,1222603086558126080,Politics,6,6,6,6,6,6,6,6
1,1222474209374408705,Politics,6,6,6,6,6,6,6,6
2,1221936812786167810,Environmental,3,10,3,3,3,3,5,3
3,1221377542587928576,Politics,6,6,6,6,6,6,6,6
4,1220823578825887745,Politics,6,-1,-1,-1,-1,-1,-1,-1


In [113]:
dfPass5.drop(['XLNet_CGTPred','BERT_CGTPred','T5_CGTPred','XLNet_CGT_NoSSPred','BERT_CGT_NoSSPred','T5_CGT_NoSSPred'], axis=1, inplace=True)

In [114]:
dfPass5.rename(columns={'AgreedTarget': 'GClfTarget'}, inplace=True)

In [115]:
dfPass5.head(2)

Unnamed: 0,RecID,Label,Target,GClfTarget
0,1222603086558126080,Politics,6,6
1,1222474209374408705,Politics,6,6


In [116]:
df_jato_HO1_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO1/data/JatoClassified_HO1_Pass5.csv', usecols=['RecID','NewsPubCat'], dtype='str')

In [117]:
df_jato_HO1_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Politics
1,826262311560216578,Politics


In [118]:
Labels_Targets = pd.read_csv('data/GTxM_Pass5/Labels_Targets_CGTPass5.csv', dtype='str')

In [119]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [120]:
df_jato_HO1_Labels.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [121]:
df_HO1_Labels_Targets = pd.merge(df_jato_HO1_Labels,Labels_Targets, on='Label', how='left')

In [122]:
df_HO1_Labels_Targets

Unnamed: 0,RecID,Label,Target
0,222818213392678912,Politics,6
1,826262311560216578,Politics,6
2,833502973204459520,Entertainment,2
3,835347243020451840,Human Rights,5
4,867832469181128704,Entertainment,2
...,...,...,...
2380,1223262356496699394,Entertainment,2
2381,1223301592549556224,Politics,6
2382,1223302445889150976,Politics,6
2383,1223365339494453248,Politics,6


In [123]:
df_HO1_Labels_Targets.rename(columns={'Label': 'HO1Label', 'Target': 'HO1Target'}, inplace=True)

In [124]:
df_HO1_GClf = pd.merge(dfPass5, df_HO1_Labels_Targets, on='RecID', how='left')

In [125]:
df_HO1_GClf

Unnamed: 0,RecID,Label,Target,GClfTarget,HO1Label,HO1Target
0,1222603086558126080,Politics,6,6,Health,4
1,1222474209374408705,Politics,6,6,Politics,6
2,1221936812786167810,Environmental,3,3,Unknown,0
3,1221377542587928576,Politics,6,6,Politics,6
4,1220823578825887745,Politics,6,-1,Politics,6
...,...,...,...,...,...,...
215,1192717207895072769,Environmental,3,3,Environmental,3
216,1189678712490360833,Sports,11,-1,Sports,11
217,1185507039008698368,Human Rights,5,-1,Human Rights,5
218,1197994951771951105,Human Rights,5,5,Human Rights,5


In [126]:
df_HO1_GClf.rename(columns={'Target': 'SBERTTarget', 'Label': 'SBERTLabel'}, inplace=True)

In [127]:
df_HO1_GClf.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target
0,1222603086558126080,Politics,6,6,Health,4
1,1222474209374408705,Politics,6,6,Politics,6


In [128]:
df_HO1_GClf.to_csv('df_HO1_GClf.csv')

In [129]:
# set HO1Target's Unknown to -1 for Krippendroff's Alpha calculation
df_HO1_GClf.loc[(df_HO1_GClf.HO1Target == '0'), 'HO1Target'] = '-1'

In [130]:
df_HO1_GClf['HO1Target'].fillna(value='-1', inplace=True)

In [131]:
df_HO1_GClf['HO1AgreedTarget'] = df_HO1_GClf.apply(lambda x: AgreeTargetWithHO1(x['SBERTTarget'], x['GClfTarget'], x['HO1Target']), axis=1)

In [132]:
HO1Pred = np.stack(df_HO1_GClf['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO1_GClf['GClfTarget'].astype("string"))
SBERTPred = np.stack(df_HO1_GClf['SBERTTarget'].astype("string"))
HO1AgreedTarget = np.stack(df_HO1_GClf['HO1AgreedTarget'].astype("string"))

In [133]:
missing = '-1'
alpha1 = krippendorff_alpha(np.array((HO1Pred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((GClfPred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((SBERTPred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3

(0.3726912395178904, 0.42948237059264804, 0.2570517354777062)

In [134]:
df_HO1_GClf_GTD = df_HO1_GClf[df_HO1_GClf.HO1AgreedTarget != '-1']
len(df_HO1_GClf_GTD)

90

In [135]:
df_Pass5_HO1_GTD = df_HO1_GClf_GTD[['RecID','HO1Label','HO1Target']]

In [136]:
df_Pass5_HO1_GTD.rename(columns={'HO1Target': 'Target', 'HO1Label': 'Label'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Pass5_HO1_GTD.rename(columns={'HO1Target': 'Target', 'HO1Label': 'Label'}, inplace=True)


In [137]:
df_Pass5_HO1_GTD.head()

Unnamed: 0,RecID,Label,Target
1,1222474209374408705,Politics,6
3,1221377542587928576,Politics,6
4,1220823578825887745,Politics,6
6,1220562224185405445,Politics,6
12,1219624063506829312,Social Stories,10


In [138]:
len(df_Pass5_HO1_GTD)

90

In [78]:
#df_HO2HO1_GTD = df_HO2HO1_Labels[df_HO2HO1_Labels.HO2AgreedTarget != '-1']
#len(df_HO2HO1_GTD)

In [139]:
df_HO1_To_HO2 = df_HO1_GClf[df_HO1_GClf.HO1AgreedTarget == '-1']
len(df_HO1_To_HO2)

130

In [140]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1222603086558126080,Politics,6,6,Health,4,-1
2,1221936812786167810,Environmental,3,3,Unknown,-1,-1


In [141]:
# Use the RecIDs in this file to extract JatoMaster from MasterTokens in SQL-SERVER and save into data/GTxM_Pass2/
df_HO1_To_HO2.to_csv('data/GTxM_Pass5/df_HO1_To_HO2_130.csv', index=False)

### Intercoder Reliability btw HO2, HO1 and GTxM Classifier

#### Generate JatoMaster for HO2 Labeling

In [164]:
df_master_tokens = pd.read_csv('data/MasterTokens.csv', dtype='str')

In [165]:
df_jato_HO2_tokens = pd.merge(df_master_tokens,df_HO1_To_HO2['RecID'], on='RecID')
len(df_jato_HO2_tokens)

130

In [166]:
df_jato_HO2_tokens.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,155,1057291398880391170,Could fireworks be restricted at Scottish homes?,This is the effect fireworks can have on a dog...,45.0,banfireworks fireworks,neilmackay gamesshed didriksoderlind bbcradios...,juli timmi uk last night daili bob marley ub j...,effect firework dog juli hors goat lot firewor...,stand comfort reli built hear held purchas des...,outsid long care fairli ahead exactli seemingl...,gener licens wide gener last daili big loud so...,This is the effect fireworks can have on a dog...,This is the effect fireworks can have on a dog...
1,374,1165822705037217792,Cars Are Death Machines. Self-Driving Tech Won...,"Please RT if you, or someone you know, has bee...",17.0,,aarieff realdonaldtrump ttmitch,yard hoboken nj washington yanke stadium long ...,car experi mobil panel daughter car yard aspha...,hit hit thrown end broken land save pass place...,badli recent straight right nearli nearli slow...,littl upcom catastroph oncom danger high small...,"Please RT if you, or someone you know, has bee...","Please RT if you, or someone you know, has bee..."


In [167]:
df_jato_HO2_tokens.drop(['smrAdverbs','smrAdjectives'], axis=1, inplace=True)

In [170]:
df_jato_HO2_tokens['PubTitle'] = df_jato_HO2_tokens['PubTitle'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['RecDoc'] = df_jato_HO2_tokens['RecDoc'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['smrTopText'] = df_jato_HO2_tokens['smrTopText'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['smrSummary'] = df_jato_HO2_tokens['smrSummary'].apply(removeSpChar4Jato)

In [171]:
df_jato_HO2_tokens['PubTitle'] = df_jato_HO2_tokens['PubTitle'].apply(cleanupText4Jato)
df_jato_HO2_tokens['RecDoc'] = df_jato_HO2_tokens['RecDoc'].apply(cleanupText4Jato)
df_jato_HO2_tokens['smrTopText'] = df_jato_HO2_tokens['smrTopText'].apply(cleanupText4Jato)
df_jato_HO2_tokens['smrSummary'] = df_jato_HO2_tokens['smrSummary'].apply(cleanupText4Jato)

In [172]:
# save the file to D:\KOPro\PhD\Implementation\SourceCode\JatoPass2\data -- 
df_jato_HO2_tokens.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO2/data/JatoMaster.csv', index=False)

#### Get HO2 labels After running Jato

In [146]:
# use the RecIDs 
df_HO1_To_HO2 = pd.read_csv('data/GTxM_Pass5/df_HO1_To_HO2_130.csv', dtype='str')

In [147]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1222603086558126080,Politics,6,6,Health,4,-1
1,1221936812786167810,Environmental,3,3,Unknown,-1,-1


In [148]:
len(df_HO1_To_HO2)

130

In [149]:
df_jato_HO2_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass5_HO2/data/JatoClassified_HO2_Pass5.csv', usecols=['RecID','NewsPubCat'], dtype='str')
df_jato_HO2_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Unknown
1,826262311560216578,Unknown


In [150]:
Labels_Targets = pd.read_csv('data/GTxM_Pass5/Labels_Targets_CGTPass5.csv', dtype='str')

In [151]:
Labels_Targets

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment
2,3,Environmental
3,4,Health
4,5,Human Rights
5,6,Politics
6,7,Law and Order
7,0,Unknown
8,9,Obituary
9,10,Social Stories


In [152]:
df_jato_HO2_Labels.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [153]:
df_HO2_Labels_Targets = pd.merge(df_jato_HO2_Labels,Labels_Targets, on='Label', how='left')

In [154]:
df_HO2_Labels_Targets

Unnamed: 0,RecID,Label,Target
0,222818213392678912,Unknown,0
1,826262311560216578,Unknown,0
2,833502973204459520,Unknown,0
3,835347243020451840,Unknown,0
4,867832469181128704,Unknown,0
...,...,...,...
2380,1218564148172394496,Social Stories,10
2381,1218849631016230913,Social Stories,10
2382,1219299748248682496,Unknown,0
2383,1220439910378889216,Social Stories,10


In [155]:
df_HO2_Labels_Targets.rename(columns={'Label': 'HO2Label', 'Target': 'HO2Target'}, inplace=True)

In [156]:
df_HO2HO1_Labels = pd.merge(df_HO2_Labels_Targets,df_HO1_To_HO2, on='RecID')
len(df_HO2HO1_Labels)

130

In [157]:
df_HO2HO1_Labels.head()

Unnamed: 0,RecID,HO2Label,HO2Target,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1057291398880391170,Environmental,3,Social Stories,10,-1,Unclassified Medley,16,-1
1,1165822705037217792,Health,4,Social Stories,10,-1,Unclassified Medley,16,-1
2,1174449406172491776,Entertainment,2,Social Stories,10,-1,Entertainment,2,-1
3,1179556773750927360,Unclassified Medley,16,World Politics,12,-1,Human Rights,5,-1
4,1180064036509175808,Politics,6,Social Stories,10,-1,Human Rights,5,-1


In [158]:
df_HO2HO1_Labels.drop(['HO1AgreedTarget'], axis=1, inplace=True)

#### Compute Krippendorff's Alpha for HO2, HO3, GClf

In [159]:
def AgreeTargetWithHO2(A1Target, A2Target, HO1Target, HO2Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (HO1Target == HO2Target):
        AgreedTarget = HO2Target
    if (AgreedTarget == '-1') and (A2Target == HO2Target):
        AgreedTarget = HO2Target
    if (AgreedTarget == '-1') and (A2Target == HO2Target):
        AgreedTarget = HO2Target
    return AgreedTarget

In [160]:
# set HO1Target's Unknown to -1 for Krippendroff's Alpha calculation
df_HO2HO1_Labels.loc[(df_HO2HO1_Labels.HO2Target == '0'), 'HO2Target'] = '-1'

In [161]:
df_HO2HO1_Labels['HO2AgreedTarget'] = df_HO2HO1_Labels.apply(lambda x: AgreeTargetWithHO2(x['GClfTarget'], x['SBERTTarget'], x['HO1Target'], x['HO2Target']), axis=1)

In [162]:
# convert to horizontal array as expected by Krippendorff Alpha
HO2Pred = np.stack(df_HO2HO1_Labels['HO2Target'].astype("string"))
HO1Pred = np.stack(df_HO2HO1_Labels['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO2HO1_Labels['GClfTarget'].astype("string"))
SBERTPred = np.stack(df_HO2HO1_Labels['SBERTTarget'].astype("string"))
HO2AgreedTarget = np.stack(df_HO2HO1_Labels['HO2AgreedTarget'].astype("string"))

In [163]:
missing = '-1'
alpha1 = krippendorff_alpha(np.array((HO2Pred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((HO1Pred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((GClfPred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha4 = krippendorff_alpha(np.array((SBERTPred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3, alpha4

(0.2784027887814927,
 0.05334796926454444,
 -0.04009407548437682,
 -0.0002480158730158166)

#### Remove "Social Stories" from GTD (keep aside)

In [26]:
df_Pass4_GTD_UpTodate = pd.read_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_UpTodate.csv', dtype='str')

In [27]:
df_Pass5_GTD_UpTodate = df_Pass4_GTD_UpTodate[df_Pass4_GTD_UpTodate.Label != 'Social Stories']
len(df_Pass5_GTD_UpTodate)

1295

In [29]:
df_Pass5_GTD_UpTodate.to_csv('data/GTxM_Pass5/GTxM_Pass5_GTD_UpTodate.csv', index=False)

In [28]:
df_GTD_SocialStories = df_Pass4_GTD_UpTodate[df_Pass4_GTD_UpTodate.Label == 'Social Stories']
len(df_GTD_SocialStories)

59

In [30]:
df_GTD_SocialStories.to_csv('data/GTxM_Pass5/GTxM_Pass5_GTD_SocialStories.csv', index=False)

#### Reject all Pass 5 CGT data

In [38]:
# From CGT step 1: Pattern Detection
len(df_CGT_Rec_NoReject)

681

In [40]:
df_Pass5_Reject_New = df_CGT_Rec_NoReject['RecID']

In [41]:
df_Pass5_Reject_New.to_csv('data/GTxM_Pass5/GTxM_Pass5_Reject_New.csv', index=False)

In [42]:
df_Pass4_Reject_UpTodate = pd.read_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_UpTodate.csv', dtype='str')
len(df_Pass4_Reject_UpTodate)

350

In [43]:
df_Pass5_Reject_UpTodate = pd.concat([df_Pass4_Reject_UpTodate, df_Pass5_Reject_New], axis=0)
len(df_Pass5_Reject_UpTodate)

1031

In [44]:
df_Pass5_Reject_UpTodate.to_csv('data/GTxM_Pass5/GTxM_Pass5_Reject_UpTodate.csv', index=False)