##### Pass 4 objective: Perform CGT on the remaining unclassified SMRs, use the GTxM Classifier, HO1 and HO2 to code new GTD

In [1]:
import re
import numpy as np
import pandas as pd
from krippendorff_alpha import *

### Cleanup for Jato

In [3]:
# For use with Jato

def removeSpChar4Jato(text):
  #old text = re.sub(r'\W', ' ', text) #replace ALL non-word characters, including emojis with space
  # remove all non ASCII characters 
  text.replace('\n', ' ')
  text.replace('\r', ' ')
  # credit: https://stackoverflow.com/questions/2758921/regular-expression-that-finds-and-replaces-non-ascii-characters-with-python
  text = re.sub(r"[\u0080-\uFFFF]", " ", text) #see ASCII list: https://www.asciitable.com/
  text = " ".join(text.split()) # replace multiple spaces with a single space
  return(text)

def cleanupText4Jato(text):
  #replace this strange chars in the text with space
  text = text.replace("GCO"," ")
  text = text.replace("GY=n+"," ")
  text = text.replace("GCY"," ")
  text = text.replace("fcafc+"," ")
  text = text.replace("fAE+"," ")
  text = text.replace("#fAE"," ")
  text = text.replace("fnae"," ")
  text = text.replace("fye"," ")

  #Replace common abbreviations and slangs
  text = text.replace(" i m "," i am ")
  text = text.replace(" i ve "," i have ")
  text = text.replace(" i ll "," i will ")
  text = text.replace(" i d "," i had ")
  text = text.replace(" that s "," that is ")
  text = text.replace(" isn t "," is not ")
  text = text.replace(" it s "," it is ")
  text = text.replace(" she s "," she is ")
  text = text.replace(" he s "," he is ")
  text = text.replace(" u "," you ")
  text = text.replace(" ur "," your ")
  text = text.replace(" b4 "," before ")
  text = text.replace(" wasnt "," was not ")
  text = text.replace(" wasn t "," was not ")
  text = text.replace(" cant "," can not ")
  text = text.replace(" can t "," can not ")
  text = text.replace(" couldnt "," could not ")
  text = text.replace(" couldn t "," could not ")
  text = text.replace(" wouldnt "," would not ")
  text = text.replace(" wouldn t "," would not ")
  text = text.replace(" dont "," do not ")
  text = text.replace(" don t "," do not ")
  text = text.replace(" didnt "," did not ")
  text = text.replace(" didn t "," did not ")
  text = text.replace(" let s "," let us ")
  text = text.replace(" i'm "," i am ")
  text = text.replace(" i've "," i have ")
  text = text.replace(" i'll "," i will ")
  text = text.replace(" i'd "," i had ")
  text = text.replace(" that's "," that is ")
  text = text.replace(" isn't "," is not ")
  text = text.replace(" it's "," it is ")
  text = text.replace(" she's "," she is ")
  text = text.replace(" he's "," he is ")
  text = text.replace(" u "," you ")
  text = text.replace(" ur "," your ")
  text = text.replace(" b4 "," before ")
  text = text.replace(" wasn't "," was not ")
  text = text.replace(" can't "," can not ")
  text = text.replace(" couldn't "," could not ")
  text = text.replace(" wouldn't "," would not ")
  text = text.replace(" don't "," do not ")
  text = text.replace(" didn't "," did not ")
  text = text.replace(" let's "," let us ")
  text = text.replace(" luv "," love ")
  text = text.replace(" true "," truth ")
  text = text.replace(" ppl "," people ")
  text = text.replace(" fb "," facebook ")
  text = text.replace(" b day "," birthday ")
  text = text.replace(" bday "," birthday ")
  if (len(text.strip())  == 0):
      text = ' ' #replace None with a single space
  return(text)


### CGT Step 1: Pattern Detection

In [3]:
# Import libraries
from pprint import pprint
# Gensim for topic modeling functions
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# libraries to tokenize, clean up and calculate word counts
import nltk
from nltk.corpus import words
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')
wordlist = nltk.corpus.words.words()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
wordlist = [stemmer.stem(lemmatizer.lemmatize(word)) for word in wordlist]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kazeem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Import Dataset
df = pd.read_csv('data/MasterTokens_10to260SupTw_Not_GTr.csv')
df_GTD_Rec = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv')
df_Reject_Rec = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_Reject_UpTodate.csv')
len(df), len(df_GTD_Rec), len(df_Reject_Rec)

(1512, 1057, 223)

In [18]:
# get the CGT RecIDs by ensuring GTD and Rejected are not included
df_CGT_Rec = pd.merge(df['RecID'], df_GTD_Rec, on='RecID', how='left')
len(df_CGT_Rec)

1512

In [19]:
df_CGT_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1057291398880391170,,
1,1124056098925944832,,


In [20]:
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Target.isna()]
len(df_CGT_Rec)

1269

In [31]:
# remove Rejected
df_CGT_Rec_NoReject = pd.merge(df_CGT_Rec, df_Reject_Rec, on='RecID', how="outer", indicator=True
              ).query('_merge=="left_only"')

In [32]:
len(df_CGT_Rec_NoReject)

1105

In [33]:
df_CGT_Rec_NoReject.head(2)

Unnamed: 0,RecID,Label,Target,_merge
0,1057291398880391170,,,left_only
1,1124056098925944832,,,left_only


In [34]:
df_CGT = pd.merge(df, df_CGT_Rec_NoReject['RecID'], on='RecID', how='inner')

In [35]:
len(df_CGT)

1105

In [36]:
df_CGT.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,155,1057291398880391170,Could fireworks be restricted at Scottish homes?,This is the effect fireworks can have on a dog...,45.0,banfireworks fireworks,neilmackay gamesshed didriksoderlind bbcradios...,juli timmi uk last night daili bob marley ub j...,effect firework dog juli hors goat lot firewor...,stand comfort reli built hear held purchas des...,outsid long care fairli ahead exactli seemingl...,gener licens wide gener last daili big loud so...,This is the effect fireworks can have on a dog...,This is the effect fireworks can have on a dog...
1,258,1124056098925944832,Sonic movie: New trailer shows redesigned hedg...,Thank you for the support. And the criticism. ...,18.0,sonicmovie gottafixfast,fowltown,paramount sega sonic hollywood sonic jeff,support critic messag design chang paramount s...,happen care watch show listen handl learn wait...,fulli total definit actual,loud clear happi commit hard massiv awesom gla...,Thank you for the support. And the criticism. ...,Thank you for the support. And the criticism. ...


In [37]:
ExpName = "Pass4_CGTNounAdv"
df_CGT_corpus = df_CGT['smrNouns'].fillna(value='') + df_CGT['smrAdverbs'].fillna(value='')
data = df_CGT_corpus.str.split()
data_words = data.values.tolist()
print('Token list created successfully.')

Token list created successfully.


In [38]:
df_CGT_docs = df_CGT['RecID']
id2word = corpora.Dictionary(data_words)
print('Word dictionary created successfully.')

Word dictionary created successfully.


In [39]:
# Term Document Frequency
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]
print('Term-Doc-Frequency created successfully.')

Term-Doc-Frequency created successfully.


In [40]:
len(corpus)

1105

In [41]:
# Build LDA model
print('Building LDA model...')
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,  
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the Keyword in the 20 topics
print('LDA model created successfully.')
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

Building LDA model...
LDA model created successfully.
[(0,
  '0.051*"space" + 0.039*"book" + 0.031*"movi" + 0.020*"art" + 0.019*"film" + '
  '0.018*"ad" + 0.015*"star" + 0.015*"twitter" + 0.012*"site" + '
  '0.012*"transgend"'),
 (1,
  '0.054*"food" + 0.053*"anim" + 0.042*"coffe" + 0.030*"bank" + 0.021*"lamb" + '
  '0.019*"shop" + 0.018*"chicken" + 0.016*"employe" + 0.014*"compani" + '
  '0.013*"custom"'),
 (2,
  '0.031*"parti" + 0.017*"tori" + 0.015*"power" + 0.013*"leader" + '
  '0.012*"lord" + 0.012*"peopl" + 0.012*"tweet" + 0.010*"seat" + '
  '0.010*"hypocrit" + 0.010*"principl"'),
 (3,
  '0.006*"cctv" + 0.000*"plymouth" + 0.000*"graffiti" + 0.000*"homophob" + '
  '0.000*"lotti" + 0.000*"pedo" + 0.000*"cemeteri" + 0.000*"chloe" + '
  '0.000*"slur" + 0.000*"homophobia"'),
 (4,
  '0.034*"peopl" + 0.020*"agps" + 0.009*"man" + 0.009*"dongcot" + '
  '0.008*"woman" + 0.008*"person" + 0.008*"life" + 0.006*"itgco" + '
  '0.006*"igcom" + 0.006*"famili"'),
 (5,
  '0.026*"anc" + 0.010*"anaan"

In [42]:
print('Generating performance scores for '+ExpName)
# Compute Perplexity
perplex_lda = lda_model.log_perplexity(corpus)
print('\nPerplexity: {:.2f}'.format(perplex_lda))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: {:.2f}'.format(coherence_lda))

Generating performance scores for Pass4_CGTNounAdv

Perplexity: -11.22

Coherence Score: 0.45


In [45]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='tsne', sort_topics=False)
pyLDAvis.save_html(vis, 'lda_tsne_'+ExpName+'.html')

  default_term_info = default_term_info.sort_values(


In [46]:
# generate doc-topics lists
doc_topic = lda_model.get_document_topics(corpus, minimum_probability=0.1)

In [48]:
i = 0
docs=[]
topics=[]
scores=[]
for doc in doc_topic:
    doc_id = df_CGT_docs.iloc[i]
    i = i+1
    #if i>5: break
    for topic_id, score in doc:
        #print(doc_id, topic_id, score)
        scores.append(score)
        topics.append(topic_id)
        docs.append(doc_id)

In [49]:
df_doc_topic = pd.DataFrame({'RecID': docs, 'TopicID': topics, 'TopicScore': scores})

In [50]:
df_doc_topic

Unnamed: 0,RecID,TopicID,TopicScore
0,1057291398880391170,4,0.443390
1,1057291398880391170,10,0.386109
2,1124056098925944832,0,0.489180
3,1124056098925944832,4,0.423269
4,1135851552495865857,4,0.516032
...,...,...,...
2834,1223039083112517632,9,0.376500
2835,1223063868110524417,9,0.109598
2836,1223063868110524417,15,0.482510
2837,1223063868110524417,18,0.342467


In [51]:
df_doc_topic.groupby(by='TopicID').count()

Unnamed: 0_level_0,RecID,TopicScore
TopicID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,112,112
1,44,44
2,286,286
4,1089,1089
5,5,5
6,2,2
7,4,4
8,3,3
9,418,418
10,216,216


In [52]:
df_doc_topic.to_csv('data/GTxM_Pass4/CGT/lda_doc_topic_all_'+ExpName+'.csv')

In [53]:
df_doc_topic_nodup = df_doc_topic.sort_values(['TopicScore'], ascending=(False)).drop_duplicates(['RecID'])
len(df_doc_topic_nodup)

1105

In [63]:
df_doc_topic_nodup.to_csv('data/GTxM_Pass4/CGT/lda_doc_dominant_topic_'+ExpName+'.csv', index=False)

In [55]:
topics_ndarray = df_doc_topic_nodup.TopicID.unique()
topic_list =sorted(topics_ndarray)
listTopicID = list(set(df_doc_topic_nodup['TopicID'].tolist())) # get the unique list of Topic IDs
listTopicID

[0, 1, 2, 4, 6, 7, 9, 10, 11, 13, 15, 16, 18]

In [61]:
# Select top 20%, upto 20 items max of each Topic
df_doc_topic_top20pcent = pd.DataFrame(columns=['RecID', 'TopicID', 'TopicScore'])
i=0
for i in range(len(listTopicID)):
    df_temp = df_doc_topic_nodup[(df_doc_topic_nodup.TopicID == listTopicID[i])]
    topic_items = len(df_temp)
    #if len(df_temp) > 9:
    topic20pc_items = round(len(df_temp)/5)
    if topic20pc_items > 20: #max 20 items
        topic20pc_items = 20
    df_temp = df_temp.head(topic20pc_items)
    df_doc_topic_top20pcent = pd.concat([df_doc_topic_top20pcent, df_temp])
    print(i, listTopicID[i], topic_items, topic20pc_items)

0 0 8 2
1 1 1 0
2 2 68 14
3 4 684 20
4 6 1 0
5 7 1 0
6 9 161 20
7 10 30 6
8 11 80 16
9 13 1 0
10 15 4 1
11 16 1 0
12 18 65 13


In [57]:
df_doc_topic_top20pcent

Unnamed: 0,RecID,TopicID,TopicScore
339,1183587236039684096,0,0.517135
2,1124056098925944832,0,0.489180
2838,222818213392678912,2,0.880813
2791,1222183802459418630,2,0.750784
1757,1206500578962083841,2,0.718966
...,...,...,...
1244,1196973332689833985,18,0.628727
698,1189332650185637888,18,0.622367
2080,1214139981649862657,18,0.614945
1859,1207820053414105089,18,0.607127


In [64]:
df_doc_topic_top20pcent.to_csv('data/GTxM_Pass4/CGT/lda_doc_topic_top20pct_'+ExpName+'.csv', index=False)

In [65]:
df_doc_topic_top20pcent.groupby(by='TopicID').count()

Unnamed: 0_level_0,RecID,TopicScore
TopicID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,2
2,14,14
4,20,20
9,20,20
10,6,6
11,16,16
15,1,1
18,13,13


### CGT Step 2: Pattern Refinement

In [11]:
# Load Pass 4 JatoClassified after deep reading
df_lda_dread = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoDR_Pass4/genTopics/lda_doc_topic_top20pct_Pass4_CGTNounAdv.csv', dtype='str')
df_jato_dread = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoDR_Pass4/data/JatoClassified_Pass4DR.csv', dtype='str')
len(df_lda_dread), len(df_jato_dread)

(92, 2385)

In [12]:
df_dread = pd.merge(df_lda_dread,df_jato_dread, on='RecID')

In [13]:
df_dread.head(2)

Unnamed: 0,RecID,TopicID,TopicScore,rowid,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,1183587236039684096,0,0.51713496,977,2023:03:29 04:42:45,Business,Economy,,,,,,,Sports
1,1124056098925944832,0,0.4891796,260,2023:03:29 04:42:45,Lifestyle,TV/Movie/Theater,,,,,,,Entertainment


In [14]:
df_dread.drop(['TopicID','TopicScore','rowid','SavedDataTime','JatoCat','JatoSubCat','JatoCat2','JatoSubCat2','JatoCat3','JatoSubCat3','NewRecClass','GTCodes'], axis=1, inplace=True)
len(df_dread)

92

In [15]:
df_dread.groupby(['NewsPubCat']).size()

NewsPubCat
Entertainment      8
Environmental      7
Health             4
Human Rights       5
Law and Order      2
Obituary           1
Social Stories     7
Sports             5
Travel             2
UK Politics       13
USA Politics      15
Unknown            7
World Politics    16
dtype: int64

In [16]:
df_dread.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [17]:
df_dread.head(2)

Unnamed: 0,RecID,Label
0,1183587236039684096,Sports
1,1124056098925944832,Entertainment


In [28]:
Labels_Targets = pd.read_csv('data/GTxM_Pass4/Labels_Targets_CGTPass4.csv')

In [29]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [30]:
df_dread_Target = pd.merge(df_dread,Labels_Targets, on='Label', how='left')

In [31]:
df_dread_Target.head(2)

Unnamed: 0,RecID,Label,Target
0,1183587236039684096,Sports,11
1,1124056098925944832,Entertainment,2


In [32]:
len(df_dread_Target)

92

In [33]:
# remove the 7 unknown SMRs, we don't want to generate unknown labels with SBERT.
df_dread_Target = df_dread_Target[df_dread_Target.Label!='Unknown']
len(df_dread_Target)

85

In [34]:
# NOT NECESSARY -- work it out in the SBERT PROGRAM.
# df_recTweets = pd.read_csv('data/MasterTokens.csv', usecols=['RecID','RecDoc'], dtype='str')

In [35]:
# save to GTxM Pass 4 folder and upload to Google Colab
df_dread_Target.to_csv('data/GTxM_Pass4/DeepRead_Pass4.csv', index=False)

### CGT Step 3: See Kaggle Notebook for Pattern Confirmation with SBERT

### CCL Supervised ML (Prep data)

#### Load the CGT confirmed Record Tweets with count of SupTweets

In [70]:
# This CSV file is generated from SQL-Server (see: CGT_Confirm_Pass4_analysis.sql)
df = pd.read_csv("data/GTxM_Pass4/CGT_Confirm_0.5_Pass4.csv", dtype='str')
df.head()

Unnamed: 0,InReplyTo,Label,Target,QueryCount
0,1057291398880391170,Social Stories,10,3
1,1057291398880391170,Obituary,9,1
2,1057291398880391170,Health,4,1
3,1057291398880391170,Human Rights,5,1
4,1135851552495865857,Social Stories,10,16


In [71]:
len(df)

2618

In [72]:
#Generate final SMR list with the accepted Label as the one with maximum count.
# df_confirm_SMR = df.sort_values(['InReplyTo','QueryCount'], ascending=([False, False])).drop_duplicates(['InReplyTo'])
df_confirm_SMR = df.drop_duplicates(['InReplyTo'])
len(df_confirm_SMR)

829

In [73]:
df_confirm_SMR['QueryCount'] = df_confirm_SMR['QueryCount'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_confirm_SMR['QueryCount'] = df_confirm_SMR['QueryCount'].astype(int)


In [74]:
df_confirm_SMR = df_confirm_SMR[df_confirm_SMR.QueryCount>9]

In [75]:
len(df_confirm_SMR)

340

In [8]:
df_confirm_SMR.groupby(['Label']).size()

Label
Entertainment     21
Environmental     41
Health             6
Human Rights      19
Law and Order      6
Social Stories    28
Sports            11
UK Politics       54
USA Politics      72
World Politics    82
dtype: int64

#### Accept the InReplyTo as the SBERT predicted Record Tweet for use downstream in CCL

In [9]:
df_confirm_SMR.drop(['QueryCount'], axis=1, inplace=True)


In [10]:
df_confirm_SMR.rename(columns={'InReplyTo': 'TID'}, inplace=True)

#### Add the Deep Read labeled SMRs to the matched SMRs to create the new CGT Labeled Dataset

In [78]:
df_deepread = pd.read_csv("data/GTxM_Pass4/CGT_DeepRead_Pass4.csv", dtype='str')

In [79]:
df_deepread.head()

Unnamed: 0,TID,Label,Target,RecDoc
0,1183587236039684096,Sports,11,Loads of people asking about #RWC2019 ticket s...
1,1124056098925944832,Entertainment,2,Thank you for the support. And the criticism. ...
2,222818213392678912,UK Politics,14,Seedy lists of party apparatchiks appointed by...
3,1222183802459418630,UK Politics,14,Don?t vote for me if you think Labour 'won the...
4,1206500578962083841,UK Politics,14,Rather than making rash pronouncements on #ind...


In [80]:
len(df_deepread)

85

In [82]:
# Update: 17-Jul-2023
# Unfortunately, a duplicate was discovered in the deepread dataset reducing the dataset from 85 to 84.
# Luckily, the duplicate SMR is part of the Pass 4 rejected SMRs, so there is no consequence to the experimental results.
# The rest of this section is re-run and the duplicate is also removed from the reject at end of the pass, 
# but the experiments were retained as is.
# df_deepread.to_csv('df_deepread.csv')
df_deepread = df_deepread.drop_duplicates(['TID'])
len(df_deepread)

84

In [83]:
df_deepread.drop(['RecDoc'], axis=1, inplace=True)
df_deepread.head(2)

Unnamed: 0,TID,Label,Target
0,1183587236039684096,Sports,11
1,1124056098925944832,Entertainment,2


In [84]:
df_deepread.groupby(['Label']).size()

Label
Entertainment      7
Environmental      7
Health             4
Human Rights       5
Law and Order      2
Obituary           1
Social Stories     7
Sports             5
Travel             2
UK Politics       13
USA Politics      15
World Politics    16
dtype: int64

In [90]:
# NOTE: the duplicate was in Social Stories

In [85]:
df_confirm_FinalSMR = pd.concat([df_confirm_SMR, df_deepread], axis=0)

In [86]:
len(df_confirm_FinalSMR)

424

In [87]:
df_confirm_FinalSMR.head()

Unnamed: 0,InReplyTo,Label,Target,QueryCount,TID
4,1135851552495865857,Social Stories,10,16.0,
11,1151389038781390848,Human Rights,5,19.0,
20,1177679699369050112,Entertainment,2,11.0,
28,1178962636278382592,Environmental,3,20.0,
35,1179050498428682240,UK Politics,14,14.0,


In [88]:
df_confirm_FinalSMR.rename(columns={'TID': 'RecID'}, inplace=True)

In [89]:
df_confirm_FinalSMR.groupby(['Label','Target']).size()

Label           Target
Entertainment   2         28
Environmental   3         48
Health          4         10
Human Rights    5         24
Law and Order   7          8
Obituary        9          1
Social Stories  10        35
Sports          11        16
Travel          13         2
UK Politics     14        67
USA Politics    15        87
World Politics  12        98
dtype: int64

#### Update Politics

In [21]:
# Remove World Politics
# df_confirm_FinalSMR = df_confirm_FinalSMR[df_confirm_FinalSMR.Label!='World Politics']
# len(df_confirm_FinalSMR)

In [22]:
# Change USA and UK Politics to 'Politics'
df_confirm_FinalSMR.loc[(df_confirm_FinalSMR.Label == 'USA Politics'), 'Label'] = 'Politics'
df_confirm_FinalSMR.loc[(df_confirm_FinalSMR.Label == 'UK Politics'), 'Label'] = 'Politics'

In [23]:
df_confirm_FinalSMR.loc[(df_confirm_FinalSMR.Target == '14'), 'Target'] = '6'
df_confirm_FinalSMR.loc[(df_confirm_FinalSMR.Target == '15'), 'Target'] = '6'

In [24]:
df_confirm_FinalSMR.groupby(['Target', 'Label']).size()

Target  Label         
10      Social Stories     36
11      Sports             16
12      World Politics     98
13      Travel              2
2       Entertainment      28
3       Environmental      48
4       Health             10
5       Human Rights       24
6       Politics          154
7       Law and Order       8
9       Obituary            1
dtype: int64

In [25]:
df_confirm_FinalSMR.head(2)

Unnamed: 0,RecID,Label,Target
4,1135851552495865857,Social Stories,10
11,1151389038781390848,Human Rights,5


#### Save to data and result

In [108]:
# save data
df_confirm_FinalSMR.to_csv('data/GTxM_Pass4/GTxM_CGT_Labeled_Pass4.csv', index=False)

In [109]:
# save base Intercoder Prediction
df_confirm_FinalSMR.to_csv('results/GTxM_Pass4/GTxM_Intercoder_Pred_Pass4.csv', index=False)

### Prepare JatoMaster for HO1

In [25]:
data = pd.read_csv('data/MasterTokens.csv', encoding='ISO-8859-1', dtype='str')
len(data)

4684

In [26]:
df_jatoMaster_Pass4_HO1 = pd.merge(data, df_confirm_FinalSMR, on='RecID', how='inner')

In [27]:
df_jatoMaster_Pass4_HO1['PubTitle'] = df_jatoMaster_Pass4_HO1['PubTitle'].apply(removeSpCharLine)
df_jatoMaster_Pass4_HO1['RecDoc'] = df_jatoMaster_Pass4_HO1['RecDoc'].apply(removeSpCharLine)
df_jatoMaster_Pass4_HO1['smrTopText'] = df_jatoMaster_Pass4_HO1['smrTopText'].apply(removeSpCharLine)
df_jatoMaster_Pass4_HO1['smrSummary'] = df_jatoMaster_Pass4_HO1['smrSummary'].apply(removeSpCharLine)

In [30]:
len(df_jatoMaster_Pass4_HO1)

425

In [29]:
df_jatoMaster_Pass4_HO1.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass4_HO1/data/JatoMaster.csv')

#### Generate JatoClassified

In [50]:
df_jato_HO1_pass0 = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/Jato/data/JatoOp1Labels.csv', usecols=['RecID','Label'], dtype='str')
len(df_jato_HO1_pass0)

2042

In [51]:
df_jato_HO1_pass0.head(2)

Unnamed: 0,RecID,Label
0,222818213392678912,Politics
1,826262311560216578,Politics


In [52]:
df_jato_Clf_blank = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass4_HO1/data/JatoClassified_BlankCat.csv', dtype='str')
len(df_jato_Clf_blank)

2385

In [53]:
df_jato_Clf_blank.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown


In [54]:
df_jato_HO1_pass4 = pd.merge(df_jato_Clf_blank, df_jato_HO1_pass0, on='RecID', how='left')
df_jato_HO1_pass4.drop(['NewsPubCat'], axis=1, inplace=True)
df_jato_HO1_pass4.rename(columns={'Label': 'NewsPubCat'}, inplace=True)
df_jato_HO1_pass4.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Politics
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Politics


In [55]:
df_jato_HO1_pass4.groupby(['NewsPubCat']).size()

NewsPubCat
Business          87
Entertainment    228
Environmental     78
Human Rights     338
Obituary         138
Politics         872
Sports           143
Stories           84
Technology        74
dtype: int64

In [56]:
# Change Stories, Technology and Blank
df_jato_HO1_pass4.loc[(df_jato_HO1_pass4.NewsPubCat == 'Stories'), 'NewsPubCat'] = 'Social Stories'
df_jato_HO1_pass4.loc[(df_jato_HO1_pass4.NewsPubCat == 'Technology'), 'NewsPubCat'] = 'Business'
df_jato_HO1_pass4.loc[(df_jato_HO1_pass4.NewsPubCat.isna()), 'NewsPubCat'] = 'Unknown'

In [57]:
df_jato_HO1_pass4.groupby(['NewsPubCat']).size()

NewsPubCat
Business          161
Entertainment     228
Environmental      78
Human Rights      338
Obituary          138
Politics          872
Social Stories     84
Sports            143
Unknown           343
dtype: int64

In [59]:
df_jato_HO1_pass4.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass4_HO1/data/JatoClassified.csv', index=False)

### SVM Clf run

#### Setup environment

In [26]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import csv
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=sklearn.exceptions.UndefinedMetricWarning)

In [27]:
data = pd.read_csv('data/MasterTokens.csv', encoding='ISO-8859-1', dtype='str')
len(data)

4684

In [28]:
data.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,0,11947603240,"Ed Sheeran, Drake and Justin Bieber: What were...",I think I have part created a pretty amazing s...,0.0,,,,part song lie,creat,pretti,amaz,I think I have part created a pretty amazing s...,Summarization skipped (text is 1000 characters...
1,1,12643331537,"Ed Sheeran, Drake and Justin Bieber: What were...",can i have one more follower please... i would...,1.0,,jessglynne,,follow igcom club gpsi,love,,top,can i have one more follower please... i would...,Summarization skipped (text is 1000 characters...


#### Prep Training data

In [29]:
df_GTD_Rec = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv', dtype='str')
len(df_GTD_Rec)

1057

In [30]:
df_GTD_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1207761446513319936,Politics,6
1,1180079141087055872,Politics,6


In [31]:
# train on pass 3 GTD
df_train = pd.merge(data, df_GTD_Rec, on="RecID")
len(df_train)

1057

In [32]:
df_train.groupby(['Target','Label']).size()

Target  Label         
1       Business           75
10      Social Stories     32
11      Sports             75
2       Entertainment     153
3       Environmental      17
4       Health              4
5       Human Rights       86
6       Politics          456
7       Law and Order      12
9       Obituary          147
dtype: int64

In [33]:
# remove lower than 50 count SMRs
df_train = df_train[df_train.Label != 'Environmental']
df_train = df_train[df_train.Label != 'Health']
df_train = df_train[df_train.Label != 'Law and Order']
df_train = df_train[df_train.Label != 'Social Stories']

In [36]:
# remove Business and Obituary from training since test has 0 and only 1 SMRs
# insufficient for testing
df_train = df_train[df_train.Label != 'Business']
df_train = df_train[df_train.Label != 'Obituary']

In [37]:
len(df_train)

770

In [38]:
df_train.Target = df_train['Target'].astype(int)

In [119]:
# # Code the Targets 0-5
# df_train.loc[(df_train.Target == 1), 'Target'] = 0 # Business
# df_train.loc[(df_train.Target == 2), 'Target'] = 1 # Entertainment
# df_train.loc[(df_train.Target == 5), 'Target'] = 2 # Human Rights
# df_train.loc[(df_train.Target == 6), 'Target'] = 3 # Politics
# df_train.loc[(df_train.Target == 9), 'Target'] = 4 # Obituary
# df_train.loc[(df_train.Target == 11), 'Target'] = 5 # Sports

In [39]:
# Code the Targets 0-3
df_train.loc[(df_train.Target == 2), 'Target'] = 0 # Entertainment
df_train.loc[(df_train.Target == 5), 'Target'] = 1 # Human Rights
df_train.loc[(df_train.Target == 6), 'Target'] = 2 # Politics
df_train.loc[(df_train.Target == 11), 'Target'] = 3 # Sports

In [40]:
df_train.groupby(['Target','Label']).size()

Target  Label        
0       Entertainment    153
1       Human Rights      86
2       Politics         456
3       Sports            75
dtype: int64

#### Reduce Training Politics by 1/2 (NOT USED)

In [91]:
df_train_politics = df_train[df_train.Label == 'Politics']
df_train_nopolitics = df_train[df_train.Label != 'Politics']

In [92]:
len(df_train_politics)

456

In [93]:
df_train_politics = df_train_politics.sample(frac=0.5, random_state=1, axis=0)

In [94]:
len(df_train_politics)

228

In [95]:
df_train = pd.concat([df_train_politics, df_train_nopolitics], axis=0)
df_train = df_train.sample(frac=1) # shuffle the dataframe

In [96]:
len(df_train)

764

In [99]:
df_train.groupby(['Target','Label']).size()

Target  Label        
0       Business          75
1       Entertainment    153
2       Human Rights      86
3       Politics         228
4       Obituary         147
5       Sports            75
dtype: int64

#### Prep Test data

In [41]:
df_CGT_Rec = pd.read_csv('data/GTxM_Pass4/GTxM_CGT_Labeled_Pass4.csv', dtype='str')
df_CGT_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1135851552495865857,Social Stories,10
1,1151389038781390848,Human Rights,5


In [42]:
len(df_CGT_Rec)

425

In [43]:
# Remove 'World Politics' as per research decision in pass 3
# unify UK and USA politics
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'World Politics']
df_CGT_Rec.loc[(df_CGT_Rec.Label == 'USA Politics'), 'Label'] = 'Politics'
df_CGT_Rec.loc[(df_CGT_Rec.Label == 'UK Politics'), 'Label'] = 'Politics'
df_CGT_Rec.loc[(df_CGT_Rec.Target == '14'), 'Target'] = '6'
df_CGT_Rec.loc[(df_CGT_Rec.Target == '15'), 'Target'] = '6'
len(df_CGT_Rec)

327

In [51]:
# remove lower than 50 traing count SMRs
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Environmental']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Health']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Law and Order']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Social Stories']
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Travel']

In [52]:
# Remove Obitaury - insufficient samples
df_CGT_Rec = df_CGT_Rec[df_CGT_Rec.Label != 'Obituary']

In [53]:
df_test = pd.merge(data, df_CGT_Rec, on="RecID")
len(df_test)

222

In [54]:
df_test.groupby(['Target','Label']).size()

Target  Label        
11      Sports            16
2       Entertainment     28
5       Human Rights      24
6       Politics         154
dtype: int64

#### Reduce Test Politics by 1/2 (NOT USED)

In [105]:
df_test_politics = df_test[df_test.Label == 'Politics']
df_test_nopolitics = df_test[df_test.Label != 'Politics']

In [106]:
len(df_test_politics)

154

In [107]:
df_test_politics = df_test_politics.sample(frac=0.5, random_state=1, axis=0)

In [108]:
len(df_test_politics)

77

In [109]:
df_test = pd.concat([df_test_politics, df_test_nopolitics], axis=0)
df_test = df_test.sample(frac=1) # shuffle the dataframe

In [110]:
len(df_test)

250

#### Run Predictions

In [55]:
df_test.Target = df_test['Target'].astype(int)

In [129]:
# # Code the Targets 0-5
# df_test.loc[(df_test.Target == 1), 'Target'] = 0 # Business
# df_test.loc[(df_test.Target == 2), 'Target'] = 1 # Entertainment
# df_test.loc[(df_test.Target == 5), 'Target'] = 2 # Human Rights
# df_test.loc[(df_test.Target == 6), 'Target'] = 3 # Politics
# df_test.loc[(df_test.Target == 9), 'Target'] = 4 # Obituary
# df_test.loc[(df_test.Target == 11), 'Target'] = 5 # Sports


In [56]:
# Code the Targets 0-3
df_test.loc[(df_test.Target == 2), 'Target'] = 0 # Entertainment
df_test.loc[(df_test.Target == 5), 'Target'] = 1 # Human Rights
df_test.loc[(df_test.Target == 6), 'Target'] = 2 # Politics
df_test.loc[(df_test.Target == 11), 'Target'] = 3 # Sports


In [57]:
df_test.groupby(['Target','Label']).size()

Target  Label        
0       Entertainment     28
1       Human Rights      24
2       Politics         154
3       Sports            16
dtype: int64

In [58]:
scoring = {'acc': 'accuracy',
           'prec': 'precision_weighted',
           'recall': 'recall_weighted',
           'f1': 'f1_weighted'}

#test_ratio = 0.20
corpus = df_train['smrNouns'] + df_train['smrAdverbs']
# corpus = df_train['smrNouns'] + df_train['smrNER'] +df_train['smrAdverbs'] + df_test['smrAdjectives']
# corpus = df_train['smrNER'] + df_test['smrAdjectives']
corpus = corpus.fillna(value='')
vec = 'TFIDF'
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_train['RecID']
y = df_train['Target']
y.index = df_train['RecID']
y=y.astype('int')
X = vec_dtm


In [59]:
clf = SVC(kernel='linear')
clf.fit(X, y)

SVC(kernel='linear')

In [60]:
# setup test data
corpus_test = df_test['smrNouns'] + df_test['smrAdverbs']
# corpus_test = df_test['smrNouns'] + df_test['smrNER'] + df_test['smrAdverbs'] + df_test['smrAdjectives']
# corpus_test = df_test['smrNER'] + df_test['smrAdjectives']
corpus_test = corpus_test.fillna(value='')
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=5000)
data_vec = vectorizer.fit_transform(corpus_test)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_test['RecID']
y_test = df_test['Target']
y_test.index = df_test['RecID']
y_test=y_test.astype('int')
X_test = vec_dtm

In [61]:
test_pred = clf.predict(X_test)

In [62]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [63]:
result

['GTxM Pass 4',
 'SVM',
 0,
 63.51351351351351,
 62.745098039215684,
 63.51351351351351,
 57.82505085730892]

#### Generate Confusion Matrix

In [65]:
svm_cm = confusion_matrix(y_test, test_pred)
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
#                         index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# Removed Business and Obituary since there is none in test data
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Entertainment','Human Rights','Politics','Sports'],
                        index=['Entertainment','Human Rights','Politics','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_Confusion.csv')

In [102]:
# train on pass 3 GTD
df_train = pd.merge(data, df_GTD_Rec, on="RecID")
len(df_train)

1057

In [103]:
# test (predict) on Pass2's CGT labeled data with semantic score >= 0.7
df_test = pd.merge(data, df_CGT_Rec, on="RecID")
len(df_test)

327

In [104]:
df_train.groupby(['Target','Label']).size()

Target  Label         
1       Business           75
10      Social Stories     32
11      Sports             75
2       Entertainment     153
3       Environmental      17
4       Health              4
5       Human Rights       86
6       Politics          456
7       Law and Order      12
9       Obituary          147
dtype: int64

In [105]:
# remove lower than 50 count SMRs
# + Politics
df_train = df_train[df_train.Label != 'Environmental']
df_train = df_train[df_train.Label != 'Health']
df_train = df_train[df_train.Label != 'Law and Order']
df_train = df_train[df_train.Label != 'Social Stories']
df_train = df_train[df_train.Label != 'Politics']
# df_train = df_train[df_train.Label != 'Business'] # removed because it is not in test data

In [106]:
len(df_train)

536

In [107]:
df_train.Target = df_train['Target'].astype(int)

In [108]:
# Code the Targets 0-5
df_train.loc[(df_train.Target == 1), 'Target'] = 0 # Business
df_train.loc[(df_train.Target == 2), 'Target'] = 1 # Entertainment
df_train.loc[(df_train.Target == 5), 'Target'] = 2 # Human Rights
df_train.loc[(df_train.Target == 9), 'Target'] = 3 # Obituary
df_train.loc[(df_train.Target == 11), 'Target'] = 4 # Sports

In [109]:
df_train.groupby(['Target','Label']).size()

Target  Label        
0       Business          75
1       Entertainment    153
2       Human Rights      86
3       Obituary         147
4       Sports            75
dtype: int64

In [110]:
df_test.groupby(['Target','Label']).size()

Target  Label         
10      Social Stories     36
11      Sports             16
13      Travel              2
2       Entertainment      28
3       Environmental      48
4       Health             10
5       Human Rights       24
6       Politics          154
7       Law and Order       8
9       Obituary            1
dtype: int64

In [111]:
# remove lower than 50 traing count SMRs
df_test = df_test[df_test.Label != 'Environmental']
df_test = df_test[df_test.Label != 'Health']
df_test = df_test[df_test.Label != 'Law and Order']
df_test = df_test[df_test.Label != 'Social Stories']
df_test = df_test[df_test.Label != 'Travel']
df_test = df_test[df_test.Label != 'Politics']

In [112]:
len(df_test)

69

In [113]:
df_test.Target = df_test['Target'].astype(int)

In [114]:
# Code the Targets 0-5
df_test.loc[(df_test.Target == 1), 'Target'] = 0 # Business
df_test.loc[(df_test.Target == 2), 'Target'] = 1 # Entertainment
df_test.loc[(df_test.Target == 5), 'Target'] = 2 # Human Rights
df_test.loc[(df_test.Target == 9), 'Target'] = 3 # Obituary
df_test.loc[(df_test.Target == 11), 'Target'] = 4 # Sports


In [115]:
df_test.groupby(['Target','Label']).size()

Target  Label        
1       Entertainment    28
2       Human Rights     24
3       Obituary          1
4       Sports           16
dtype: int64

In [116]:
scoring = {'acc': 'accuracy',
           'prec': 'precision_weighted',
           'recall': 'recall_weighted',
           'f1': 'f1_weighted'}

#test_ratio = 0.20
corpus = df_train['smrNouns'] + df_train['smrAdverbs']
# corpus = df_train['smrNouns'] + df_train['smrNER'] +df_train['smrAdverbs'] + df_test['smrAdjectives']
# corpus = df_train['smrNER'] + df_test['smrAdjectives']
corpus = corpus.fillna(value='')
vec = 'TFIDF'
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=3000)
data_vec = vectorizer.fit_transform(corpus)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_train['RecID']
y = df_train['Target']
y.index = df_train['RecID']
y=y.astype('int')
X = vec_dtm


In [117]:
clf = SVC(kernel='linear')
clf.fit(X, y)

SVC(kernel='linear')

In [118]:
# setup test data
corpus_test = df_test['smrNouns'] + df_test['smrAdverbs']
# corpus_test = df_test['smrNouns'] + df_test['smrNER'] + df_test['smrAdverbs'] + df_test['smrAdjectives']
# corpus_test = df_test['smrNER'] + df_test['smrAdjectives']
corpus_test = corpus_test.fillna(value='')
# vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=10000)
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1,2),max_features=3000)
data_vec = vectorizer.fit_transform(corpus_test)
vec_dtm = pd.DataFrame(data_vec.toarray(), columns=vectorizer.get_feature_names())
vec_dtm.index = df_test['RecID']
y_test = df_test['Target']
y_test.index = df_test['RecID']
y_test=y_test.astype('int')
X_test = vec_dtm

In [119]:
test_pred = clf.predict(X_test)

In [120]:
test_acc = accuracy_score(y_test, test_pred)
test_prec_recall_f1 = precision_recall_fscore_support(y_test, test_pred, average = 'weighted')

# For Prod:
result = ['GTxM Pass 4', 'SVM', 0,
        test_acc*100, test_prec_recall_f1[0]*100, test_prec_recall_f1[1]*100, test_prec_recall_f1[2]*100]

In [121]:
result

['GTxM Pass 4',
 'SVM',
 0,
 33.33333333333333,
 15.053763440860216,
 33.33333333333333,
 20.740740740740744]

#### Generate Confusion Matrix

In [122]:
svm_cm = confusion_matrix(y_test, test_pred)
# df_svm_cm = pd.DataFrame(svm_cm, 
#                         columns=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'],
#                         index=['Business','Entertainment','Human Rights','Politics','Obituary','Sports'])

# No Politics
df_svm_cm = pd.DataFrame(svm_cm, 
                        columns=['Business','Entertainment','Human Rights','Obituary','Sports'],
                        index=['Business','Entertainment','Human Rights','Obituary','Sports'])
df_svm_cm.to_csv('results/GTxM_Pass4/GTxM_Clf_SVM_Confusion_NoPolitics.csv')

In [34]:
#NOTE: The predictions for BERT and XLNet need to collected from GDrive, 
#placed in results\GTxM_Pass1 for continuing the code below.

#### Write the predictions

In [138]:
df_test_pred = pd.DataFrame(test_pred, columns=['SVMPred']).set_index(y_test.index)

In [139]:
df_test_pred

Unnamed: 0_level_0,SVMPred
RecID,Unnamed: 1_level_1
222818213392678912,3
1124056098925944832,3
1151389038781390848,3
1177679699369050112,1
1179050498428682240,3
...,...
1221819670442979328,3
1221833488321646593,3
1221866319420760069,1
1222183802459418630,3


In [140]:
df_pred = pd.concat([y_test.to_frame(),df_test_pred['SVMPred']], axis=1)

In [141]:
df_pred

Unnamed: 0_level_0,Target,SVMPred
RecID,Unnamed: 1_level_1,Unnamed: 2_level_1
222818213392678912,3,3
1124056098925944832,1,3
1151389038781390848,2,3
1177679699369050112,1,1
1179050498428682240,3,3
...,...,...
1221819670442979328,3,3
1221833488321646593,3,3
1221866319420760069,3,1
1222183802459418630,3,3


In [142]:
# Reverse the target codes
df_pred.loc[(df_pred.SVMPred == 5), 'SVMPred'] = 11 # Sports
df_pred.loc[(df_pred.SVMPred == 4), 'SVMPred'] = 9 # Obituary
df_pred.loc[(df_pred.SVMPred == 3), 'SVMPred'] = 6 # Politics
df_pred.loc[(df_pred.SVMPred == 2), 'SVMPred'] = 5 # Human Rights
df_pred.loc[(df_pred.SVMPred == 1), 'SVMPred'] = 2 # Entertainment
df_pred.loc[(df_pred.SVMPred == 0), 'SVMPred'] = 1 # Busines

df_pred.loc[(df_pred.Target == 5), 'Target'] = 11 # Sports
df_pred.loc[(df_pred.Target == 4), 'Target'] = 9 # Obituary
df_pred.loc[(df_pred.Target == 3), 'Target'] = 6 # Politics
df_pred.loc[(df_pred.Target == 2), 'Target'] = 5 # Human Rights
df_pred.loc[(df_pred.Target == 1), 'Target'] = 2 # Entertainment
df_pred.loc[(df_pred.Target == 0), 'Target'] = 1 # Busines

In [143]:
# convert the index named RecID to proper column
df_pred.reset_index(inplace=True)

In [144]:
df_pred

Unnamed: 0,RecID,Target,SVMPred
0,222818213392678912,6,6
1,1124056098925944832,2,6
2,1151389038781390848,5,6
3,1177679699369050112,2,2
4,1179050498428682240,6,6
...,...,...,...
218,1221819670442979328,6,6
219,1221833488321646593,6,6
220,1221866319420760069,6,2
221,1222183802459418630,6,6


In [145]:
df_pred.drop(['Target'], axis=1, inplace=True)

In [146]:
df_intercoder = pd.read_csv('results/GTxM_Pass4/GTxM_Intercoder_Pred_Pass4.csv', dtype='str')

In [147]:
df_intercoder

Unnamed: 0,RecID,Label,Target
0,1135851552495865857,Social Stories,10
1,1151389038781390848,Human Rights,5
2,1177679699369050112,Entertainment,2
3,1178962636278382592,Environmental,3
4,1179050498428682240,Politics,6
...,...,...,...
420,1196973332689833985,World Politics,12
421,1189332650185637888,World Politics,12
422,1214139981649862657,World Politics,12
423,1207820053414105089,World Politics,12


In [148]:
df_intercoder_updated = pd.merge(df_intercoder, df_pred, on=['RecID'], how='left')

In [149]:
df_intercoder_updated

Unnamed: 0,RecID,Label,Target,SVMPred
0,1135851552495865857,Social Stories,10,
1,1151389038781390848,Human Rights,5,6.0
2,1177679699369050112,Entertainment,2,2.0
3,1178962636278382592,Environmental,3,
4,1179050498428682240,Politics,6,6.0
...,...,...,...,...
420,1196973332689833985,World Politics,12,
421,1189332650185637888,World Politics,12,
422,1214139981649862657,World Politics,12,
423,1207820053414105089,World Politics,12,


In [150]:
# NOTE: -1 is used in the intercoder code for missing values
df_intercoder_updated.SVMPred = df_intercoder_updated.SVMPred.fillna(-1).astype(int)

In [151]:
df_intercoder_updated

Unnamed: 0,RecID,Label,Target,SVMPred
0,1135851552495865857,Social Stories,10,-1
1,1151389038781390848,Human Rights,5,6
2,1177679699369050112,Entertainment,2,2
3,1178962636278382592,Environmental,3,-1
4,1179050498428682240,Politics,6,6
...,...,...,...,...
420,1196973332689833985,World Politics,12,-1
421,1189332650185637888,World Politics,12,-1
422,1214139981649862657,World Politics,12,-1
423,1207820053414105089,World Politics,12,-1


In [152]:
df_intercoder_updated.to_csv('results/GTxM_Pass4/GTxM_Intercoder_Pred_Pass4.csv', index=False)

### Intercoder Reliability

#### Load Predictions

In [2]:
dfPass4 = pd.read_csv("results/GTxM_Pass4/GTxM_Intercoder_Pred_Pass4.csv", dtype='str')
len(dfPass4)

425

In [3]:
dfPass4.head()

Unnamed: 0,RecID,Label,Target,BERTPred,XLNetPred,T5Pred,BERT_PostCodingPred,BERT_PostCoding_NoPoliticsPred,XLNet_PostCodingPred,T5_PostCodingPred,T5_CGTlabeledPred
0,1135851552495865857,Social Stories,10,-1,-1,-1,-1,-1,-1,-1,-1
1,1151389038781390848,Human Rights,5,6,6,5,6,6,6,5,5
2,1177679699369050112,Entertainment,2,2,2,2,2,2,2,5,5
3,1178962636278382592,Environmental,3,-1,-1,-1,-1,-1,-1,-1,-1
4,1179050498428682240,Politics,6,6,6,6,6,-1,6,11,11


#### Get the agreement between XLNet, BERT and T5

In [4]:
def AgreeTargetWithA3(A1Target, A2Target, A3Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (A1Target == A3Target):
        AgreedTarget = A3Target
    if (AgreedTarget == '-1') and (A2Target == A3Target):
        AgreedTarget = A3Target
    if (AgreedTarget == '-1') and (A1Target == A2Target):
        AgreedTarget = A2Target
    return AgreedTarget

##### Calculate krippendorff's alpha for the agreement between XLNet, BERT and T5

In [5]:
dfPass4['AgreedTarget'] = dfPass4.apply(lambda x: AgreeTargetWithA3(x['T5Pred'], x['BERTPred'], x['XLNetPred']), axis=1)

In [6]:
dfPass4_GClf = dfPass4[dfPass4.AgreedTarget != '-1']
len(dfPass4_GClf)

203

In [7]:
# convert to horizontal array as expected by Krippendorff Alpha
XLNetPred = np.stack(dfPass4_GClf['XLNetPred'].astype("string"))
BERTPred = np.stack(dfPass4_GClf['BERTPred'].astype("string"))
T5Pred = np.stack(dfPass4_GClf['T5Pred'].astype("string"))
AgreedTarget = np.stack(dfPass4_GClf['AgreedTarget'].astype("string"))

In [8]:
missing = '-1'
#arr = np.array((TargetPred,AgreedTarget))
alpha1 = krippendorff_alpha(np.array((T5Pred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((BERTPred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((XLNetPred,AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3

(0.7896599766539896, 0.9351201176662853, 0.9687878906402785)

In [9]:
alpha = krippendorff_alpha(np.array((T5Pred,BERTPred,XLNetPred)), nominal_metric, missing_items=missing)
alpha

0.797006008350589

#### Get Krippendorff's Alpha between HO1 and GTxM Classifier and SBERT

In [10]:
def AgreeTargetWithHO1(A1Target, A2Target, HO1Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (A1Target == HO1Target):
        AgreedTarget = HO1Target
    if (AgreedTarget == '-1') and (A2Target == HO1Target):
        AgreedTarget = HO1Target
    return AgreedTarget

In [11]:
dfPass4.head()

Unnamed: 0,RecID,Label,Target,BERTPred,XLNetPred,T5Pred,BERT_PostCodingPred,BERT_PostCoding_NoPoliticsPred,XLNet_PostCodingPred,T5_PostCodingPred,T5_CGTlabeledPred,AgreedTarget
0,1135851552495865857,Social Stories,10,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1151389038781390848,Human Rights,5,6,6,5,6,6,6,5,5,6
2,1177679699369050112,Entertainment,2,2,2,2,2,2,2,5,5,2
3,1178962636278382592,Environmental,3,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1179050498428682240,Politics,6,6,6,6,6,-1,6,11,11,6


In [12]:
dfPass4.drop(['XLNetPred','BERTPred','T5Pred'], axis=1, inplace=True)

In [13]:
dfPass4.rename(columns={'AgreedTarget': 'GClfTarget'}, inplace=True)

In [14]:
dfPass4.head(2)

Unnamed: 0,RecID,Label,Target,BERT_PostCodingPred,BERT_PostCoding_NoPoliticsPred,XLNet_PostCodingPred,T5_PostCodingPred,T5_CGTlabeledPred,GClfTarget
0,1135851552495865857,Social Stories,10,-1,-1,-1,-1,-1,-1
1,1151389038781390848,Human Rights,5,6,6,6,5,5,6


In [15]:
df_jato_HO1_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass4_HO1/data/JatoClassified_HO1_Pass4.csv', usecols=['RecID','NewsPubCat'], dtype='str')

In [16]:
df_jato_HO1_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Politics
1,826262311560216578,Politics


In [17]:
Labels_Targets = pd.read_csv('data/Labels_TargetsV3.csv', dtype='str')

In [18]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [19]:
df_jato_HO1_Labels.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [20]:
df_HO1_Labels_Targets = pd.merge(df_jato_HO1_Labels,Labels_Targets, on='Label', how='left')

In [21]:
df_HO1_Labels_Targets

Unnamed: 0,RecID,Label,Target
0,222818213392678912,Politics,6
1,826262311560216578,Politics,6
2,833502973204459520,Entertainment,2
3,835347243020451840,Human Rights,5
4,867832469181128704,Entertainment,2
...,...,...,...
2380,1223262356496699394,Entertainment,2
2381,1223301592549556224,Politics,6
2382,1223302445889150976,Politics,6
2383,1223365339494453248,Politics,6


In [22]:
df_HO1_Labels_Targets.rename(columns={'Label': 'HO1Label', 'Target': 'HO1Target'}, inplace=True)

In [23]:
df_HO1_GClf = pd.merge(dfPass4, df_HO1_Labels_Targets, on='RecID', how='left')

In [24]:
df_HO1_GClf

Unnamed: 0,RecID,Label,Target,BERT_PostCodingPred,BERT_PostCoding_NoPoliticsPred,XLNet_PostCodingPred,T5_PostCodingPred,T5_CGTlabeledPred,GClfTarget,HO1Label,HO1Target
0,1135851552495865857,Social Stories,10,-1,-1,-1,-1,-1,-1,Politics,6
1,1151389038781390848,Human Rights,5,6,6,6,5,5,6,Human Rights,5
2,1177679699369050112,Entertainment,2,2,2,2,5,5,2,Entertainment,2
3,1178962636278382592,Environmental,3,-1,-1,-1,-1,-1,-1,Unknown,0
4,1179050498428682240,Politics,6,6,-1,6,11,11,6,Politics,6
...,...,...,...,...,...,...,...,...,...,...,...
420,1196973332689833985,World Politics,12,-1,-1,-1,-1,-1,-1,World Politics,12
421,1189332650185637888,World Politics,12,-1,-1,-1,-1,-1,-1,Human Rights,5
422,1214139981649862657,World Politics,12,-1,-1,-1,-1,-1,-1,World Politics,12
423,1207820053414105089,World Politics,12,-1,-1,-1,-1,-1,-1,Human Rights,5


In [25]:
df_HO1_GClf.rename(columns={'Target': 'SBERTTarget', 'Label': 'SBERTLabel'}, inplace=True)

In [26]:
df_HO1_GClf.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,BERT_PostCodingPred,BERT_PostCoding_NoPoliticsPred,XLNet_PostCodingPred,T5_PostCodingPred,T5_CGTlabeledPred,GClfTarget,HO1Label,HO1Target
0,1135851552495865857,Social Stories,10,-1,-1,-1,-1,-1,-1,Politics,6
1,1151389038781390848,Human Rights,5,6,6,6,5,5,6,Human Rights,5


In [27]:
# set HO1Target's Unknown to -1 for Krippendroff's Alpha calculation
df_HO1_GClf.loc[(df_HO1_GClf.HO1Target == '0'), 'HO1Target'] = '-1'

In [28]:
df_HO1_GClf['HO1AgreedTarget'] = df_HO1_GClf.apply(lambda x: AgreeTargetWithHO1(x['SBERTTarget'], x['GClfTarget'], x['HO1Target']), axis=1)

In [29]:
HO1Pred = np.stack(df_HO1_GClf['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO1_GClf['GClfTarget'].astype("string"))
SBERTPred = np.stack(df_HO1_GClf['SBERTTarget'].astype("string"))
HO1AgreedTarget = np.stack(df_HO1_GClf['HO1AgreedTarget'].astype("string"))

In [30]:
missing = '-1'
alpha1 = krippendorff_alpha(np.array((HO1Pred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((GClfPred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((SBERTPred,HO1AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3

(0.6748113230760577, 0.44030872650317443, 0.5494494583601528)

#### Get HO1 K-Alpha with SBERT subset only

In [32]:
alpha = krippendorff_alpha(np.array((HO1Pred,SBERTPred)), nominal_metric, missing_items=missing)
alpha

0.5529509091229121

#### Get HO1 K-Alpha with GClf subset only

In [34]:
df_HO1_GClf_subset = df_HO1_GClf[df_HO1_GClf.GClfTarget != '-1']
len(df_HO1_GClf_subset)

203

In [38]:
HO1Pred = np.stack(df_HO1_GClf_subset['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO1_GClf_subset['GClfTarget'].astype("string"))
SBERTPred = np.stack(df_HO1_GClf_subset['SBERTTarget'].astype("string"))

In [41]:
alpha = krippendorff_alpha(np.array((HO1Pred,GClfPred,SBERTPred)), nominal_metric, missing_items=missing)
alpha

0.5531189734010941

#### Get Ground Truth Data

In [28]:
df_HO1_GClf_GTD = df_HO1_GClf[df_HO1_GClf.HO1AgreedTarget != '-1']
len(df_HO1_GClf_GTD)

276

In [29]:
df_Pass4_HO1_GTD = df_HO1_GClf_GTD[['RecID','HO1Label','HO1Target']]

In [30]:
df_Pass4_HO1_GTD.rename(columns={'HO1Target': 'Target', 'HO1Label': 'Label'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Pass4_HO1_GTD.rename(columns={'HO1Target': 'Target', 'HO1Label': 'Label'}, inplace=True)


In [31]:
df_Pass4_HO1_GTD.head()

Unnamed: 0,RecID,Label,Target
1,1151389038781390848,Human Rights,5
2,1177679699369050112,Entertainment,2
4,1179050498428682240,Politics,6
5,1179429715867787264,Human Rights,5
6,1179660469759438849,World Politics,12


In [None]:
# df_HO2HO1_GTD = df_HO2HO1_Labels[df_HO2HO1_Labels.HO2AgreedTarget != '-1']
# len(df_HO2HO1_GTD)

In [33]:
df_HO1_To_HO2 = df_HO1_GClf[df_HO1_GClf.HO1AgreedTarget == '-1']
len(df_HO1_To_HO2)

149

In [34]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1135851552495865857,Social Stories,10,-1,Politics,6,-1
3,1178962636278382592,Environmental,3,-1,Unknown,-1,-1


In [68]:
# Use the RecIDs in this file to extract JatoMaster from MasterTokens in SQL-SERVER and save into data/GTxM_Pass2/
df_HO1_To_HO2.to_csv('data/GTxM_Pass4/df_HO1_To_HO2.csv', index=False)

#### Acquire new GTD from HO1 and GClf agreement

In [35]:
df_HO1_GClf_GTD.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
1,1151389038781390848,Human Rights,5,6,Human Rights,5,5
2,1177679699369050112,Entertainment,2,2,Entertainment,2,2


In [36]:
df_HO1_GClf_GTD.drop(['SBERTLabel','SBERTTarget','GClfTarget','HO1AgreedTarget'], axis=1, inplace=True)
df_HO1_GClf_GTD.rename(columns={'HO1Label': 'Label', 'HO1Target': 'Target'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_HO1_GClf_GTD.drop(['SBERTLabel','SBERTTarget','GClfTarget','HO1AgreedTarget'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_HO1_GClf_GTD.rename(columns={'HO1Label': 'Label', 'HO1Target': 'Target'}, inplace=True)


In [37]:
df_HO1_GClf_GTD.head(2)

Unnamed: 0,RecID,Label,Target
1,1151389038781390848,Human Rights,5
2,1177679699369050112,Entertainment,2


In [75]:
df_HO1_GClf_GTD.to_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_HO1.csv', index=False)

#### Generate JatoMaster for HO2 Labeling

In [60]:
df_master_tokens = pd.read_csv('data/MasterTokens.csv', dtype='str')

In [62]:
df_jato_HO2_tokens = pd.merge(df_master_tokens,df_HO1_To_HO2['RecID'], on='RecID')
len(df_jato_HO2_tokens)

284

In [63]:
df_jato_HO2_tokens.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,285,1135851552495865857,"'I Don't Know Prince Andrew,' Trump Says. Phot...","????????On Day 2 of the #USStateVisit, The Duk...",115.0,trump2020 usstatevisit realtalk liarinchief pe...,10downingstreet teram323tere fox5atlanta thedu...,duke york donald trump st jame palac uk us tru...,duke york prime minist presid donald trump st ...,clarifi surpris jump hear rememb mean swear fi...,usual besid cours anymor though enough clearli...,person dumber truth proud profession polit hug...,"????????On Day 2 of the #USStateVisit, The Duk...",????????On Day 2 of the The Duke of York Prime...
1,334,1151389038781390848,"Naga Munchetty, BBC News Anchor, Has Reprimand...","""I've been told as a woman of colour to 'go ho...",260.0,racist britains istandwithnaga trump2020 faken...,bbcworld foxnew washingtonpost jam99percent sp...,trump truth welldon speak faeifa fiffaeifa fif...,woman colour experi reaction comment presid tr...,share discu suppress speak experi listen trump...,home probabl freeli everytim regularli total s...,faeifaei anti trump fals sad bad vile pervert ...,"""I've been told as a woman of colour to 'go ho...","""I've been told as a woman of colour to 'go ho..."


In [64]:
df_jato_HO2_tokens.drop(['smrAdverbs','smrAdjectives'], axis=1, inplace=True)

In [67]:
df_jato_HO2_tokens['PubTitle'] = df_jato_HO2_tokens['PubTitle'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['RecDoc'] = df_jato_HO2_tokens['RecDoc'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['smrTopText'] = df_jato_HO2_tokens['smrTopText'].apply(removeSpChar4Jato)
df_jato_HO2_tokens['smrSummary'] = df_jato_HO2_tokens['smrSummary'].apply(removeSpChar4Jato)

In [68]:
df_jato_HO2_tokens['PubTitle'] = df_jato_HO2_tokens['PubTitle'].apply(cleanupText4Jato)
df_jato_HO2_tokens['RecDoc'] = df_jato_HO2_tokens['RecDoc'].apply(cleanupText4Jato)
df_jato_HO2_tokens['smrTopText'] = df_jato_HO2_tokens['smrTopText'].apply(cleanupText4Jato)
df_jato_HO2_tokens['smrSummary'] = df_jato_HO2_tokens['smrSummary'].apply(cleanupText4Jato)

In [69]:
# save the file to D:\KOPro\PhD\Implementation\SourceCode\JatoPass2\data -- 
df_jato_HO2_tokens.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass4_HO2/data/JatoMaster.csv', index=False)

#### Generate JatoClassified for HO2

In [70]:
df_JatoClf_Blank = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass2_HO2/data/JatoClassified_BlankCat.csv')

In [71]:
df_JatoClf_Blank.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown


In [72]:
len(df_JatoClf_Blank)

2385

In [82]:
df_JatoCl_Pass0_in_4 = pd.read_csv('data/GTxM_Pass0/JatoClassified_HO2_Pass0_in_Pass4_task.csv')

In [83]:
df_JatoCl_Pass0_in_4.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,557,1179660469759438849,2021:02:05 11:57:33,Political,Debate,,,,,,,Politics
1,669,1181047749678161920,2021:02:05 11:57:33,Violent Material,War Crimes,Political,Debate,,,,,Law and Order


In [84]:
len(df_JatoCl_Pass0_in_4)

60

In [85]:
df_JatoCl_Pass4_dedup = pd.concat([df_JatoClf_Blank,df_JatoCl_Pass0_in_4], axis=0)

In [89]:
len(df_JatoCl_Pass4_dedup)

2445

In [90]:
df_JatoCl_Pass4_dedup.drop_duplicates(subset='RecID', keep=False, inplace=True)

In [91]:
len(df_JatoCl_Pass4_dedup)

2325

In [92]:
# re-concatenate the datasets to add the new subset only
df_JatoCl_Pass4 = pd.concat([df_JatoCl_Pass4_dedup,df_JatoCl_Pass0_in_4], axis=0)

In [93]:
df_JatoCl_Pass4

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown
2,63,833502973204459520,2023:03:16 20:23:22,Lifestyle,Music,,,,,,,Unknown
3,64,835347243020451840,2021:01:18 15:17:18,Abusive Material,Hate Speech,,,,,,,Unknown
4,69,867832469181128704,2021:02:03 13:42:38,Lifestyle,Music,,,,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
55,4143,1218564148172394496,2021:02:12 13:06:39,News,Weather,,,,,weather,,Social Stories
56,4165,1218849631016230913,2021:02:12 22:14:28,Violent Material,Violence,Conversational,,,,,,Social Stories
57,4214,1219299748248682496,2021:02:13 10:45:33,Political,Debate,,,,,,,Unknown
58,4373,1220439910378889216,2021:02:13 10:45:33,Conversational,Negative,,,,,,,Social Stories


In [94]:
df_JatoCl_Pass4.to_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass4_HO2/data/JatoClassified.csv', index=False)

### Intercoder Reliability btw HO2, HO1 and GTxM Classifier

#### Get HO2 labels After running Jato

In [77]:
# use the RecIDs 
df_HO1_To_HO2 = pd.read_csv('data/GTxM_Pass4/df_HO1_To_HO2.csv', dtype='str')

In [38]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1135851552495865857,Social Stories,10,-1,Politics,6,-1
3,1178962636278382592,Environmental,3,-1,Unknown,-1,-1


In [39]:
len(df_HO1_To_HO2)

149

In [40]:
df_jato_HO2_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass4_HO2/data/JatoClassified_HO2_Pass4.csv', usecols=['RecID','NewsPubCat'], dtype='str')
df_jato_HO2_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Unknown
1,826262311560216578,Unknown


In [41]:
Labels_Targets = pd.read_csv('data/Labels_TargetsV3.csv', dtype='str')

In [42]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [43]:
df_jato_HO2_Labels.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [44]:
df_HO2_Labels_Targets = pd.merge(df_jato_HO2_Labels,Labels_Targets, on='Label', how='left')

In [45]:
df_HO2_Labels_Targets

Unnamed: 0,RecID,Label,Target
0,222818213392678912,Unknown,0
1,826262311560216578,Unknown,0
2,833502973204459520,Unknown,0
3,835347243020451840,Unknown,0
4,867832469181128704,Unknown,0
...,...,...,...
2380,1218564148172394496,Social Stories,10
2381,1218849631016230913,Social Stories,10
2382,1219299748248682496,Unknown,0
2383,1220439910378889216,Social Stories,10


In [46]:
df_HO2_Labels_Targets.rename(columns={'Label': 'HO2Label', 'Target': 'HO2Target'}, inplace=True)

In [47]:
df_HO2HO1_Labels = pd.merge(df_HO2_Labels_Targets,df_HO1_To_HO2, on='RecID')
len(df_HO2HO1_Labels)

149

In [48]:
df_HO2HO1_Labels.head()

Unnamed: 0,RecID,HO2Label,HO2Target,SBERTLabel,SBERTTarget,GClfTarget,HO1Label,HO1Target,HO1AgreedTarget
0,1135851552495865857,World Politics,12,Social Stories,10,-1,Politics,6,-1
1,1178962636278382592,Social Stories,10,Environmental,3,-1,Unknown,-1,-1
2,1180604548596916224,Law and Order,7,Law and Order,7,-1,Human Rights,5,-1
3,1181140315635376128,Social Stories,10,Sports,11,-1,Unknown,-1,-1
4,1181281583078223872,World Politics,12,World Politics,12,-1,Politics,6,-1


In [49]:
df_HO2HO1_Labels.drop(['HO1AgreedTarget'], axis=1, inplace=True)

#### Compute Krippendorff's Alpha for HO2, HO3, GClf

In [50]:
def AgreeTargetWithHO2(A1Target, A2Target, HO1Target, HO2Target):
    AgreedTarget = '-1'
    if (AgreedTarget == '-1') and (HO1Target == HO2Target):
        AgreedTarget = HO2Target
    if (AgreedTarget == '-1') and (A2Target == HO2Target):
        AgreedTarget = HO2Target
    if (AgreedTarget == '-1') and (A2Target == HO2Target):
        AgreedTarget = HO2Target
    return AgreedTarget

In [51]:
# set HO1Target's Unknown to -1 for Krippendroff's Alpha calculation
df_HO2HO1_Labels.loc[(df_HO2HO1_Labels.HO2Target == '0'), 'HO2Target'] = '-1'

In [52]:
df_HO2HO1_Labels['HO2AgreedTarget'] = df_HO2HO1_Labels.apply(lambda x: AgreeTargetWithHO2(x['GClfTarget'], x['SBERTTarget'], x['HO1Target'], x['HO2Target']), axis=1)

In [53]:
# convert to horizontal array as expected by Krippendorff Alpha
HO2Pred = np.stack(df_HO2HO1_Labels['HO2Target'].astype("string"))
HO1Pred = np.stack(df_HO2HO1_Labels['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO2HO1_Labels['GClfTarget'].astype("string"))
SBERTPred = np.stack(df_HO2HO1_Labels['SBERTTarget'].astype("string"))
HO2AgreedTarget = np.stack(df_HO2HO1_Labels['HO2AgreedTarget'].astype("string"))

In [86]:
missing = '-1'
alpha1 = krippendorff_alpha(np.array((HO2Pred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha2 = krippendorff_alpha(np.array((HO1Pred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha3 = krippendorff_alpha(np.array((GClfPred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha4 = krippendorff_alpha(np.array((SBERTPred,HO2AgreedTarget)), nominal_metric, missing_items=missing)
alpha1, alpha2, alpha3, alpha4

(0.647316299173825,
 0.33362155930249315,
 -0.01646657720068201,
 0.18474002611731466)

#### Acquire new GTD from HO2 agreement

In [54]:
df_HO2HO1_GTD = df_HO2HO1_Labels[df_HO2HO1_Labels.HO2AgreedTarget != '-1']
len(df_HO2HO1_GTD)

91

In [55]:
df_Pass4_HO2_GTD = df_HO2HO1_GTD[['RecID','HO2Label','HO2Target']]
df_Pass4_HO2_GTD.rename(columns={'HO2Target': 'Target', 'HO2Label': 'Label'}, inplace=True)
df_Pass4_HO2_GTD.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Pass4_HO2_GTD.rename(columns={'HO2Target': 'Target', 'HO2Label': 'Label'}, inplace=True)


Unnamed: 0,RecID,Label,Target
2,1180604548596916224,Law and Order,7
4,1181281583078223872,World Politics,12


In [56]:
df_Pass4_GTD = pd.concat([df_Pass4_HO1_GTD, df_Pass4_HO2_GTD], axis=0)
len(df_Pass4_GTD)

367

In [57]:
df_Pass4_GTD.groupby(['Target','Label']).size()

Target  Label         
10      Social Stories     27
11      Sports              9
12      World Politics     70
2       Entertainment      23
3       Environmental      38
4       Health             13
5       Human Rights       29
6       Politics          152
7       Law and Order       5
9       Obituary            1
dtype: int64

In [58]:
df_Pass4_GTD_WPolitics = df_Pass4_GTD[df_Pass4_GTD.Label == 'World Politics']
len(df_Pass4_GTD_WPolitics)

70

In [99]:
df_Pass4_GTD_WPolitics.to_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_WPolitics.csv', index=False)

In [59]:
df_Pass4_GTD_New = df_Pass4_GTD[df_Pass4_GTD.Label != 'World Politics']
len(df_Pass4_GTD_New)

297

In [102]:
df_Pass4_GTD_New.to_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_New.csv', index=False)

In [60]:
df_Pass3_GTD_UpTodate = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv', dtype='str')
len(df_Pass3_GTD_UpTodate)

1057

In [61]:
df_Pass4_GTD_UpTodate = pd.concat([df_Pass3_GTD_UpTodate, df_Pass4_GTD_New], axis=0)
len(df_Pass4_GTD_UpTodate)

1354

In [62]:
df_Pass4_GTD_UpTodate.groupby(['Target','Label']).size()

Target  Label         
1       Business           75
10      Social Stories     59
11      Sports             84
2       Entertainment     176
3       Environmental      55
4       Health             17
5       Human Rights      115
6       Politics          608
7       Law and Order      17
9       Obituary          148
dtype: int64

In [106]:
df_Pass4_GTD_UpTodate.to_csv('data/GTxM_Pass4/GTxM_Pass4_GTD_UpTodate.csv', index=False)

In [63]:
df_Pass4_Reject = df_HO2HO1_Labels[df_HO2HO1_Labels.HO2AgreedTarget == '-1']
len(df_Pass4_Reject)

58

In [68]:
df_Pass4_Reject.to_csv('df_Pass4_Reject.csv')

In [64]:
df_Pass4_Reject = df_Pass4_Reject['RecID']

In [65]:
# Add World Politics to Pass 4's Reject
df_Pass4_Reject_New = pd.concat([df_Pass4_Reject, (df_Pass4_GTD_WPolitics['RecID'])], axis=0)
len(df_Pass4_Reject_New)

128

In [115]:
df_Pass4_Reject_New.to_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_New.csv', index=False)

In [66]:
df_Pass3_Reject_UpTodate = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_Reject_UpTodate.csv', dtype='str')
len(df_Pass3_Reject_UpTodate)

223

In [67]:
df_Pass4_Reject_UpTodate = pd.concat([df_Pass3_Reject_UpTodate, df_Pass4_Reject_New], axis=0)
len(df_Pass4_Reject_UpTodate)

351

In [118]:
df_Pass4_Reject_UpTodate.to_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_UpTodate.csv', index=False)

In [92]:
# Update 17-Jul-2023
# Duplicate discovered in df_Pass4_Reject, reduced from 128 to 127 (a Social Stories SMR)
# Fix: update the reject files
df_Pass4_Reject_New = pd.read_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_New.csv', dtype='str')
len(df_Pass4_Reject_New)

128

In [94]:
df_Pass4_Reject_New = df_Pass4_Reject_New.drop_duplicates(['RecID'])
len(df_Pass4_Reject_New)

127

In [95]:
df_Pass4_Reject_New.to_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_New.csv', index=False)

In [96]:
df_Pass3_Reject_UpTodate = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_Reject_UpTodate.csv', dtype='str')
len(df_Pass3_Reject_UpTodate)

223

In [97]:
df_Pass4_Reject_UpTodate = pd.concat([df_Pass3_Reject_UpTodate, df_Pass4_Reject_New], axis=0)
len(df_Pass4_Reject_UpTodate)

350

In [98]:
df_Pass4_Reject_UpTodate.to_csv('data/GTxM_Pass4/GTxM_Pass4_Reject_UpTodate.csv', index=False)