### Setup Env

In [1]:
# credit: https://github.com/grrrr/krippendorff-alpha (Thomas Grill)
# downloaded: https://github.com/grrrr/krippendorff-alpha/blob/master/krippendorff_alpha.py

In [4]:
# krippendorff_alpha.py must be locally stored in the same folder as this notebook
from krippendorff_alpha import *

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def removeSpCharLine(text):
    # remove all non ASCII characters and line breaks.
    # credit: https://stackoverflow.com/questions/2758921/regular-expression-that-finds-and-replaces-non-ascii-characters-with-python
    text.replace('\n', ' ')
    text.replace('\r', ' ')
    text = re.sub(r"[\u0080-\uFFFF]", " ", text) #see ASCII list: https://www.asciitable.com/
    text = " ".join(text.split()) # replace multiple spaces with a single space
    return(text)

### Prepare Intercoder DataFrame

In [7]:
# Use the result of individual algorithm experiments to create a unified  GTxM Classifier prediction
# SVM: GTxM/GTxM_Pass2_SVM.ipynb
# BERT: Colab/
# XLNet: Colab/

In [8]:
df = pd.read_csv("results/GTxM_Pass2/GClf_Intercoder_Pred.csv", dtype='str')

In [9]:
len(df)

407

In [10]:
df.head()

Unnamed: 0,RecID,Label,Target,SVMPred,BERTPred,XLNetPred
0,1223365339494453248,Politics,6,6,-1,-1
1,1222952347548164098,Politics,6,6,6,6
2,1222288749813518339,Politics,6,6,6,6
3,1222281539100250114,Entertainment,2,6,-1,-1
4,1222242112307187712,Obituary,9,6,9,9


### Get the subset with XLNet and BERT agreement

In [11]:
df_AgreeXLNetBERT = df[(df.XLNetPred == df.BERTPred) & (df.XLNetPred != '-1')]

In [12]:
len(df_AgreeXLNetBERT)

266

In [13]:
df_AgreeXLNetBERT.to_csv('df_AgreeXLNetBERT.csv')

In [14]:
df_AgreeXLNetBERT.head(2)

Unnamed: 0,RecID,Label,Target,SVMPred,BERTPred,XLNetPred
1,1222952347548164098,Politics,6,6,6,6
2,1222288749813518339,Politics,6,6,6,6


### Find krippendorff's alpha for the agreement between XLNet and BERT

In [15]:
# convert to horizontal array as expected by Krippendorff Alpha
XLNetPred = np.stack(df['XLNetPred'].astype("string"))
BERTPred = np.stack(df['BERTPred'].astype("string"))

In [16]:
arr = np.array((XLNetPred,BERTPred))
missing = '-1'
alpha1 = krippendorff_alpha(arr, nominal_metric, missing_items=missing)
alpha1

0.9211240586926798

### Get the subset with XLNet or BERT agreement with SVM

In [17]:
def AgreeTargetWithA3(A1Target, A2Target, A3Target):
    AgreedTarget = -1
    if (A1Target == A3Target):
        AgreedTarget = A3Target
    if (A2Target == A3Target):
        AgreedTarget = A3Target
    return AgreedTarget

In [18]:
df2 = df[df.SVMPred != '-1']

In [19]:
len(df2)

317

In [20]:
df2.head(2)

Unnamed: 0,RecID,Label,Target,SVMPred,BERTPred,XLNetPred
0,1223365339494453248,Politics,6,6,-1,-1
1,1222952347548164098,Politics,6,6,6,6


### Find krippendorff's alpha for the agreement between (XLNet or BERT) and SVM

In [21]:
df2['AgreedTarget'] = df2.apply(lambda x: AgreeTargetWithA3(x['XLNetPred'], x['BERTPred'], x['SVMPred']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['AgreedTarget'] = df2.apply(lambda x: AgreeTargetWithA3(x['XLNetPred'], x['BERTPred'], x['SVMPred']), axis=1)


In [22]:
# convert to horizontal array as expected by Krippendorff Alpha
SVMPred = np.stack(df2['SVMPred'].astype("string"))
AgreedTarget = np.stack(df2['AgreedTarget'].astype("string"))

In [23]:
missing = '-1'
arr = np.array((SVMPred,AgreedTarget))
alpha2 = krippendorff_alpha(arr, nominal_metric, missing_items=missing)
alpha2

0.058255070096637995

### Get HO1 labels After running Jato

In [24]:
# use the RecIDs to generate the JatoMaster.csv in SQL-Server (subset of MasterTokens)
df_RecID = df['RecID']

In [25]:
df_jato_HO1_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass2/data/JatoClassified_HO1_Pass2.csv', dtype='str')

In [26]:
df_jato_HO1_Labels.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6.0,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Politics
1,60.0,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Politics


In [27]:
len(df_jato_HO1_Labels)

2385

In [28]:
df_jato_HO1_Labels.drop(['rowid','SavedDataTime','JatoCat','JatoSubCat','JatoCat2','JatoSubCat2','JatoCat3','JatoSubCat3','NewRecClass','GTCodes'], axis=1, inplace=True)

In [29]:
df_jato_HO1_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Politics
1,826262311560216578,Politics


In [30]:
len(df_RecID)

407

In [31]:
df_HO1_Labels = pd.merge(df_jato_HO1_Labels,df_RecID, on='RecID')

In [32]:
df_HO1_Labels.head()

Unnamed: 0,RecID,NewsPubCat
0,833502973204459520,Entertainment
1,1175105751846510592,Environmental
2,1178650857430274049,Human Rights
3,1178745116758020097,Social Stories
4,1178756273124335624,Obituary


In [33]:
len(df_HO1_Labels)

407

In [34]:
df_HO1_Labels.to_csv('df_HO1_Labels.csv')

In [35]:
Labels_Targets = pd.read_csv('data/Labels_TargetsV2.csv')

In [36]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [37]:
Labels_Targets.rename(columns={'Label': 'NewsPubCat'}, inplace=True)

In [38]:
df_HO1_Label_Target = pd.merge(df_HO1_Labels,Labels_Targets, on='NewsPubCat', how='left')

In [39]:
df_HO1_Label_Target

Unnamed: 0,RecID,NewsPubCat,Target
0,833502973204459520,Entertainment,2
1,1175105751846510592,Environmental,3
2,1178650857430274049,Human Rights,5
3,1178745116758020097,Social Stories,10
4,1178756273124335624,Obituary,9
...,...,...,...
402,1222872177604775937,Politics,6
403,1222910064236744704,Politics,6
404,1222952347548164098,Politics,6
405,1223302445889150976,Politics,6


In [40]:
df_HO1_Label_Target.rename(columns={'NewsPubCat': 'HO1Label'}, inplace=True)

In [41]:
df_HO1_Label_Target['Target'] = df_HO1_Label_Target['Target'].astype('Int64')
df_HO1_Label_Target.rename(columns={'Target': 'HO1Target'}, inplace=True)

In [42]:
df_HO1_Label_Target.head(2)

Unnamed: 0,RecID,HO1Label,HO1Target
0,833502973204459520,Entertainment,2
1,1175105751846510592,Environmental,3


In [68]:
df_HO1_Label_Target.to_csv('data/GTxM_Pass2/JatoHO1_Labels.csv')

### Get Krippendorff's Alpha between HO1 and GTxM Classifier (the accepted XLNet/BERT subset only)

In [43]:
df_AgreeXLNetBERT.head(2)

Unnamed: 0,RecID,Label,Target,SVMPred,BERTPred,XLNetPred
1,1222952347548164098,Politics,6,6,6,6
2,1222288749813518339,Politics,6,6,6,6


In [44]:
len(df_AgreeXLNetBERT)

266

In [45]:
df_AgreeXLNetBERT.drop(['Label','Target','SVMPred','BERTPred'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [46]:
df_AgreeXLNetBERT.rename(columns={'XLNetPred': 'GClfTarget'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [47]:
df_AgreeXLNetBERT.head(2)

Unnamed: 0,RecID,GClfTarget
1,1222952347548164098,6
2,1222288749813518339,6


In [48]:
df_HO1_GClf = pd.merge(df_HO1_Label_Target,df_AgreeXLNetBERT, on='RecID', how='left')

In [49]:
df_HO1_GClf

Unnamed: 0,RecID,HO1Label,HO1Target,GClfTarget
0,833502973204459520,Entertainment,2,9
1,1175105751846510592,Environmental,3,1
2,1178650857430274049,Human Rights,5,2
3,1178745116758020097,Social Stories,10,
4,1178756273124335624,Obituary,9,9
...,...,...,...,...
402,1222872177604775937,Politics,6,6
403,1222910064236744704,Politics,6,6
404,1222952347548164098,Politics,6,6
405,1223302445889150976,Politics,6,


In [50]:
df_HO1_GClf_Pred = df_HO1_GClf[(df_HO1_GClf.GClfTarget.notna())]

In [51]:
HO1Pred = np.stack(df_HO1_GClf_Pred['HO1Target'].astype("string"))
GClfPred = np.stack(df_HO1_GClf_Pred['GClfTarget'].astype("string"))

In [52]:
missing = '0'
arr = np.array((HO1Pred,GClfPred))
alpha3 = krippendorff_alpha(arr, nominal_metric, missing_items=missing)
alpha3

0.6052469070079427

In [53]:
df_HO1_GClf_Pred.to_csv('df_HO1_GClf_Pred.csv')

In [54]:
df_HO1_GClf_GTD = df_HO1_GClf[(df_HO1_GClf.GClfTarget.astype("string")==df_HO1_GClf.HO1Target.astype("string"))]

In [55]:
len(df_HO1_GClf_GTD)

156

In [56]:
df_HO1_To_HO2 = df_HO1_GClf[(df_HO1_GClf.GClfTarget.isna()) | (df_HO1_GClf.GClfTarget.astype("string")!=df_HO1_GClf.HO1Target.astype("string"))]

In [57]:
len(df_HO1_To_HO2)

251

In [58]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,HO1Label,HO1Target,GClfTarget
0,833502973204459520,Entertainment,2,9
1,1175105751846510592,Environmental,3,1


In [59]:
# Use the RecIDs in this file to extract JatoMaster from MasterTokens in SQL-SERVER and save into data/GTxM_Pass2/
df_HO1_To_HO2.to_csv('data/GTxM_Pass2/df_HO1_To_HO2.csv', index=False)

### Acquire new GTD from HO1 and GClf agreement

In [61]:
df_HO1_GClf_GTD

Unnamed: 0,RecID,HO1Label,HO1Target,GClfTarget
4,1178756273124335624,Obituary,9,9
5,1178805608331526145,Politics,6,6
11,1179746754058739712,Politics,6,6
13,1180079141087055872,Politics,6,6
14,1180195222212562946,Politics,6,6
...,...,...,...,...
394,1222142443463663616,Politics,6,6
399,1222288749813518339,Politics,6,6
402,1222872177604775937,Politics,6,6
403,1222910064236744704,Politics,6,6


In [62]:
df_HO1_GClf_GTD.drop(['GClfTarget'], axis=1, inplace=True)
df_HO1_GClf_GTD.rename(columns={'HO1Label': 'Label', 'HO1Target': 'Target'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [63]:
df_HO1_GClf_GTD.head(2)

Unnamed: 0,RecID,Label,Target
4,1178756273124335624,Obituary,9
5,1178805608331526145,Politics,6


In [110]:
df_HO1_GClf_GTD.to_csv('data/GTxM_Pass2/GTxM_Pass2_GTD_HO1_GClf.csv', index=False)

### Generate JatoMaster for HO2 Labeling

In [64]:
df_HO2_RecID = df_HO1_To_HO2['RecID']

In [65]:
df_master_tokens = pd.read_csv('data/MasterTokens.csv', dtype='str')

In [66]:
df_jato_HO2_tokens = pd.merge(df_master_tokens,df_HO2_RecID, on='RecID')

In [67]:
len(df_jato_HO2_tokens)

251

In [68]:
df_jato_HO2_tokens.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,63,833502973204459520,Ringo Starr: Abbey Road wasn't meant to be The...,Thanks for coming over man and playing Great ...,13.0,,ringostarrmusic,ringo paul fn fn fn fn fn fn fyifyifyifyifyify...,man bass peac love photo memori band music guy...,play love man love order live play love beat f...,togeth forev especi nearli,great great greatest happi love ador beauti wo...,Thanks for coming over man and playing Great ...,Summarization skipped (text is 1000 characters...
1,437,1175105751846510592,Aarey protests: Supreme Court steps in to save...,We at #MMRC plant trees &amp; build @MumbaiMet...,37.0,infrastructure climatechange shame saveaarey m...,cmomaharashtra ashwinibhide authackeray fayeds...,anuanc anoan ancan anoanian anian bkc midc kal...,plant tree cut tree suit citi tree tree wel an...,build build mitig plant plant geotag maintain ...,newli kindli late ahead kindli properli fulli ...,present futur local inhuman small depend truth...,We at #MMRC plant trees &amp; build @MumbaiMet...,We at plant trees build also.\nWe do need 2 cu...


In [69]:
df_jato_HO2_tokens.drop(['smrAdverbs','smrAdjectives'], axis=1, inplace=True)

In [70]:
df_jato_HO2_tokens['PubTitle'] = df_jato_HO2_tokens['PubTitle'].apply(removeSpCharLine)
df_jato_HO2_tokens['RecDoc'] = df_jato_HO2_tokens['RecDoc'].apply(removeSpCharLine)
df_jato_HO2_tokens['smrTopText'] = df_jato_HO2_tokens['smrTopText'].apply(removeSpCharLine)
df_jato_HO2_tokens['smrSummary'] = df_jato_HO2_tokens['smrSummary'].apply(removeSpCharLine)

In [69]:
# save the file to D:\KOPro\PhD\Implementation\SourceCode\JatoPass2\data -- 
df_jato_HO2_tokens.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass2_HO2/data/JatoMaster.csv', index=False)

### Generate JatoClassified for HO2

In [65]:
df_JatoClf_Blank = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass2_HO2/data/JatoClassified_BlankCat.csv')

In [66]:
df_JatoClf_Blank.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown


In [67]:
len(df_JatoClf_Blank)

2385

In [68]:
df_JatoCl_Pass0_in_2 = pd.read_csv('data/GTxM_Pass0/JatoClassified_HO2_Pass0_in_Pass2_task.csv')

In [69]:
df_JatoCl_Pass0_in_2.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,747,1181547287014789120,2021:02:06 10:19:04,Business,Complaint,Business,Technology,,,,,Politics
1,797,1181864136155828224,2021:02:06 10:19:04,Conversational,Negative,,,,,,,Unknown


In [70]:
len(df_JatoCl_Pass0_in_2)

36

In [71]:
df_JatoCl_Pass2_dedup = pd.concat([df_JatoClf_Blank,df_JatoCl_Pass0_in_2], axis=0)

In [72]:
len(df_JatoCl_Pass2_dedup)

2421

In [73]:
df_JatoCl_Pass2_dedup.drop_duplicates(subset='RecID', keep=False, inplace=True)

In [74]:
len(df_JatoCl_Pass2_dedup)

2349

In [75]:
# re-concatenate the datasets to add the new subset only
df_JatoCl_Pass2 = pd.concat([df_JatoCl_Pass2_dedup,df_JatoCl_Pass0_in_2], axis=0)

In [76]:
df_JatoCl_Pass2

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown
2,63,833502973204459520,2023:03:16 20:23:22,Lifestyle,Music,,,,,,,Unknown
3,64,835347243020451840,2021:01:18 15:17:18,Abusive Material,Hate Speech,,,,,,,Unknown
4,69,867832469181128704,2021:02:03 13:42:38,Lifestyle,Music,,,,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...
31,4599,1222177357261180933,2021:02:15 18:57:22,Conversational,,,,,,travel,,Unknown
32,4608,1222223980909813761,2021:02:15 20:23:29,Human Rights,Police Brutality,,,,,violence,,Unknown
33,4612,1222242112307187712,2021:02:15 20:23:29,Conversational,Positive,Lifestyle,Music,,,,,Social Stories
34,4621,1222281539100250114,2021:02:15 20:23:29,Conversational,Negative,,,,,,,Sports


In [70]:
df_JatoCl_Pass2.to_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass2_HO2/data/JatoClassified.csv', index=False)

### Get HO2 labels After running Jato

In [71]:
#DEL
#use the RecIDs to generate the JatoMaster.csv in SQL-Server (subset of MasterTokens)
#df_RecID = df['RecID']

In [72]:
df_jato_HO2_Labels = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass2_HO2/data/JatoClassified_Pass2_HO2.csv', dtype='str')

In [73]:
df_jato_HO2_Labels.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,6.0,222818213392678912,2021:01:17 21:15:20,Political,Debate,,,,,,,Unknown
1,60.0,826262311560216578,2021:01:13 19:53:48,Political,Debate,,,,,,,Unknown


In [74]:
len(df_jato_HO2_Labels)

2385

In [75]:
df_jato_HO2_Labels.drop(['rowid','SavedDataTime','JatoCat','JatoSubCat','JatoCat2','JatoSubCat2','JatoCat3','JatoSubCat3','NewRecClass','GTCodes'], axis=1, inplace=True)

In [76]:
df_jato_HO2_Labels.head(2)

Unnamed: 0,RecID,NewsPubCat
0,222818213392678912,Unknown
1,826262311560216578,Unknown


In [77]:
Labels_Targets = pd.read_csv('data/Labels_TargetsV2.csv')

In [78]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [79]:
Labels_Targets.rename(columns={'Label': 'NewsPubCat'}, inplace=True)

In [80]:
df_jato_HO2_Label_Target = pd.merge(df_jato_HO2_Labels,Labels_Targets, on='NewsPubCat', how='left')

In [81]:
df_jato_HO2_Label_Target

Unnamed: 0,RecID,NewsPubCat,Target
0,222818213392678912,Unknown,0
1,826262311560216578,Unknown,0
2,833502973204459520,Entertainment,2
3,835347243020451840,Unknown,0
4,867832469181128704,Unknown,0
...,...,...,...
2380,1222177357261180933,Social Stories,10
2381,1222223980909813761,Law and Order,7
2382,1222242112307187712,Social Stories,10
2383,1222281539100250114,Sports,11


In [82]:
df_jato_HO2_Label_Target.rename(columns={'NewsPubCat': 'HO2Label'}, inplace=True)

In [83]:
#df_jato_HO2_Label_Target['Target'] = df_jato_HO2_Label_Target['Target'].astype('Int64')
df_jato_HO2_Label_Target.rename(columns={'Target': 'HO2Target'}, inplace=True)

In [84]:
df_jato_HO2_Label_Target.head(2)

Unnamed: 0,RecID,HO2Label,HO2Target
0,222818213392678912,Unknown,0
1,826262311560216578,Unknown,0


In [85]:
len(df_HO1_To_HO2)

251

In [86]:
df_HO1_To_HO2.head(2)

Unnamed: 0,RecID,HO1Label,HO1Target,GClfTarget
0,833502973204459520,Entertainment,2,9
1,1175105751846510592,Environmental,3,1


In [87]:
df_HO2_HO1_GClf_Targets = pd.merge(df_jato_HO2_Label_Target,df_HO1_To_HO2, on='RecID')

In [88]:
df_HO2_HO1_GClf_Targets.head()

Unnamed: 0,RecID,HO2Label,HO2Target,HO1Label,HO1Target,GClfTarget
0,833502973204459520,Entertainment,2,Entertainment,2,9.0
1,1175105751846510592,Environmental,3,Environmental,3,1.0
2,1178650857430274049,Politics,6,Human Rights,5,2.0
3,1178745116758020097,Social Stories,10,Social Stories,10,
4,1178961065918500864,Entertainment,2,Unknown,0,2.0


In [89]:
len(df_HO2_HO1_GClf_Targets)

251

In [99]:
df_HO2_HO1_GClf_Targets.to_csv('data/GTxM_Pass2/HO2_HO1_GClf_Targets.csv')

### Get Krippendorff's Alpha between HO2 and HO1

In [90]:
HO1Target = np.stack(df_HO2_HO1_GClf_Targets['HO1Target'].astype("string"))
HO2Target = np.stack(df_HO2_HO1_GClf_Targets['HO2Target'].astype("string"))

In [91]:
missing = '0'
arr = np.array((HO1Target,HO2Target))
alpha4 = krippendorff_alpha(arr, nominal_metric, missing_items=missing)
alpha4

0.6038420409410328

### Get the new GTD from Pass 2 to Pass 3

In [92]:
# HO2 Agreement with HO1
df_HO2_HO1_GTD = df_HO2_HO1_GClf_Targets[
    (df_HO2_HO1_GClf_Targets.HO2Target.astype("string")==df_HO2_HO1_GClf_Targets.HO1Target.astype("string"))]
len(df_HO2_HO1_GTD)

86

In [93]:
df_HO2_HO1_GClf_GTD = df_HO2_HO1_GClf_Targets[
    (df_HO2_HO1_GClf_Targets.HO2Target.astype("string")==df_HO2_HO1_GClf_Targets.HO1Target.astype("string"))
    | (df_HO2_HO1_GClf_Targets.HO2Target.astype("string")==df_HO2_HO1_GClf_Targets.GClfTarget.astype("string"))
    ]
len(df_HO2_HO1_GClf_GTD)

127

In [94]:
df_HO2_HO1_GClf_GTD.head(2)

Unnamed: 0,RecID,HO2Label,HO2Target,HO1Label,HO1Target,GClfTarget
0,833502973204459520,Entertainment,2,Entertainment,2,9
1,1175105751846510592,Environmental,3,Environmental,3,1


In [95]:
df_HO2_HO1_GClf_GTD.drop(['HO1Label','HO1Target','GClfTarget'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [96]:
df_HO2_HO1_GClf_GTD.rename(columns={'HO2Label': 'Label', 'HO2Target': 'Target'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [97]:
df_HO2_HO1_GClf_GTD.head(2)

Unnamed: 0,RecID,Label,Target
0,833502973204459520,Entertainment,2
1,1175105751846510592,Environmental,3


In [98]:
df_HO1_GClf_GTD.head(2)

Unnamed: 0,RecID,Label,Target
4,1178756273124335624,Obituary,9
5,1178805608331526145,Politics,6


In [99]:
len(df_HO1_GClf_GTD)

156

In [100]:
df_Pass2_GTD = pd.concat([df_HO2_HO1_GClf_GTD,df_HO1_GClf_GTD], axis=0)

In [101]:
df_Pass2_GTD.drop_duplicates(subset='RecID', keep=False, inplace=True)

In [102]:
len(df_Pass2_GTD)

283

### Create GTD Tokens dataset for SVM

In [103]:
# core table required for BERT/XLNet Algorithm requirement
df_core1 = pd.read_csv("data/MasterTokens.csv", dtype='str')

In [104]:
df_core1.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,0,11947603240,"Ed Sheeran, Drake and Justin Bieber: What were...",I think I have part created a pretty amazing s...,0.0,,,,part song lie,creat,pretti,amaz,I think I have part created a pretty amazing s...,Summarization skipped (text is 1000 characters...
1,1,12643331537,"Ed Sheeran, Drake and Justin Bieber: What were...",can i have one more follower please... i would...,1.0,,jessglynne,,follow igcom club gpsi,love,,top,can i have one more follower please... i would...,Summarization skipped (text is 1000 characters...


In [105]:
df_Pass2_To_3_GTD_tokens = pd.merge(df_Pass2_GTD,df_core1, on='RecID', how='inner')

In [106]:
len(df_Pass2_To_3_GTD_tokens)

283

In [107]:
df_Pass2_To_3_GTD_tokens.head(2)

Unnamed: 0,RecID,Label,Target,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,833502973204459520,Entertainment,2,63,Ringo Starr: Abbey Road wasn't meant to be The...,Thanks for coming over man and playing Great ...,13.0,,ringostarrmusic,ringo paul fn fn fn fn fn fn fyifyifyifyifyify...,man bass peac love photo memori band music guy...,play love man love order live play love beat f...,togeth forev especi nearli,great great greatest happi love ador beauti wo...,Thanks for coming over man and playing Great ...,Summarization skipped (text is 1000 characters...
1,1175105751846510592,Environmental,3,437,Aarey protests: Supreme Court steps in to save...,We at #MMRC plant trees &amp; build @MumbaiMet...,37.0,infrastructure climatechange shame saveaarey m...,cmomaharashtra ashwinibhide authackeray fayeds...,anuanc anoan ancan anoanian anian bkc midc kal...,plant tree cut tree suit citi tree tree wel an...,build build mitig plant plant geotag maintain ...,newli kindli late ahead kindli properli fulli ...,present futur local inhuman small depend truth...,We at #MMRC plant trees &amp; build @MumbaiMet...,We at plant trees build also.\nWe do need 2 cu...


In [142]:
# Save Tokens for SVM to Pass3
df_Pass2_To_3_GTD_tokens.to_csv('data/GTxM_Pass2/GTD_Pass2_To_3_tokens.csv')

#### Create GTD decomposed SMT dataset for DL BERT/XLNet

In [158]:
# core tables required for BERT/XLNet Algorithm requirement
# the concat of GroundTruthBERT and CGTexpandedSMR_Data are the equivalent of MasterTokens
# but with decomposed SMR tweets
df_core2 = pd.read_csv("data/GroundTruthBERT.csv", dtype='str')
df_core2.head(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars,Target,Label
0,0,826262311560216578,#coup has started. First of many steps. #rebel...,#coup has started. first of many steps. #rebel...,coup has started first of many steps rebellion...,coup rebellion impeachment lawyers,,,Assessing the Impeachment Defenses Offered by ...,76.0,,,10,Politics
1,1,1193437298303438858,@MarkSZaidEsq @jody_prichard Funny you want to...,@markszaidesq @jody_prichard funny you want to...,markszaidesq jody_prichard funny you want to d...,,@jody_prichard @markszaidesq,8.262623115602164e+17,,,51.0,313.0,10,Politics


In [159]:
df_core3 = pd.read_csv("data/CGTexpandedSMR_Data.csv", dtype='str')
df_core3.head(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
0,0,222818213392678912,Seedy lists of party apparatchiks appointed by...,seedy lists of party apparatchiks appointed by...,seedy lists of party apparatchiks appointed by...,,,,No 10 dismisses Goldsmith 'cronyism' claims,260.0,,
1,1,1207942378688040961,@ZacGoldsmith @mrjamesob Do you think losing y...,@zacgoldsmith @mrjamesob do you think losing y...,zacgoldsmith mrjamesob do you think losing you...,,@mrjamesob @zacgoldsmith,2.2281821339267888e+17,,,47.0,307.0


In [160]:
# drop Label and Target columns on core2 -- to match the fields in core3
df_core2.drop(['Label','Target'], axis=1, inplace=True)

In [161]:
df_core_decomposed = pd.concat([df_core2,df_core3], axis=0)

In [162]:
len(df_core_decomposed)

322199

In [163]:
# Get the Rec subset, InReplyTo is null for all the records

In [164]:
df_core_decomposed_rec = df_core_decomposed[(df_core_decomposed.InReplyTo.isna())]

In [165]:
len(df_core_decomposed_rec)

2385

In [166]:
df_Pass2_GTD_TID = df_Pass2_GTD.rename(columns={'RecID': 'TID'})

In [167]:
df_Pass2_GTD_decomposedRec = pd.merge(df_core_decomposed_rec, df_Pass2_GTD_TID, on='TID', how='inner')

In [168]:
len(df_Pass2_GTD_decomposedRec)

283

In [169]:
df_Pass2_GTD_decomposedRec.head(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars,Label,Target
0,261,833502973204459520,Thanks for coming over man and playing Great ...,thanks for coming over man and playing great b...,thanks for coming over man and playing great b...,,,,Ringo Starr: Abbey Road wasn't meant to be The...,13.0,,,Entertainment,2
1,749,1175105751846510592,We at #MMRC plant trees &amp; build @MumbaiMet...,we at #mmrc plant trees build @mumbaimetro3 al...,we at mmrc plant trees build mumbaimetro also ...,MMRC infrastructure MMRC,@mumbaimetro3 @cmomaharashtra,,Aarey protests: Supreme Court steps in to save...,37.0,,,Environmental,3


In [170]:
# Get the supporting tweets based on InReplyTo

In [171]:
df_core_decomposed_sup = df_core_decomposed[(df_core_decomposed.InReplyTo.notna())]

In [172]:
len(df_core_decomposed_sup)

319814

In [174]:
df_core_decomposed_sup.head(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
1,1,1193437298303438858,@MarkSZaidEsq @jody_prichard Funny you want to...,@markszaidesq @jody_prichard funny you want to...,markszaidesq jody_prichard funny you want to d...,,@jody_prichard @markszaidesq,826262311560216578,,,51,313
2,2,1194280882540036098,"@MarkSZaidEsq at THAT time, the only ""stepping...","@markszaidesq at that time, the only ""stepping...",markszaidesq at that time the only stepping ov...,,@markszaidesq,826262311560216578,,,57,291


In [175]:
df_Pass2_GTD_InReplyTo = df_Pass2_GTD.rename(columns={'RecID': 'InReplyTo'})

In [176]:
df_Pass2_GTD_decomposedSup = pd.merge(df_core_decomposed_sup, df_Pass2_GTD_InReplyTo, on='InReplyTo', how='inner')

In [177]:
len(df_Pass2_GTD_decomposedSup)

51068

In [178]:
df_Pass2_GTD_decomposedSup.tail(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars,Label,Target
51066,197220,1224290723584978944,@BorisJohnson Together Boris and Donald can di...,@borisjohnson together boris and donald can di...,borisjohnson together boris and donald can dir...,,@borisjohnson,1223365339494453248,,,41,251,Politics,6
51067,197221,1223366743906824194,@BorisJohnson I donΓÇÖt wanna be brought toget...,@borisjohnson i dongcot wanna be brought toget...,borisjohnson i dongcot wanna be brought togeth...,,@borisjohnson,1223365339494453248,,,44,250,Politics,6


In [179]:
# Prepare the final file for df_Pass2_GTD_decomposedRec + df_Pass2_GTD_decomposedSup
df_Pass2_GTD_decomposed_final = pd.concat([df_Pass2_GTD_decomposedRec, df_Pass2_GTD_decomposedSup], axis = 0)

In [180]:
df_Pass2_GTD_decomposed_final.head(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars,Label,Target
0,261,833502973204459520,Thanks for coming over man and playing Great ...,thanks for coming over man and playing great b...,thanks for coming over man and playing great b...,,,,Ringo Starr: Abbey Road wasn't meant to be The...,13.0,,,Entertainment,2
1,749,1175105751846510592,We at #MMRC plant trees &amp; build @MumbaiMet...,we at #mmrc plant trees build @mumbaimetro3 al...,we at mmrc plant trees build mumbaimetro also ...,MMRC infrastructure MMRC,@mumbaimetro3 @cmomaharashtra,,Aarey protests: Supreme Court steps in to save...,37.0,,,Environmental,3


In [182]:
df_Pass2_GTD_decomposed_final.tail(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars,Label,Target
51066,197220,1224290723584978944,@BorisJohnson Together Boris and Donald can di...,@borisjohnson together boris and donald can di...,borisjohnson together boris and donald can dir...,,@borisjohnson,1223365339494453248,,,41,251,Politics,6
51067,197221,1223366743906824194,@BorisJohnson I donΓÇÖt wanna be brought toget...,@borisjohnson i dongcot wanna be brought toget...,borisjohnson i dongcot wanna be brought togeth...,,@borisjohnson,1223365339494453248,,,44,250,Politics,6


In [183]:
len(df_Pass2_GTD_decomposed_final)

51351

In [184]:
# Save new decomposed SMR for BERT/XLNet to Pass 3
df_Pass2_GTD_decomposed_final.to_csv('data/GTxM_Pass2/GTD_Pass2_To_3_decomposedSMR.csv')

### Generate GTD Rejected

In [109]:
df_Rec = df['RecID']
len(df_Rec)

407

In [120]:
# New GTD
df_NewGTD_Rec = df_Pass2_To_3_GTD_tokens[['RecID','Label']]
len(df_NewGTD_Rec)

283

In [121]:
df_Rejected_Rec = pd.merge(df_Rec, df_NewGTD_Rec, on='RecID', how='left')

In [122]:
len(df_Rejected_Rec)

407

In [123]:
df_Rejected_Rec = df_Rejected_Rec[df_Rejected_Rec.Label.isna()]

In [124]:
len(df_Rejected_Rec)

124

In [126]:
df_Rejected_Rec['RecID'].to_csv('data/GTxM_Pass2/GTxM_Pass2_Reject.csv', index=False)

### Generate JatoMaster for Rejected (rerun)

In [3]:
df_Rejected_Rec = pd.read_csv('data/GTxM_Pass2/GTxM_Pass2_Reject.csv', dtype='str')
df_master_tokens = pd.read_csv('data/MasterTokens.csv', dtype='str')

In [4]:
df_jato_Pass2Rej_tokens = pd.merge(df_master_tokens,df_Rejected_Rec, on='RecID')
len(df_jato_Pass2Rej_tokens)

124

In [5]:
df_jato_Pass2Rej_tokens.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,479,1178650857430274049,"Naga Munchetty, BBC News Anchor, Has Reprimand...",BBC now has to explain; 1. why it reprimanded ...,210.0,racist bbcbias catalan istandwithnaga benn yes...,conservatives iaindocherty bbcdavidjordan miri...,naga munchetti dan walker toni hall execut com...,naga munchetti racism host dan walker air reas...,explain reprimand condemn reprimand draw lie q...,twice perhap twice intens abroad bare real hon...,big senior high evid difficult subtl sophist p...,BBC now has to explain; 1. why it reprimanded ...,Also needs to be asked is the fact that only 1...
1,519,1179074719049494528,Woman in swimsuit photo-shamed by potential em...,i was objectified earlier today by a company b...,260.0,realfeminism theyneverdeservedyou daringgreatl...,u_movingup realyeyoza kim_amsterdam cbsaustinc...,earlier ig ashley smith dalla tx us bc ms clow...,compani pictur bikini photo insta stori compan...,claim screenshot post baffl handl itgco live c...,earlier actual fulli obvious absolut clearli s...,objectifi unprofession strang fearless intern ...,i was objectified earlier today by a company b...,i was objectified earlier today by a company b...


In [6]:
df_jato_Pass2Rej_tokens.drop(['smrAdverbs','smrAdjectives'], axis=1, inplace=True)

In [7]:
df_jato_Pass2Rej_tokens['PubTitle'] = df_jato_Pass2Rej_tokens['PubTitle'].apply(removeSpCharLine)
df_jato_Pass2Rej_tokens['RecDoc'] = df_jato_Pass2Rej_tokens['RecDoc'].apply(removeSpCharLine)
df_jato_Pass2Rej_tokens['smrTopText'] = df_jato_Pass2Rej_tokens['smrTopText'].apply(removeSpCharLine)
df_jato_Pass2Rej_tokens['smrSummary'] = df_jato_Pass2Rej_tokens['smrSummary'].apply(removeSpCharLine)

In [8]:
# save the file to D:\KOPro\PhD\Implementation\SourceCode\JatoPass2\data -- 
df_jato_Pass2Rej_tokens.to_csv('D:/KOPro/PhD/Implementation/SourceCode/jatoPass2_rerun_HO1/data/JatoMaster.csv', index=False)