### Compute Krippendorff's Alpha for the reGTr Intercoder HO1, HO2 and HO2 labels

In [1]:
# credit: https://github.com/grrrr/krippendorff-alpha (Thomas Grill)
# downloaded: https://github.com/grrrr/krippendorff-alpha/blob/master/krippendorff_alpha.py

In [2]:
# krippendorff_alpha.py must be locally stored in the same folder as this notebook
from krippendorff_alpha import *

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("data/GTxM_Pass1/reGTr_InterCoder_HO1_HO2_HO3.csv", dtype='str')

In [5]:
len(df)

873

In [6]:
df.head()

Unnamed: 0,RecID,PubTitle,RecDoc,TargetHO1,TargetHO2,TargetHO3,LabelHO1,LabelHO2,LabelHO3
0,1089699762331217920,Peloton exercise bike ad mocked as being 'sexi...,Love putting my Peloton bike in the most strik...,1,1,-1,Business,Business,-1
1,1122651175688515584,Biden Campaign Drops Opposition to Super PAC S...,I've said it before and I'll say it again. To ...,10,6,6,Social Stories,Politics,Politics
2,1139309394968096768,Six Takeaways From Senators' Questions to Impe...,I would not have thought that I needed to say ...,6,6,-1,Politics,Politics,-1
3,1159148971106942981,N.F.L. Invites Teams to Watch Colin Kaepernick...,5am. 5 days a week. For 3 years. Still Ready. ...,11,11,-1,Sports,Sports,-1
4,1166443046361153537,"Popeyes Chicken Sandwich Returns, but Will the...",Y?all. We love that you love The Sandwich. Unf...,1,1,-1,Business,Business,-1


### Get the subset with HO1 and HO2 agreement

In [7]:
# issue: there are 29 records where HO1 & HO2 agree but these were still sent to HO3 (shouldn't have been) 
# resolution: blank out HO1 labels for the 29 records and use the HO2 and HO3 labels

In [8]:
df_BlankHO1 = df[(df.LabelHO1 == df.LabelHO2) & (df.LabelHO3 != '-1')]

In [9]:
len(df_BlankHO1)

29

In [10]:
df_BlankHO1.head(2)

Unnamed: 0,RecID,PubTitle,RecDoc,TargetHO1,TargetHO2,TargetHO3,LabelHO1,LabelHO2,LabelHO3
68,1182005931317583872,Mark Ruffalo breaks Hollywood ranks over Ellen...,"Sorry, until George W. Bush is brought to just...",6,6,6,Politics,Politics,Politics
98,1184950984591712257,15 Times Trump and His Allies Claimed 'No Quid...,Mulvaney statement cleaning up his briefing co...,6,6,6,Politics,Politics,Politics


In [11]:
df.loc[(df.LabelHO1 == df.LabelHO2) & (df.LabelHO3 != '-1'), 'LabelHO1'] = '-1'
df.loc[(df.LabelHO1 == '-1'), 'TargetHO1'] = '-1'

In [13]:
df[(df.LabelHO1 == '-1')].head()

Unnamed: 0,RecID,PubTitle,RecDoc,TargetHO1,TargetHO2,TargetHO3,LabelHO1,LabelHO2,LabelHO3
68,1182005931317583872,Mark Ruffalo breaks Hollywood ranks over Ellen...,"Sorry, until George W. Bush is brought to just...",-1,6,6,-1,Politics,Politics
98,1184950984591712257,15 Times Trump and His Allies Claimed 'No Quid...,Mulvaney statement cleaning up his briefing co...,-1,6,6,-1,Politics,Politics
142,1188458334086684673,"With al-Baghdadi Raid, Trump Makes the Made-fo...",President @realDonaldTrump watches as U.S. Spe...,-1,6,6,-1,Politics,Politics
182,1190059166129426432,"Trump, Lifelong New Yorker, Declares Himself a...",Good riddance.\n\nIt?s not like @realDonaldTru...,-1,6,6,-1,Politics,Politics
212,1191983177725861892,Trump's Twitter War Room Aims Its Punches at D...,"I also served with Vindman, and interacted wit...",-1,6,6,-1,Politics,Politics


In [14]:
df_AgreeHO1HO2 = df[(df.LabelHO1 == df.LabelHO2)]

In [15]:
len(df_AgreeHO1HO2)

700

In [16]:
df_AgreeHO1HO2.drop(['PubTitle', 'RecDoc', 'TargetHO2', 'LabelHO2', 'TargetHO3', 'LabelHO3'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [17]:
df_AgreeHO1HO2.rename(columns={'TargetHO1': 'Target', 'LabelHO1': 'Label'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [18]:
df_AgreeHO1HO2.head(2)

Unnamed: 0,RecID,Target,Label
0,1089699762331217920,1,Business
2,1139309394968096768,6,Politics


### Find krippendorff's alpha for the agreement between HO1 and HO2

In [19]:
HO1 = np.stack(df['TargetHO1'].astype("string"))
HO2 = np.stack(df['TargetHO2'].astype("string"))
# HO3 = np.stack(df['TargetHO3'].astype("string"))

In [20]:
# arr = np.array((HO1,HO2,HO3))
arr = np.array((HO1,HO2))

In [21]:
missing = '-1'

In [22]:
alpha1 = krippendorff_alpha(arr, nominal_metric, missing_items=missing)

In [23]:
alpha1

0.7627418234389511

### Get the subset with HO1 or HO2 agreement with HO3

In [24]:
df2 = df[df.TargetHO3 != '-1']

In [25]:
len(df2)

173

In [26]:
df2.head(2)

Unnamed: 0,RecID,PubTitle,RecDoc,TargetHO1,TargetHO2,TargetHO3,LabelHO1,LabelHO2,LabelHO3
1,1122651175688515584,Biden Campaign Drops Opposition to Super PAC S...,I've said it before and I'll say it again. To ...,10,6,6,Social Stories,Politics,Politics
15,1179115344180330496,Lupita Nyong'o: Colourism is the daughter of r...,This is 5-year-old me. I reflected on this lit...,5,10,10,Human Rights,Social Stories,Social Stories


In [27]:
def AgreeTargetWithHO3(HO1Target, HO2Target, HO3Target):
    AgreedTarget = -1
    if (HO1Target == HO3Target):
        AgreedTarget = HO3Target
    if (HO2Target == HO3Target):
        AgreedTarget = HO3Target
    return AgreedTarget

def AgreeLabelWithHO3(HO1Label, HO2Label, HO3Label):
    AgreedLabel = -1
    if (HO1Label == HO3Label):
        AgreedLabel = HO3Label
    if (HO2Label == HO3Label):
        AgreedLabel = HO3Label
    return AgreedLabel

In [29]:
df2['AgreedTarget'] = df2.apply(lambda x: AgreeTargetWithHO3(x['TargetHO1'], x['TargetHO2'], x['TargetHO3']), axis=1)
df2['AgreedLabel'] = df2.apply(lambda x: AgreeLabelWithHO3(x['LabelHO1'], x['LabelHO2'], x['LabelHO3']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['AgreedTarget'] = df2.apply(lambda x: AgreeTargetWithHO3(x['TargetHO1'], x['TargetHO2'], x['TargetHO3']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['AgreedLabel'] = df2.apply(lambda x: AgreeLabelWithHO3(x['LabelHO1'], x['LabelHO2'], x['LabelHO3']), axis=1)


In [30]:
df2.head(2)

Unnamed: 0,RecID,PubTitle,RecDoc,TargetHO1,TargetHO2,TargetHO3,LabelHO1,LabelHO2,LabelHO3,AgreedTarget,AgreedLabel
1,1122651175688515584,Biden Campaign Drops Opposition to Super PAC S...,I've said it before and I'll say it again. To ...,10,6,6,Social Stories,Politics,Politics,6,Politics
15,1179115344180330496,Lupita Nyong'o: Colourism is the daughter of r...,This is 5-year-old me. I reflected on this lit...,5,10,10,Human Rights,Social Stories,Social Stories,10,Social Stories


In [31]:
df_AgreeHO1HO2HO3 = df2[(df2.AgreedTarget != -1)]

In [32]:
len(df_AgreeHO1HO2HO3)

154

In [33]:
df_AgreeHO1HO2HO3.drop(['PubTitle', 'RecDoc', 'TargetHO1', 'LabelHO1', 'TargetHO2', 'LabelHO2', 'TargetHO3', 'LabelHO3'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [34]:
df_AgreeHO1HO2HO3.head(2)

Unnamed: 0,RecID,AgreedTarget,AgreedLabel
1,1122651175688515584,6,Politics
15,1179115344180330496,10,Social Stories


In [35]:
df_AgreeHO1HO2HO3.rename(columns={'AgreedTarget': 'Target', 'AgreedLabel': 'Label'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


### Find krippendorff's alpha for HO3 and the agreement between HO1 and HO2

In [36]:
HO3_2 = np.stack(df2['TargetHO3'].astype("string"))
HOA_2 = np.stack(df2['AgreedTarget'].astype("string")) #Agreed Target

In [37]:
arr2 = np.array((HO3_2, HOA_2))
missing = '-1'
alpha2 = krippendorff_alpha(arr2, nominal_metric, missing_items=missing)
alpha2

0.8496124031007752

In [38]:
alpha = (alpha1 + alpha2) / 2

In [39]:
alpha

0.8061771132698632

### Get the subset with no agreement between HO1, HO2 and HO3

In [40]:
df3 = df2[df2.AgreedLabel == -1] 

In [41]:
len(df3)

19

In [42]:
df3.head(2)

Unnamed: 0,RecID,PubTitle,RecDoc,TargetHO1,TargetHO2,TargetHO3,LabelHO1,LabelHO2,LabelHO3,AgreedTarget,AgreedLabel
39,1180954142396710912,Hearthstone gamer banned for Hong Kong protest,[BREAKING] Hong Kong Hearthstone player @blitz...,5,1,6,Human Rights,Business,Politics,-1,-1
43,1181212982128328705,Passport facial recognition checks fail to wor...,This is our @algoface 209 point Facial Landmar...,5,10,1,Human Rights,Social Stories,Business,-1,-1


In [43]:
df_NoAgreeHO1HO2HO3 = pd.DataFrame()

In [44]:
df_NoAgreeHO1HO2HO3['RecID'] = df3['RecID']

In [45]:
df_NoAgreeHO1HO2HO3['Label'] = "Unknown"
df_NoAgreeHO1HO2HO3['Target'] = 0

In [46]:
df_NoAgreeHO1HO2HO3.head(2)

Unnamed: 0,RecID,Label,Target
39,1180954142396710912,Unknown,0
43,1181212982128328705,Unknown,0


In [47]:
len(df_NoAgreeHO1HO2HO3)

19

In [48]:
df_reGoundTruth = pd.concat([df_AgreeHO1HO2, df_AgreeHO1HO2HO3], axis = 0)

In [49]:
len(df_reGoundTruth)

854

In [50]:
df_reGoundTruth.head(2)

Unnamed: 0,RecID,Target,Label
0,1089699762331217920,1,Business
2,1139309394968096768,6,Politics


### Write Output files

In [51]:
#df_reGoundTruth.to_csv('data/GTxM_Pass1/reGTr_RecIDs.csv')

In [52]:
#df_NoAgreeHO1HO2HO3.to_csv('data/GTxM_Pass1/reGTr_noAgree_RecIDs.csv')

### Create reGTrTokens dataset for SVM

In [166]:
# core table required for BERT/XLNet Algorithm requirement
df_tokens = pd.read_csv("data/MasterTokens.csv", dtype='str')

In [167]:
df_tokens.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,0,11947603240,"Ed Sheeran, Drake and Justin Bieber: What were...",I think I have part created a pretty amazing s...,0.0,,,,part song lie,creat,pretti,amaz,I think I have part created a pretty amazing s...,Summarization skipped (text is 1000 characters...
1,1,12643331537,"Ed Sheeran, Drake and Justin Bieber: What were...",can i have one more follower please... i would...,1.0,,jessglynne,,follow igcom club gpsi,love,,top,can i have one more follower please... i would...,Summarization skipped (text is 1000 characters...


In [170]:
df_reGoundTruthTokens = pd.DataFrame()

In [172]:
df_reGoundTruthTokens = pd.merge(df_reGoundTruth,df_tokens, on='RecID', how='inner')

In [173]:
df_reGoundTruthTokens.head(2)

Unnamed: 0,RecID,Target,Label,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,1089699762331217920,1,Business,192,Peloton exercise bike ad mocked as being 'sexi...,Love putting my Peloton bike in the most strik...,118.0,hilarious pton homesteadexemption realproblems...,profgalloway dpshow dirrtydut danglebus nicksc...,peloton serious later coupl monthli minut king...,love peloton bike area hous wife glanc hubbi e...,keep trap doe pedal whogco buy broke buy play ...,matter away serious later fast btw though outd...,strike ultra modern nervou dark right perfect ...,Love putting my Peloton bike in the most strik...,Love putting my Peloton bike in the most strik...
1,1139309394968096768,6,Politics,291,Six Takeaways From Senators' Questions to Impe...,I would not have thought that I needed to say ...,25.0,clintonfoundation corruptcomplicitgop clintons...,ellenlweintraub k9dancerpovey nypapajoe killer...,agenda us elect statu quo unsustain american b...,intern corpor polit agenda us elect polit dona...,control influenc destroy wipe cheat breath myb...,care right behind forward total appar,vast unsustain unaccept question possibl fair ...,I would not have thought that I needed to say ...,I would not have thought that I needed to say ...


In [174]:
len(df_reGoundTruthTokens)

854

In [176]:
df_reGoundTruthTokens.to_csv('data/GTxM_Pass1/reGTr_Tokens.csv')

In [181]:
df_reGTrTokensNoAgree = pd.merge(df_NoAgreeHO1HO2HO3,df_tokens, on='RecID', how='inner')

In [182]:
df_reGTrTokensNoAgree.head(2)

Unnamed: 0,RecID,Label,Target,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,1180954142396710912,Unknown,0,651,Hearthstone gamer banned for Hong Kong protest,[BREAKING] Hong Kong Hearthstone player @blitz...,260.0,hongkong thaou+>> antimaskban glorytohongkong ...,gen_heleno sf_moro galileocheng blitzcjunghs t...,hong kong hearthston mandarin bc china free ho...,hong kong hearthston player liber countri inte...,burn destroy hurt bc revolut watch allow ruin ...,afterward instead truli longer suddenli overni...,post game innoc pathet corpor terribl polit in...,[BREAKING] Hong Kong Hearthstone player @blitz...,[BREAKING] Hong Kong Hearthstone player calls ...
1,1181212982128328705,Unknown,0,696,Passport facial recognition checks fail to wor...,This is our @algoface 209 point Facial Landmar...,57.0,facemask radssoon developer genderbias compute...,drjamiegraves hal_9000_ai sisofmoses bourdakos...,facial landmark tracker ff london bc western c...,point facial landmark tracker face problem fac...,work face mean detect resurfac sell tri includ...,actual notori deliber properli super pretti aw...,virtual minimum particular bandag bias brillia...,This is our @algoface 209 point Facial Landmar...,"This is our 209 point Facial Landmark Tracker,..."


In [183]:
len(df_reGTrTokensNoAgree)

19

In [232]:
df_reGTrTokensNoAgree.to_csv('data/GTxM_Pass1/reGTr_Tokens_NoAgree.csv')

### Create reGTrDecomposed dataset for BERT/XLNet

In [210]:
# core table required for BERT/XLNet Algorithm requirement
df_decomposed = pd.read_csv("data/GrounTruthBERT.csv", dtype='str')

In [211]:
df_decomposed.head(2)

Unnamed: 0.1,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars,Target,Label
0,0,826262311560216578,#coup has started. First of many steps. #rebel...,#coup has started. first of many steps. #rebel...,coup has started first of many steps rebellion...,coup rebellion impeachment lawyers,,,Assessing the Impeachment Defenses Offered by ...,76.0,,,10,Politics
1,1,1193437298303438858,@MarkSZaidEsq @jody_prichard Funny you want to...,@markszaidesq @jody_prichard funny you want to...,markszaidesq jody_prichard funny you want to d...,,@jody_prichard @markszaidesq,8.262623115602164e+17,,,51.0,313.0,10,Politics


In [212]:
len(df_decomposed)

124977

In [213]:
# Remove the existing Target and Label columns -- we will replace them with the new reGTr values
df_decomposed.drop(['Target', 'Label'], axis=1, inplace=True)

In [214]:
# subset with TIDs as RecIDs
df_decomposed_TID = df_decomposed[df_decomposed.InReplyTo.isnull()]

In [215]:
len(df_decomposed_TIDs)

873

In [216]:
df_reGoundTruth.rename(columns={'RecID': 'TID'}, inplace=True)

In [217]:
df_reGTrDecomposed1 = pd.merge(df_reGoundTruth,df_decomposed_TID, on='TID', how='inner')

In [218]:
df_reGTrDecomposed1.head(2)

Unnamed: 0.1,TID,Target,Label,Unnamed: 0,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
0,1089699762331217920,1,Business,142,Love putting my Peloton bike in the most strik...,love putting my peloton bike in the most strik...,love putting my peloton bike in the most strik...,,,,Peloton exercise bike ad mocked as being 'sexi...,118,,
1,1139309394968096768,6,Politics,363,I would not have thought that I needed to say ...,i would not have thought that i needed to say ...,i would not have thought that i needed to say ...,,,,Six Takeaways From Senators' Questions to Impe...,25,,


In [219]:
len(df_reGTrDecomposed1)

854

In [220]:
# Reconnect to df_decomposed1 on InReplyTo to get the SubTweets
df_reGoundTruth.rename(columns={'TID': 'InReplyTo'}, inplace=True)

In [221]:
df_reGTrDecomposed2 = pd.merge(df_reGoundTruth,df_decomposed, on='InReplyTo', how='inner')

In [222]:
df_reGTrDecomposed2.head(2)

Unnamed: 0.1,InReplyTo,Target,Label,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
0,1089699762331217920,1,Business,143,1202241703870160898,@ClueHeywood The wife always looks nervous and...,@clueheywood the wife always looks nervous and...,clueheywood the wife always looks nervous and ...,,@clueheywood,,,54,293
1,1089699762331217920,1,Business,144,1201142676990631936,"@ClueHeywood Seriously, whoΓÇÖs the mindless i...","@clueheywood seriously, whogcos the mindless i...",clueheywood seriously whogcos the mindless idi...,,@clueheywood,,,47,269


In [223]:
len(df_reGTrDecomposed2)

121740

#### Update the Label and Target for the SupTweets based on the RecTweets

In [259]:
# Loop through RecTweets, find all SubTweets, update the Labels and Targets
for i in range(len(df_reGTrDecomposed1)):
    iTID = df_reGTrDecomposed1.loc[i,'TID']
    iLabel = df_reGTrDecomposed1.loc[i,'Label']
    iTarget = df_reGTrDecomposed1.loc[i,'Target']
    df_reGTrDecomposed2.loc[(df_reGTrDecomposed2.InReplyTo == iTID), 'Label'] = iLabel
    df_reGTrDecomposed2.loc[(df_reGTrDecomposed2.InReplyTo == iTID), 'Target'] = iTarget

In [260]:
# Prepare the final file for df_reGTrDecomposed1 + df_reGTrDecomposed2
df_reGTrDecomposed = pd.DataFrame()

In [261]:
df_reGTrDecomposed = pd.concat([df_reGTrDecomposed1, df_reGTrDecomposed2], axis = 0)

In [262]:
df_reGTrDecomposed.head()

Unnamed: 0.1,TID,Target,Label,Unnamed: 0,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
0,1089699762331217920,1,Business,142,Love putting my Peloton bike in the most strik...,love putting my peloton bike in the most strik...,love putting my peloton bike in the most strik...,,,,Peloton exercise bike ad mocked as being 'sexi...,118,,
1,1139309394968096768,6,Politics,363,I would not have thought that I needed to say ...,i would not have thought that i needed to say ...,i would not have thought that i needed to say ...,,,,Six Takeaways From Senators' Questions to Impe...,25,,
2,1159148971106942981,11,Sports,389,5am. 5 days a week. For 3 years. Still Ready. ...,5am. 5 days a week. for 3 years. still ready.,am days a week for years still ready,,,,N.F.L. Invites Teams to Watch Colin Kaepernick...,38,,
3,1166443046361153537,1,Business,428,Y?all. We love that you love The Sandwich. Unf...,y?all. we love that you love the sandwich. unf...,y all we love that you love the sandwich unfor...,,,,"Popeyes Chicken Sandwich Returns, but Will the...",14,,
4,1175764155359465478,11,Sports,443,Will not be playing in the @NFL anymore these ...,will not be playing in the @nfl anymore these ...,will not be playing in the nfl anymore these o...,,@nflpa @nfl,,N.F.L. Players Association Files Grievances on...,19,,


In [264]:
df_reGTrDecomposed.tail()

Unnamed: 0.1,TID,Target,Label,Unnamed: 0,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
121735,1223621336205090816,1,Business,123667,@elonmusk ELON I GOTTA KNOW ABOUT THE N64 CONT...,@elonmusk elon i gotta know about the n64 cont...,elonmusk elon i gotta know about the n control...,,@elonmusk,1223133867772502019,,,13,76
121736,1223134212762542080,1,Business,123668,@elonmusk listening to music on a sick studio ...,@elonmusk listening to music on a sick studio ...,elonmusk listening to music on a sick studio o...,,@elonmusk,1223133867772502019,,,13,75
121737,1223134530082467841,1,Business,123669,@elonmusk Kakoii Elon Chan ≡ƒñÖ≡ƒÅ╗≡ƒñÖ≡ƒÅ╗≡ƒñ...,@elonmusk kakoii elon chan fnofa+fnofa+fnofa+f...,elonmusk kakoii elon chan fnofa fnofa fnofa fo...,,@elonmusk,1223133867772502019,,,5,75
121738,1223135031549235200,1,Business,123670,@elonmusk If this doesnΓÇÖt secure your Nobel ...,@elonmusk if this doesngcot secure your nobel ...,elonmusk if this doesngcot secure your nobel t...,,@elonmusk,1223133867772502019,,,13,75
121739,1223137805544587269,1,Business,123671,"@elonmusk wow Elon, great moves! proud of you,...","@elonmusk wow elon, great moves! proud of you,...",elonmusk wow elon great moves proud of you kee...,,@h3h3productions @elonmusk,1223133867772502019,,,12,75


In [265]:
len(df_reGTrDecomposed)

122594

In [276]:
df_reGTrDecomposed.to_csv('data/GTxM_Pass1/reGTr_SMR_Decomposed.csv')

#### Create the decomposed SMR datasets for No Agreement

In [233]:
df_NoAgreeHO1HO2HO3.head(2)

Unnamed: 0,RecID,Label,Target
39,1180954142396710912,Unknown,0
43,1181212982128328705,Unknown,0


In [234]:
# Reconnect to df_decomposed1 on InReplyTo to get the SubTweets
df_NoAgreeHO1HO2HO3.rename(columns={'RecID': 'TID'}, inplace=True)

In [235]:
df_reGTrDecomposed1_NoAgree = pd.merge(df_NoAgreeHO1HO2HO3,df_decomposed_TID, on='TID', how='inner')

In [236]:
df_reGTrDecomposed1_NoAgree.head(2)

Unnamed: 0.1,TID,Label,Target,Unnamed: 0,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
0,1180954142396710912,Unknown,0,3661,[BREAKING] Hong Kong Hearthstone player @blitz...,[breaking] hong kong hearthstone player @blitz...,breaking hong kong hearthstone player blitzchu...,Hearthstone,@blitzchunghs @matthieist,,Hearthstone gamer banned for Hong Kong protest,260,,
1,1181212982128328705,Unknown,0,4462,This is our @algoface 209 point Facial Landmar...,this is our @algoface 209 point facial landmar...,this is our algoface point facial landmark tra...,AR RacialBias GenderBias Inclusive AR AI Compu...,@algoface,,Passport facial recognition checks fail to wor...,57,,


In [237]:
len(df_reGTrDecomposed1_NoAgree)

19

In [238]:
# Reconnect to df_decomposed on InReplyTo to get the SubTweets
df_NoAgreeHO1HO2HO3.rename(columns={'TID': 'InReplyTo'}, inplace=True)

In [242]:
len(df_NoAgreeHO1HO2HO3)

19

In [243]:
df_reGTrDecomposed2_NoAgree = pd.merge(df_NoAgreeHO1HO2HO3,df_decomposed, on='InReplyTo', how='inner')

In [244]:
df_reGTrDecomposed2_NoAgree.head(2)

Unnamed: 0.1,InReplyTo,Label,Target,Unnamed: 0,TID,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
0,1180954142396710912,Unknown,0,3662,1181727893510119425,@InvenGlobal @blitzchungHS @Matthieist µ»Åσñ⌐σ...,@invenglobal @blitzchunghs @matthieist u>>asns...,invenglobal blitzchunghs matthieist you asnsPS...,,@blitzchunghs @matthieist @invenglobal,,,6,339
1,1180954142396710912,Unknown,0,3663,1181073462959300608,@InvenGlobal @blitzchungHS @Matthieist What ar...,@invenglobal @blitzchunghs @matthieist what ar...,invenglobal blitzchunghs matthieist what are h...,,@blitzchunghs @matthieist @invenglobal,,,47,324


In [245]:
len(df_reGTrDecomposed2_NoAgree)

2364

In [267]:
# Loop through RecTweets, find all SubTweets, update the Labels and Targets
for i in range(len(df_reGTrDecomposed1_NoAgree)):
    iTID = df_reGTrDecomposed1_NoAgree.loc[i,'TID']
    iLabel = df_reGTrDecomposed1_NoAgree.loc[i,'Label']
    iTarget = df_reGTrDecomposed1_NoAgree.loc[i,'Target']
    df_reGTrDecomposed2_NoAgree.loc[(df_reGTrDecomposed2_NoAgree.InReplyTo == iTID), 'Label'] = iLabel
    df_reGTrDecomposed2_NoAgree.loc[(df_reGTrDecomposed2_NoAgree.InReplyTo == iTID), 'Target'] = iTarget

In [268]:
# Prepare the final file for df_reGTrDecomposed1 + df_reGTrDecomposed2
df_reGTrDecomposed_NoAgree = pd.DataFrame()

In [269]:
df_reGTrDecomposed_NoAgree = pd.concat([df_reGTrDecomposed1_NoAgree, df_reGTrDecomposed2_NoAgree], axis = 0)

In [270]:
df_reGTrDecomposed_NoAgree.head(2)

Unnamed: 0.1,TID,Label,Target,Unnamed: 0,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
0,1180954142396710912,Unknown,0,3661,[BREAKING] Hong Kong Hearthstone player @blitz...,[breaking] hong kong hearthstone player @blitz...,breaking hong kong hearthstone player blitzchu...,Hearthstone,@blitzchunghs @matthieist,,Hearthstone gamer banned for Hong Kong protest,260,,
1,1181212982128328705,Unknown,0,4462,This is our @algoface 209 point Facial Landmar...,this is our @algoface 209 point facial landmar...,this is our algoface point facial landmark tra...,AR RacialBias GenderBias Inclusive AR AI Compu...,@algoface,,Passport facial recognition checks fail to wor...,57,,


In [272]:
df_reGTrDecomposed_NoAgree.tail(2)

Unnamed: 0.1,TID,Label,Target,Unnamed: 0,OrigTweet,CleanTweetNoHttp,CleanTweetNoSpecChar,HashTags,Mentions,InReplyTo,ArticleTitle,CountReplyTweets,CountReplyWords,CountReplyChars
2362,1224145491040129024,Unknown,0,123409,@elonmusk This guy is a genius I'd let him mix...,@elonmusk this guy is a genius i'd let him mix...,elonmusk this guy is a genius i had let him mi...,,@elonmusk,1223128794396938241,,,18,138
2363,1223556070263574528,Unknown,0,123410,@elonmusk I am making this mu Alarm tune and I...,@elonmusk i am making this mu alarm tune and i...,elonmusk i am making this mu alarm tune and i ...,,@elonmusk,1223128794396938241,,,15,98


In [273]:
len(df_reGTrDecomposed_NoAgree)

2383

In [275]:
df_reGTrDecomposed_NoAgree.to_csv('data/GTxM_Pass1/reGTr_SMR_Decomposed_NoAgree.csv')