## GTxM Pass 3

### Setup Env

In [1]:
# Pass 3 objective: remove international politics SMRs as they share terms with Human Rights, 
# and might be the reason the poor results in Pass2

In [1]:
import pandas as pd
import numpy as np
import re
from krippendorff_alpha import *

In [106]:
def removeSpCharLine(text):
    # remove all non ASCII characters and line breaks.
    # credit: https://stackoverflow.com/questions/2758921/regular-expression-that-finds-and-replaces-non-ascii-characters-with-python
    text.replace('\n', ' ')
    text.replace('\r', ' ')
    text = re.sub(r"[\u0080-\uFFFF]", " ", text) #see ASCII list: https://www.asciitable.com/
    text = " ".join(text.split()) # replace multiple spaces with a single space
    return(text+". ")

In [107]:
df_gtd_tokens1 = pd.read_csv('data/GTxM_Pass1/reGTr_Tokens.csv', dtype='str')
df_gtd_tokens2 = pd.read_csv('data/GTxM_Pass2/GTD_Pass2_To_3_tokens.csv', dtype='str')

In [108]:
len(df_gtd_tokens1), len(df_gtd_tokens2)

(854, 283)

In [109]:
df_gtd_tokens = pd.concat([df_gtd_tokens1,df_gtd_tokens2], axis = 0)

In [110]:
df_gtd_tokens.head(2)

Unnamed: 0.1,Unnamed: 0,RecID,Target,Label,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
0,0,1089699762331217920,1,Business,192,Peloton exercise bike ad mocked as being 'sexi...,Love putting my Peloton bike in the most strik...,118.0,hilarious pton homesteadexemption realproblems...,profgalloway dpshow dirrtydut danglebus nicksc...,peloton serious later coupl monthli minut king...,love peloton bike area hous wife glanc hubbi e...,keep trap doe pedal whogco buy broke buy play ...,matter away serious later fast btw though outd...,strike ultra modern nervou dark right perfect ...,Love putting my Peloton bike in the most strik...,Love putting my Peloton bike in the most strik...
1,1,1139309394968096768,6,Politics,291,Six Takeaways From Senators' Questions to Impe...,I would not have thought that I needed to say ...,25.0,clintonfoundation corruptcomplicitgop clintons...,ellenlweintraub k9dancerpovey nypapajoe killer...,agenda us elect statu quo unsustain american b...,intern corpor polit agenda us elect polit dona...,control influenc destroy wipe cheat breath myb...,care right behind forward total appar,vast unsustain unaccept question possibl fair ...,I would not have thought that I needed to say ...,I would not have thought that I needed to say ...


In [111]:
df_gtd_tokens.tail(2)

Unnamed: 0.1,Unnamed: 0,RecID,Target,Label,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary
281,281,1222910064236744704,6,Politics,4658,"L'�tat, C'est Trump","Assuming the Senate votes against witnesses, a...",87.0,impeachandremovetrumpnow moreimpeaching obstru...,delavegalaw medit8now amandionair lanning_laur...,senat trump bolton mulvaney giuliani american ...,senat wit trump bolton mulvaney giuliani peopl...,assum vote expect subpoena entitl hear play ex...,immedi exactli sadli care long away right espe...,american obscen sham explicit democrat biparti...,"Assuming the Senate votes against witnesses, a...","Assuming the Senate votes against witnesses, a..."
282,282,1222952347548164098,6,Politics,4662,"U.K. Leaves E.U., Embarking On an Uncertain Fu...",I am sad to see our British friends leave the ...,260.0,bernardtapie loveit 0doubt italy france brexit...,fathonie_ag_top devonlass nevenmaguire _hadley...,british eu brexit european us uk gcpsbasket br...,friend eu mandat brexit disrupt citizen employ...,leav act ensur unit understand mean trust unde...,perhap hope anytim qu un fulli therebi vigor f...,sad british littl possibl financi western brit...,I am sad to see our British friends leave the ...,I am sad to see our British friends leave the ...


In [112]:
df_gtd_tokens.groupby(['Label']).size()

Label
Business           75
Entertainment     153
Environmental      17
Health              4
Human Rights       83
Law and Order      12
Obituary          147
Politics          539
Social Stories     32
Sports             75
dtype: int64

In [113]:
df_gtd_tokens_politics = df_gtd_tokens[df_gtd_tokens.Label=='Politics']

In [114]:
len(df_gtd_tokens_politics)

539

In [115]:
df_gtd_tokens_hrights = df_gtd_tokens[df_gtd_tokens.Label=='Human Rights']

In [116]:
len(df_gtd_tokens_hrights)

83

In [117]:
df_GTD_Rec_Labels = df_gtd_tokens[['RecID','Label','Target']]

In [19]:
df_gtd_tokens_politics.to_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_politics_token.csv', index=False)

### After HO1 and HO2, get the JatoClassified files, compute intercoder

In [118]:
df_jato_pass3_HO1 = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass3_HO1/data/JatoClassified_Pass3_HO1.csv', dtype='str')

In [119]:
df_jato_pass3_HO1 = pd.merge(df_jato_pass3_HO1, df_gtd_tokens_politics['RecID'], on='RecID', how='inner')

In [120]:
len(df_jato_pass3_HO1)

539

In [121]:
df_jato_pass3_HO1.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,60,826262311560216578,2023:03:25 16:17:17,Political,Debate,,,,,,,Politics
1,257,1122651175688515584,2023:03:25 16:29:31,,Campaigns,,,,,,,Politics


In [122]:
df_jato_pass3_HO1.groupby(['NewsPubCat']).size()

NewsPubCat
Human Rights       16
Politics          417
World Politics    106
dtype: int64

In [123]:
Labels_Targets = pd.read_csv('data/Labels_TargetsV3.csv')

In [124]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [125]:
Labels_Targets.rename(columns={'Label': 'NewsPubCat'}, inplace=True)

In [126]:
df_jato_pass3_HO1_Target = pd.merge(df_jato_pass3_HO1,Labels_Targets, on='NewsPubCat', how='left')

In [127]:
df_jato_pass3_HO2 = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass3_HO2/data/JatoClassified_Pass3_HO2.csv', dtype='str')

In [128]:
df_jato_pass3_HO2 = pd.merge(df_jato_pass3_HO2, df_gtd_tokens_politics['RecID'], on='RecID', how='inner')

In [129]:
len(df_jato_pass3_HO2)

539

In [130]:
df_jato_pass3_HO2.head(2)

Unnamed: 0,rowid,RecID,SavedDataTime,JatoCat,JatoSubCat,JatoCat2,JatoSubCat2,JatoCat3,JatoSubCat3,NewRecClass,GTCodes,NewsPubCat
0,60.0,826262311560216578,2023:03:25 12:56:01,Political,Debate,,,,,,,Politics
1,257.0,1122651175688515584,2023:03:25 12:59:39,,Campaigns,,,,,,,Politics


In [131]:
df_jato_pass3_HO2.groupby(['NewsPubCat']).size()

NewsPubCat
Politics          466
World Politics     73
dtype: int64

In [132]:
df_jato_pass3_HO2_Target = pd.merge(df_jato_pass3_HO2,Labels_Targets, on='NewsPubCat', how='left')

In [133]:
df_jato_pass3_HO1_Target.drop(['rowid','SavedDataTime','JatoCat','JatoSubCat','JatoCat2','JatoSubCat2','JatoCat3','JatoSubCat3','NewRecClass','GTCodes'], axis=1, inplace=True)
df_jato_pass3_HO2_Target.drop(['rowid','SavedDataTime','JatoCat','JatoSubCat','JatoCat2','JatoSubCat2','JatoCat3','JatoSubCat3','NewRecClass','GTCodes'], axis=1, inplace=True)
df_jato_pass3_HO1_Target.rename(columns={'NewsPubCat': 'LabelHO1','Target': 'TargetHO1'}, inplace=True)
df_jato_pass3_HO2_Target.rename(columns={'NewsPubCat': 'LabelHO2','Target': 'TargetHO2'}, inplace=True)

In [134]:
# Merge HO1 and HO2 dataframes to get the agreed labels
df_jato_pass3_HO1HO2 = pd.merge(df_jato_pass3_HO1_Target,df_jato_pass3_HO2_Target, on='RecID', how='inner')
len(df_jato_pass3_HO1HO2)

539

In [135]:
df_jato_pass3_HO1HO2.head(2)

Unnamed: 0,RecID,LabelHO1,TargetHO1,LabelHO2,TargetHO2
0,826262311560216578,Politics,6,Politics,6
1,1122651175688515584,Politics,6,Politics,6


In [136]:
df_jato_pass3_HO1HO2_Agreed = df_jato_pass3_HO1HO2[df_jato_pass3_HO1HO2.TargetHO1 == df_jato_pass3_HO1HO2.TargetHO2]

In [137]:
len(df_jato_pass3_HO1HO2_Agreed)

475

In [138]:
df_jato_pass3_HO1HO2_NoAgree = df_jato_pass3_HO1HO2[df_jato_pass3_HO1HO2.TargetHO1 != df_jato_pass3_HO1HO2.TargetHO2]
len(df_jato_pass3_HO1HO2_NoAgree)

64

### Find krippendorff's alpha for the agreement between HO1 and HO2

In [139]:
# convert to horizontal array as expected by Krippendorff Alpha
HO1Label = np.stack(df_jato_pass3_HO1_Target['TargetHO1'])
HO2Label = np.stack(df_jato_pass3_HO2_Target['TargetHO2'])

In [140]:
missing = '-1'
arr = np.array((HO1Label,HO2Label))
alpha = krippendorff_alpha(arr, nominal_metric, missing_items=missing)
alpha

0.6062359682146141

### Prepare JatoMaster for HO3, where no HO1 and HO2 agreement

In [141]:
# old code, don't use
# df_jato_pass3_HO1HO2 = pd.concat([df_jato_pass3_HO1,df_jato_pass3_HO2], axis=0)
# df_jato_pass3_HO1HO2.drop_duplicates(subset=['RecID','NewsPubCat'], keep=False, inplace=True)
# df_jato_pass3_HO1HO2.drop_duplicates(subset=['RecID'], keep='first', inplace=True)

In [142]:
df_jatoMaster_Pass3_HO3 = pd.merge(df_gtd_tokens_politics, df_jato_pass3_HO1HO2_NoAgree['RecID'], on='RecID', how='inner')

In [143]:
len(df_jatoMaster_Pass3_HO3)

64

In [144]:
df_jatoMaster_Pass3_HO3.drop(['Unnamed: 0','smrAdverbs','smrAdjectives','Target','Label'], axis=1, inplace=True)

In [145]:
df_jatoMaster_Pass3_HO3['PubTitle'] = df_jatoMaster_Pass3_HO3['PubTitle'].apply(removeSpCharLine)
df_jatoMaster_Pass3_HO3['RecDoc'] = df_jatoMaster_Pass3_HO3['RecDoc'].apply(removeSpCharLine)
df_jatoMaster_Pass3_HO3['smrTopText'] = df_jatoMaster_Pass3_HO3['smrTopText'].apply(removeSpCharLine)
df_jatoMaster_Pass3_HO3['smrSummary'] = df_jatoMaster_Pass3_HO3['smrSummary'].apply(removeSpCharLine)

In [146]:
# df_jatoMaster_Pass3_HO3['PubTitle'] = df_jatoMaster_Pass3_HO3['PubTitle'].apply(lambda x: re.sub(r'\W+', ' ', x))
# df_jatoMaster_Pass3_HO3['RecDoc'] = df_jatoMaster_Pass3_HO3['RecDoc'].apply(lambda x: re.sub(r'\W+', ' ', x))
# df_jatoMaster_Pass3_HO3['smrTopText'] = df_jatoMaster_Pass3_HO3['smrTopText'].apply(lambda x: re.sub(r'\W+', ' ', x))
# df_jatoMaster_Pass3_HO3['smrSummary'] = df_jatoMaster_Pass3_HO3['smrSummary'].apply(lambda x: re.sub(r'\W+', ' ', x))
# df_jatoMaster_Pass3_HO3['PubTitle'] = df_jatoMaster_Pass3_HO3.PubTitle.str.replace('\n', ' ')
# df_jatoMaster_Pass3_HO3['RecDoc'] = df_jatoMaster_Pass3_HO3.RecDoc.str.replace('\n', ' ')
# df_jatoMaster_Pass3_HO3['smrTopText'] = df_jatoMaster_Pass3_HO3.smrTopText.str.replace('\n', ' ')
# df_jatoMaster_Pass3_HO3['smrSummary'] = df_jatoMaster_Pass3_HO3.smrSummary.str.replace('\n', ' ')
# df_jatoMaster_Pass3_HO3['PubTitle'] = df_jatoMaster_Pass3_HO3.PubTitle.str.replace('\r', ' ')
# df_jatoMaster_Pass3_HO3['RecDoc'] = df_jatoMaster_Pass3_HO3.RecDoc.str.replace('\r', ' ')
# df_jatoMaster_Pass3_HO3['smrTopText'] = df_jatoMaster_Pass3_HO3.smrTopText.str.replace('\r', ' ')
# df_jatoMaster_Pass3_HO3['smrSummary'] = df_jatoMaster_Pass3_HO3.smrSummary.str.replace('\r', ' ')

In [147]:
df_jatoMaster_Pass3_HO3.head(2)

Unnamed: 0,RecID,rowid,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrTopText,smrSummary
0,1179179573541511176,529,False ?Coup? Claims by Trump Echo as Unifying ...,"As I learn more and more each day, I am coming...",42.0,traitor istandwithpresidenttrump putinspuppet ...,t_wwg1wga patriotpure realjediman1 twitter go4...,power second zelenski second american russian ...,conclus place impeach coup power whistleblow a...,learn intend grant impeach accept derang lie d...,"As I learn more and more each day, I am coming...","As I learn more and more each day, I am coming..."
1,1179699937623379969,555,Barr and a Top Prosecutor Cast a Wide Net in R...,Australia?s response to Senator Lindsey Graham...,260.0,highcrimes spygate trumpisanationalsecuritythr...,king2712king jakesherman jeffsmybrother cnn do...,lindsey graham dem gop democrat fbi foe especi...,respons senat lindsey graham spi dem wongcot g...,leak work shame consid tri harm leak hide chal...,Australia?s response to Senator Lindsey Graham...,Australia?s response to Senator Lindsey Graham...


In [74]:
df_jatoMaster_Pass3_HO3.to_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass3_HO3/data/JatoMaster.csv', index=False)

### After HO3, get the JatoClassified file, compute Krippendorff's Alpha

In [148]:
df_jato_pass3_HO3 = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/JatoPass3_HO3/data/JatoClassified_Pass3_HO3.csv', dtype='str')

In [149]:
df_jato_pass3_HO3.drop(['rowid','SavedDataTime','JatoCat','JatoSubCat','JatoCat2','JatoSubCat2','JatoCat3','JatoSubCat3','NewRecClass','GTCodes'], axis=1, inplace=True)

In [150]:
df_jato_pass3_HO3.rename(columns={'NewsPubCat': 'Label'}, inplace=True)

In [151]:
df_jato_pass3_HO3.head(2)

Unnamed: 0,RecID,Label
0,222818213392678912,Politics
1,826262311560216578,Politics


In [152]:
Labels_Targets = pd.read_csv('data/Labels_TargetsV3.csv')

In [153]:
Labels_Targets.head(2)

Unnamed: 0,Target,Label
0,1,Business
1,2,Entertainment


In [154]:
df_jato_pass3_HO3_Target = pd.merge(df_jato_pass3_HO3,Labels_Targets, on='Label', how='left')

In [155]:
df_jato_pass3_HO3_Target.head(2)

Unnamed: 0,RecID,Label,Target
0,222818213392678912,Politics,6
1,826262311560216578,Politics,6


In [156]:
df_jato_pass3_HO3_Target.rename(columns={'Label': 'LabelHO3','Target': 'TargetHO3'}, inplace=True)
len(df_jato_pass3_HO3_Target)

2385

In [157]:
df_jato_pass3_HO1HO2_NoAgree.head(2)

Unnamed: 0,RecID,LabelHO1,TargetHO1,LabelHO2,TargetHO2
5,1179179573541511176,World Politics,12,Politics,6
7,1179699937623379969,World Politics,12,Politics,6


In [158]:
df_jato_pass3_HO3HO2HO1 = pd.merge(df_jato_pass3_HO1HO2_NoAgree, df_jato_pass3_HO3_Target, on='RecID', how='inner')

In [159]:
len(df_jato_pass3_HO3HO2HO1)

64

In [160]:
df_jato_pass3_HO3HO2HO1.head(2)

Unnamed: 0,RecID,LabelHO1,TargetHO1,LabelHO2,TargetHO2,LabelHO3,TargetHO3
0,1179179573541511176,World Politics,12,Politics,6,Politics,6
1,1179699937623379969,World Politics,12,Politics,6,World Politics,12


In [169]:
df_jato_pass3_HO3HO2HO1.groupby(['LabelHO3']).size()

LabelHO3
Human Rights       3
Politics          44
World Politics    17
dtype: int64

In [161]:
def AgreeTargetWithA3(A1Target, A2Target, A3Target):
    AgreedTarget = -1
    if (A1Target == A3Target):
        AgreedTarget = A3Target
    if (A2Target == A3Target):
        AgreedTarget = A3Target
    return AgreedTarget

In [162]:
df_jato_pass3_HO3HO2HO1['AgreedTarget'] = df_jato_pass3_HO3HO2HO1.apply(lambda x: AgreeTargetWithA3(x['TargetHO1'], x['TargetHO2'], x['TargetHO3']), axis=1)

In [163]:
# convert to horizontal array as expected by Krippendorff Alpha
HO3Target = np.stack(df_jato_pass3_HO3HO2HO1['TargetHO3'].astype("string"))
AgreedTarget = np.stack(df_jato_pass3_HO3HO2HO1['AgreedTarget'].astype("string"))

In [164]:
missing = '-1'
arr = np.array((HO3Target,AgreedTarget))
alpha2 = krippendorff_alpha(arr, nominal_metric, missing_items=missing)
alpha2

0.9339058027582617

In [165]:
df_GTD_pass3_HO3HO2HO1_Agree = df_jato_pass3_HO3HO2HO1[df_jato_pass3_HO3HO2HO1.TargetHO3 == df_jato_pass3_HO3HO2HO1.AgreedTarget]
df_GTD_pass3_HO3HO2HO1_NoAgree = df_jato_pass3_HO3HO2HO1[df_jato_pass3_HO3HO2HO1.TargetHO3 != df_jato_pass3_HO3HO2HO1.AgreedTarget]

In [166]:
len(df_GTD_pass3_HO3HO2HO1_Agree), len(df_GTD_pass3_HO3HO2HO1_NoAgree)

(62, 2)

### Generate New GTD

In [64]:
df_jato_pass3_HO1HO2_Agreed.head(2)

Unnamed: 0,RecID,LabelHO1,TargetHO1,LabelHO2,TargetHO2
0,826262311560216578,Politics,6,Politics,6
1,1122651175688515584,Politics,6,Politics,6


In [65]:
df_jato_pass3_HO1HO2_Agreed.drop(['LabelHO2','TargetHO2'], axis=1, inplace=True)
df_jato_pass3_HO1HO2_Agreed.rename(columns={'LabelHO1': 'Label','TargetHO1': 'Target'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [66]:
df_jato_pass3_HO1HO2_Agreed.head(2)

Unnamed: 0,RecID,Label,Target
0,826262311560216578,Politics,6
1,1122651175688515584,Politics,6


In [67]:
len(df_jato_pass3_HO1HO2_Agreed)

475

In [68]:
df_GTD_pass3_HO3HO2HO1_Agree.head(2)

Unnamed: 0,RecID,LabelHO1,TargetHO1,LabelHO2,TargetHO2,LabelHO3,TargetHO3,AgreedTarget
0,1179179573541511176,World Politics,12,Politics,6,Politics,6,6
1,1179699937623379969,World Politics,12,Politics,6,World Politics,12,12


In [69]:
len(df_GTD_pass3_HO3HO2HO1_Agree)

62

In [170]:
df_GTD_pass3_HO3HO2HO1_Agree.groupby(['LabelHO3']).size()

LabelHO3
Human Rights       3
Politics          43
World Politics    16
dtype: int64

In [70]:
df_GTD_pass3_HO3HO2HO1_Agree.drop(['LabelHO1','TargetHO1','LabelHO2','TargetHO2','AgreedTarget'], axis=1, inplace=True)
df_GTD_pass3_HO3HO2HO1_Agree.rename(columns={'LabelHO3': 'Label','TargetHO3': 'Target'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [71]:
df_GTD_pass3_HO3HO2HO1_Agree.head(2)

Unnamed: 0,RecID,Label,Target
0,1179179573541511176,Politics,6
1,1179699937623379969,World Politics,12


In [72]:
df_GTD_pass3_Agree = pd.concat([df_jato_pass3_HO1HO2_Agreed, df_GTD_pass3_HO3HO2HO1_Agree], axis=0)

In [73]:
len(df_GTD_pass3_Agree)

537

In [74]:
df_GTD_pass3_NoWorldPolitics = df_GTD_pass3_Agree[df_GTD_pass3_Agree.Label != 'World Politics']

In [75]:
len(df_GTD_pass3_NoWorldPolitics)

459

In [76]:
df_GTD_pass3_WorldPolitics = df_GTD_pass3_Agree[df_GTD_pass3_Agree.Label == 'World Politics']
len(df_GTD_pass3_WorldPolitics)

78

In [77]:
# New GTD = Original GTD - Politics + df_GTD_pass3_NoWorldPolitics
df_gtd_tokens_NoPolitics = df_gtd_tokens[df_gtd_tokens.Label!='Politics']
df_gtd_tokens_NoPolitics = df_gtd_tokens_NoPolitics[['RecID','Label','Target']]

In [78]:
len(df_gtd_tokens_NoPolitics)

598

In [79]:
df_gtd_tokens_NoPolitics.head(2)

Unnamed: 0,RecID,Label,Target
0,1089699762331217920,Business,1
2,1159148971106942981,Sports,11


In [80]:
df_Pass3_GTD = pd.concat([df_gtd_tokens_NoPolitics,df_GTD_pass3_NoWorldPolitics], axis=0)
len(df_Pass3_GTD)

1057

In [81]:
# randomize the dataset to prevent all Politics placed at the end
df_Pass3_GTD = df_Pass3_GTD.sample(frac=1).reset_index(drop=True)

In [82]:
df_Pass3_GTD.to_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv', index=False)

### Generate New Rejected list = Existing + WorldPolitics + HO3HO2HO1_NoAgree

In [91]:
df_Pass3_Rejected = pd.DataFrame(pd.concat([df_GTD_pass3_WorldPolitics['RecID'], df_GTD_pass3_HO3HO2HO1_NoAgree['RecID']], axis=0))
len(df_Pass3_Rejected)

80

In [92]:
df_Pass3_Rejected.head(2)

Unnamed: 0,RecID
25,1181085835942232064
26,1181181496784228352


In [85]:
df_Pass3_Rejected.to_csv('data/GTxM_Pass3/GTxM_Pass3_Reject.csv', index=False)

In [98]:
# Pass 1 Rejected
df_pass1_rejected = pd.read_csv('data/GTxM_Pass1/reGTr_Tokens_NoAgree.csv')
df_pass1_rejected = pd.DataFrame(df_pass1_rejected['RecID'])
len(df_pass1_rejected)

19

In [99]:
df_pass1_rejected.head(2)

Unnamed: 0,RecID
0,1180954142396710912
1,1181212982128328705


In [100]:
# Pass 2 Rejected
df_pass2_rejected = pd.read_csv('data/GTxM_Pass2/GTxM_Pass2_Reject.csv')
len(df_pass2_rejected)

124

In [101]:
df_pass2_rejected.head(2)

Unnamed: 0,RecID
0,1222242112307187712
1,1222223980909813761


In [102]:
# consolidate all rejected
df_pass3_rejected_all = pd.concat([df_Pass3_Rejected['RecID'],df_pass2_rejected['RecID'],df_pass1_rejected['RecID']], axis=0)
len(df_pass3_rejected_all)

223

In [103]:
df_pass3_rejected_all.head(2)

25    1181085835942232064
26    1181181496784228352
Name: RecID, dtype: object

In [104]:
df_pass3_rejected_all.to_csv('data/GTxM_Pass3/GTxM_Pass3_Reject_UpTodate.csv', index=False)

### Prepare the dataframe statements for GTxM Classifiers

In [3]:
data = pd.read_csv('data/MasterTokens.csv', encoding='ISO-8859-1')

In [4]:
df_GTD_Rec = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv')

In [5]:
data = pd.merge(data, df_GTD_Rec, on='RecID')

In [6]:
len(data)

1057

In [7]:
data.head(2)

Unnamed: 0,rowid,RecID,PubTitle,RecDoc,countSupTweets,smrHashtags,smrMentions,smrNER,smrNouns,smrVerbs,smrAdverbs,smrAdjectives,smrTopText,smrSummary,Label,Target
0,60,826262311560216578,Assessing the Impeachment Defenses Offered by ...,#coup has started. First of many steps. #rebel...,76.0,rebellion pervertprotector crossfirehurricaneb...,markszaidesq object__observe realdonaldtrumps ...,wtelf constitut senat host biden ukrain burism...,step peopl countri problem busi oil weapon wte...,start follow discrimin sell step suppos hand p...,ultim forth serious mostli btw certainli rabid...,funni multipl sad human hoax constitut pro quo...,#coup has started. First of many steps. #rebel...,has started.\nFunny you want to discriminate a...,Politics,6
1,63,833502973204459520,Ringo Starr: Abbey Road wasn't meant to be The...,Thanks for coming over man and playing Great ...,13.0,,ringostarrmusic,ringo paul fn fn fn fn fn fn fyifyifyifyifyify...,man bass peac love photo memori band music guy...,play love man love order live play love beat f...,togeth forev especi nearli,great great greatest happi love ador beauti wo...,Thanks for coming over man and playing Great ...,Summarization skipped (text is 1000 characters...,Entertainment,2


In [28]:
data.groupby(['Label','Target']).size()

Label           Target
Business        1          75
Entertainment   2         153
Environmental   3          17
Health          4           4
Human Rights    5          86
Law and Order   7          12
Obituary        9         147
Politics        6         456
Social Stories  10         32
Sports          11         75
dtype: int64

#### prep for BERT/XLNet

In [2]:
df_CoreCGT = pd.read_csv('data/CGTexpandedSMR_Data.csv', dtype='str', usecols=['TID', 'OrigTweet', 'InReplyTo'])
df_CoreCGT.head(2)

Unnamed: 0,TID,OrigTweet,InReplyTo
0,222818213392678912,Seedy lists of party apparatchiks appointed by...,
1,1207942378688040961,@ZacGoldsmith @mrjamesob Do you think losing y...,2.2281821339267888e+17


In [3]:
df_CoreGTr = pd.read_csv('data/GroundTruthBERT.csv', dtype='str', usecols=['TID', 'OrigTweet', 'InReplyTo'])
df_CoreGTr.head(2)

Unnamed: 0,TID,OrigTweet,InReplyTo
0,826262311560216578,#coup has started. First of many steps. #rebel...,
1,1193437298303438858,@MarkSZaidEsq @jody_prichard Funny you want to...,8.262623115602164e+17


In [4]:
df_Core = pd.concat([df_CoreGTr, df_CoreCGT], axis=0)

In [5]:
len(df_Core)

322199

In [6]:
df_Core_Rec = df_Core[df_Core.InReplyTo.isna()]
len(df_Core_Rec)

2385

In [7]:
df_Core_Sup = df_Core[df_Core.InReplyTo.notna()]
len(df_Core_Sup)

319814

In [8]:
df_GTD_Rec = pd.read_csv('data/GTxM_Pass3/GTxM_Pass3_GTD_UpTodate.csv', dtype='str')
len(df_GTD_Rec)

1057

In [9]:
df_GTD_Rec.head(2)

Unnamed: 0,RecID,Label,Target
0,1207761446513319936,Politics,6
1,1180079141087055872,Politics,6


In [10]:
df_GTD_Rec.rename(columns={'RecID': 'TID'}, inplace=True)
df_Core_Rec = pd.merge(df_Core_Rec, df_GTD_Rec, on='TID')
len(df_Core_Rec)

1057

In [11]:
df_GTD_Rec.rename(columns={'TID': 'InReplyTo'}, inplace=True)
df_Core_Sup = pd.merge(df_Core_Sup, df_GTD_Rec, on='InReplyTo')
len(df_Core_Sup)

157436

In [130]:
df = pd.concat([df_Core_Rec, df_Core_Sup], axis=0)

In [131]:
df = df.sample(frac=1,axis=0,ignore_index=True)

In [132]:
len(df)

158493

In [133]:
df.head()

Unnamed: 0,TID,OrigTweet,InReplyTo,Label,Target
0,1201948365249925120,@Leahgreenb You're acting like Biden doesn't h...,1201934660789510144,Human Rights,5
1,1192258461523992582,@mattgaetz Matt Gaetz getting behind the wheel...,1192218741033046016,Politics,6
2,1220502912465588228,@GretaThunberg whats your contingency plan if ...,1220355420600008704,Environmental,3
3,1200671040675950593,@TheSteinLine @R_Greezy,1200658086731628544,Sports,11
4,1220779702270484481,@BorisJohnson @AlunCairns I'm really looking f...,1220759425868275712,Politics,6


In [134]:
df.groupby(['Label','Target']).size()

Label           Target
Business        1          8903
Entertainment   2         21955
Environmental   3          2592
Health          4           715
Human Rights    5         14253
Law and Order   7          2098
Obituary        9         16841
Politics        6         75573
Social Stories  10         3424
Sports          11        12139
dtype: int64

In [135]:
df_Rec = df[df.InReplyTo.isna()]

In [136]:
df_Rec.groupby(['Label','Target']).size()

Label           Target
Business        1          75
Entertainment   2         153
Environmental   3          17
Health          4           4
Human Rights    5          86
Law and Order   7          12
Obituary        9         147
Politics        6         456
Social Stories  10         32
Sports          11         75
dtype: int64

In [137]:
df_Core_Sup.groupby(['Label','Target']).size()

Label           Target
Business        1          8828
Entertainment   2         21802
Environmental   3          2575
Health          4           711
Human Rights    5         14167
Law and Order   7          2086
Obituary        9         16694
Politics        6         75117
Social Stories  10         3392
Sports          11        12064
dtype: int64

In [138]:
len(df)

158493

In [139]:
# Remove Label with low SMR counts
df = df[df.Label != 'Social Stories']
df = df[df.Label != 'Law and Order']
df = df[df.Label != 'Environmental']
df = df[df.Label != 'Health']
len(df)

149664

In [140]:
def remove1stMent(text):
  if text[0] == '@':
      words = text.split()
      words = words[1:] # remove the @mention word
      text = " ".join(words).lstrip()
  return(text)

def removeSpecialChar(text):
  #old text = re.sub(r'\W', ' ', text) #replace ALL non-word characters, including emojis with space
  # remove all non ASCII characters 

  # credit: https://stackoverflow.com/questions/2758921/regular-expression-that-finds-and-replaces-non-ascii-characters-with-python
  text = re.sub(r"[\u0080-\uFFFF]", " ", text) #see ASCII list: https://www.asciitable.com/
  text = " ".join(text.split()) # replace multiple spaces with a single space
  return(text)

def removeDigits(text):
  text = re.sub(r'\d', ' ', text) #replace digits with space
  return(text)

def cleanColloquials(text):
  #Replace common abbreviations and slangs
  text = text.replace(" i m "," i am ")
  text = text.replace(" i ve "," i have ")
  text = text.replace(" i ll "," i will ")
  text = text.replace(" i d "," i had ")
  text = text.replace(" that s "," that is ")
  text = text.replace(" isn t "," is not ")
  text = text.replace(" it s "," it is ")
  text = text.replace(" she s "," she is ")
  text = text.replace(" he s "," he is ")
  text = text.replace(" u "," you ")
  text = text.replace(" ur "," your ")
  text = text.replace(" b4 "," before ")
  text = text.replace(" wasnt "," was not ")
  text = text.replace(" wasn t "," was not ")
  text = text.replace(" cant "," can not ")
  text = text.replace(" can t "," can not ")
  text = text.replace(" couldnt "," could not ")
  text = text.replace(" couldn t "," could not ")
  text = text.replace(" wouldnt "," would not ")
  text = text.replace(" wouldn t "," would not ")
  text = text.replace(" dont "," do not ")
  text = text.replace(" don t "," do not ")
  text = text.replace(" didnt "," did not ")
  text = text.replace(" didn t "," did not ")
  text = text.replace(" let s "," let us ")
  text = text.replace(" i'm "," i am ")
  text = text.replace(" i've "," i have ")
  text = text.replace(" i'll "," i will ")
  text = text.replace(" i'd "," i had ")
  text = text.replace(" that's "," that is ")
  text = text.replace(" isn't "," is not ")
  text = text.replace(" it's "," it is ")
  text = text.replace(" she's "," she is ")
  text = text.replace(" he's "," he is ")
  text = text.replace(" u "," you ")
  text = text.replace(" ur "," your ")
  text = text.replace(" b4 "," before ")
  text = text.replace(" wasn't "," was not ")
  text = text.replace(" can't "," can not ")
  text = text.replace(" couldn't "," could not ")
  text = text.replace(" wouldn't "," would not ")
  text = text.replace(" don't "," do not ")
  text = text.replace(" didn't "," did not ")
  text = text.replace(" let's "," let us ")
  text = text.replace(" luv "," love ")
  text = text.replace(" true "," truth ")
  text = text.replace(" ppl "," people ")
  text = text.replace(" fb "," facebook ")
  text = text.replace(" b day "," birthday ")
  text = text.replace(" bday "," birthday ")
  if (len(text.strip())  == 0):
      text = ' ' #replace None with a single space
  return(text)

def removeHashtags(text):
  words = text.split()
  words = filter(lambda x:x[0]!='#', words)
  text = " ".join(words)
  if (len(text.strip())  == 0):
      text = ' ' #replace None with a single space
  return(text)

def removeMentions(text):
  words = text.split()
  words = filter(lambda x:x[0]!='@', words)
  text = " ".join(words)
  if (len(text.strip())  == 0):
      text = ' ' #replace None with a single space
  return(text)

def removeHttpWeb(text):
  words = text.split()
  words = filter(lambda x:x[0]!='&', words)
  words = filter(lambda x:x[0:4]!='http', words)
  text = " ".join(words)
  if (len(text.strip())  == 0):
      text = ' ' #replace None with a single space
  return(text)

def removeUnicode(text):
  #convert unicode chars to ascii
  text = unidecode(text)
  return(text)



In [141]:
# CLEANUP FACTORY
def cleanup(text):
  #Scenario1: 
  #text = removeUnicode(text)
  text = removeSpecialChar(text)
  text = remove1stMent(text)
  text = removeHttpWeb(text)
  text = cleanColloquials(text)

  #Scenario2: remove1stMent, removeHttpWeb, removeSpecialChar, cleanColloquials
  #Scenario3: remove1stMent, removeHttpWeb, removeHashtags, removeSpecialChar, cleanColloquials
  return(text)

In [142]:
df['CleanTweet'] = df['OrigTweet'].apply(cleanup)
#remove1stMentNoSpecChar('@MarkSZaidEsq @tko From the evidence and more to co')
# df['CleanTweet'] = df['CleanTweetNoHttp'].apply(remove1stMentNoSpecChar)

In [143]:
print(f"The dataset contains { df.Target.nunique() } unique categories")

The dataset contains 6 unique categories


In [144]:
# convert the tweets into lower case if uncased model.
df['CleanTweet'] = df['CleanTweet'].apply(lambda x: str(x).lower())
# trim to 280 characters max
df['CleanTweet'] = df['CleanTweet'].str.slice(0,279)
# calculating the length of tweet
df['CleanTweet_len'] = df['CleanTweet'].apply(lambda x: len(str(x).split()))
# Remove tweets with less than 10 words
df = df.query('CleanTweet_len > 9')

In [145]:
len(df)

122716

In [146]:
# check that the RecTweets are still all in the dataset after the cleansing
df_RecTweets = df[df.InReplyTo.isna()]
df_RecTweets.groupby(['Label','Target']).size()

Label          Target
Business       1          67
Entertainment  2         139
Human Rights   5          82
Obituary       9         138
Politics       6         436
Sports         11         58
dtype: int64

In [147]:
df['Label'].unique()

array(['Human Rights', 'Politics', 'Obituary', 'Sports', 'Entertainment',
       'Business'], dtype=object)

In [148]:
df['Target'].unique()

array(['5', '6', '9', '11', '2', '1'], dtype=object)

In [149]:
df.groupby(['Label','Target']).size()

Label          Target
Business       1          7407
Entertainment  2         16623
Human Rights   5         11927
Obituary       9         11406
Politics       6         66470
Sports         11         8883
dtype: int64