In [1]:
import scipy.stats as stats
import numpy as np
import pandas as pd

### Hypothesis RH1
Social Media Records (SMRs) are Social Media Contents (SMCs) to which other users have interacted (replied, forwarded, liked or responded) as allowed on the social media platform.

##### Prepare data

In [2]:
dfRAW = pd.read_csv("D:/KOPro/PhD/DataCollection/Datasets/NewsCited/Consolidated/RecTweets.csv", usecols=['id','favorite_count','retweet_count','possibly_sensitive','user_verified'], dtype='str')
len(dfRAW)

5600

In [3]:
dfRAW = dfRAW[dfRAW.id != 'id']
len(dfRAW)

5597

In [4]:
# rename fields in dfRecTweets
dfRAW.rename(columns={'id': 'RecID', 'favorite_count': 'likes_count'}, inplace=True)

In [5]:
dfRAW.drop_duplicates(subset='RecID', keep='last', inplace=True)
len(dfRAW)

5016

In [6]:
dfRAW.to_csv('dfRAW.csv')

<b>SMC (Master)

In [7]:
dfSMC = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/py38/GTxM/data/MasterTokens.csv', usecols=['RecID','countSupTweets'], dtype='str', encoding='ISO-8859-1')
dfSMC.rename(columns={'countSupTweets': 'reply_count'}, inplace=True)
len(dfSMC)

4684

In [8]:
# merge
dfRH_SMC = pd.merge(dfSMC, dfRAW, on='RecID', how='left')
len(dfRH_SMC)

4684

In [9]:
dfRH_SMC.head()

Unnamed: 0,RecID,reply_count,likes_count,possibly_sensitive,retweet_count,user_verified
0,11947603240,0.0,2,,0,True
1,12643331537,1.0,23,,5,True
2,23257004857,0.0,25,,32,False
3,210423753388208128,0.0,1004,,1629,True
4,211098255919038464,0.0,4,False,4,False


In [10]:
dfRH_SMC.to_csv('dfRH_SMC.csv')

<b>SMR

In [11]:
dfSMR = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/py38/GTxM/data/MasterTokens_10to260SupTweets.csv', usecols=['RecID'], dtype='str', encoding='ISO-8859-1')
len(dfSMR)

2385

In [12]:
dfSMR.drop_duplicates(subset='RecID', keep='last', inplace=True)
len(dfSMR)

2385

In [13]:
# rename fields in dfSMR
dfSMR.rename(columns={'countSupTweets': 'reply_count'}, inplace=True)

In [14]:
dfSMR.to_csv('dfSMR.csv')

In [15]:
# dfSMC['RecID'] = dfSMC['RecID'].astype('string')
# dfSMR['RecID'] = dfSMR['RecID'].astype('string')

In [16]:
# merge
dfRH_SMR = pd.merge(dfSMR, dfRH_SMC, on='RecID', how='left')
len(dfRH_SMR)

2385

In [17]:
dfRH_SMR.head()

Unnamed: 0,RecID,reply_count,likes_count,possibly_sensitive,retweet_count,user_verified
0,222818213392678912,260.0,1443,,1888,True
1,826262311560216578,76.0,1689,False,2657,True
2,833502973204459520,13.0,40190,False,14400,True
3,835347243020451840,39.0,51275,,29026,False
4,867832469181128704,24.0,23297,False,8400,True


<b>GTD

In [18]:
dfGTD = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/py38/GTxM/data/GTxM_Pass5/GTxM_Pass5_GTD_UpTodate.csv', dtype='str')
len(dfGTD)

1295

In [19]:
dfGTD.head()

Unnamed: 0,RecID,Label,Target
0,1207761446513319936,Politics,6
1,1180079141087055872,Politics,6
2,1179396754665549829,Obituary,9
3,1180775281067462658,Entertainment,2
4,1208021541323202562,Politics,6


In [20]:
dfGTD = dfGTD[dfGTD.Label != 'Health']
dfGTD = dfGTD[dfGTD.Label != 'Law and Order']
len(dfGTD)

1261

In [21]:
dfGTD = dfGTD['RecID']

In [22]:
# merge GTD
dfRH_GTD = pd.merge(dfGTD, dfRH_SMR, on='RecID', how='inner')
len(dfRH_GTD)

1261

In [23]:
dfRH_GTD.to_csv('dfRH_GTD.csv')

In [24]:
dfRH_GTD.drop_duplicates(subset='RecID', keep=False, inplace=True)
len(dfRH_GTD)

1261

##### Try a proportions test

In [29]:
stats.barnard_exact([[30, 0], [4684, 2835]], alternative='less')

BarnardExactResult(statistic=4.256053123746618, pvalue=1.0)

##### Try adding up the interaction columns and do a one-sample test

In [30]:
dfRH_SMR['reply_count']= dfRH_SMR['reply_count'].apply(lambda x: int(float(x)))

In [31]:
dfRH_SMR['Interaction'] = dfRH_SMR['reply_count'] + dfRH_SMR['likes_count'].astype(int)+ dfRH_SMR['retweet_count'].astype(int)
dfRH_SMR.head()

Unnamed: 0,RecID,reply_count,likes_count,possibly_sensitive,retweet_count,user_verified,Interaction
0,222818213392678912,260,1443,,1888,True,3591
1,826262311560216578,76,1689,False,2657,True,4422
2,833502973204459520,13,40190,False,14400,True,54603
3,835347243020451840,39,51275,,29026,False,80340
4,867832469181128704,24,23297,False,8400,True,31721


In [32]:
from statsmodels.stats.weightstats import ztest as ztest
ztest(dfRH_SMR['Interaction'], value=0)

(11.480460112744092, 1.653979777743639e-30)

### Hypothesis RH2
RH: Tweets created by verified user accounts have more conversational replies than those created by unverified user accounts.<br>
RH0: Tweets created by verified user accounts have the same number of replies as unverified user accounts<br>
RH1: Tweets created by verified user accounts have more replies as unverified user accounts<br>
Test plan: 2 sample t-test.

<b>SMC

In [33]:
dfRH2_SMC = pd.DataFrame()
dfRH2_SMC['user_verified'] = dfRH_SMC['user_verified']
dfRH2_SMC['reply_count_yes_no'] = dfRH_SMC['reply_count'].apply (lambda x: 'Yes' if float(x) > 0 else 'No')
dfRH2_SMC.groupby(by=['user_verified','reply_count_yes_no']).size()

user_verified  reply_count_yes_no
False          No                     403
               Yes                   1028
True           No                     619
               Yes                   2634
dtype: int64

<b>SMR

In [34]:
dfRH2_SMR = pd.DataFrame()
dfRH2_SMR['user_verified'] = dfRH_SMR['user_verified']
dfRH2_SMR['reply_count_yes_no'] = dfRH_SMR['reply_count'].apply (lambda x: 'Yes' if float(x) > 0 else 'No')
dfRH2_SMR.groupby(by=['user_verified','reply_count_yes_no']).size()

user_verified  reply_count_yes_no
False          Yes                    489
True           Yes                   1896
dtype: int64

<b>GTD

In [35]:
dfRH2_GTD = pd.DataFrame()
dfRH2_GTD['user_verified'] = dfRH_GTD['user_verified']
dfRH2_GTD['reply_count_yes_no'] = dfRH_GTD['reply_count'].apply (lambda x: 'Yes' if float(x) > 0 else 'No')
dfRH2_GTD.groupby(by=['user_verified','reply_count_yes_no']).size()

user_verified  reply_count_yes_no
False          Yes                    177
True           Yes                   1084
dtype: int64

In [36]:
Verified_Reply = [2634, 1896, 1106]
Unverified_Reply = [1028, 489, 189]
# Conduct normal t-Test since the data groups have equal sizes
stats.ttest_ind(a=Verified_Reply, b=Unverified_Reply, alternative='greater')

Ttest_indResult(statistic=2.594760412943199, pvalue=0.03019014818601733)

### Hypothesis RH3
Majority of the social media records cited in news articles are non-records.

In [37]:
print('Count of SMC is ' + str(len(dfSMC)))
print('Count of GTD is ' + str(len(dfGTD)))
print('Count of SMC - GTD is ' + str(len(dfSMC)- len(dfGTD)))

Count of SMC is 4684
Count of GTD is 1261
Count of SMC - GTD is 3423


### Hypothesis RH4
All tweets flagged as ‘possibly_sensitive’ are social media records

In [38]:
dfRH4_SMC = dfRH_SMC.convert_dtypes()
dfRH4_SMR = dfRH_SMR.convert_dtypes()
dfRH4_GTD = dfRH_GTD.convert_dtypes()

<b>SMC

In [39]:
dfRH4_SMC = dfRH4_SMC[dfRH4_SMC.possibly_sensitive == 'True']
len(dfRH4_SMC)

19

<b>SMR

In [40]:
dfRH4_SMR = dfRH4_SMR[dfRH4_SMR.possibly_sensitive == 'True']
len(dfRH4_SMR)

7

<b>GTD

In [41]:
dfRH4_GTD = dfRH4_GTD[dfRH4_GTD.possibly_sensitive == 'True']
len(dfRH4_GTD)

6

### Hypothesis RH5
H0: There is no deference in performance between Traditional ML and DL algorithms for the classification of social media records
<br>H1: Deep Learning (DL) algorithms will perform differently than Traditional ML algorithms in the classification of social media records<br>
Test plan: Welch's 2-sample t-test of independence

In [2]:
# Define data groups
# TradML_F1Score = [87.13, 82.5, 72.79, 75.45]
# DL_F1Score = [92.29, 91.41, 63.51] # with LSTM
# Conduct Welch's t-Test with equal_var = False, because the data groups have unequal sizes
# stats.ttest_ind(a=TradML_F1Score, b=DL_F1Score, equal_var = False, alternative='two-sided')

# use top 3 trad ML and regular t-test
TradML_F1Score = [87.13, 82.5, 75.45]  
DL_F1Score = [92.29, 91.41, 63.51] # with LSTM
stats.ttest_ind(a=TradML_F1Score, b=DL_F1Score, alternative='two-sided')

Ttest_indResult(statistic=-0.07070532894054996, pvalue=0.9470261610939597)

### Hypothesis RH6
H0: There is no deference in performance between Traditional ML and DL algorithms for the classification of social media records
<br>H1: Deep Learning (DL) algorithms perform better than Traditional ML algorithms in the classification of social media records<br>
Test plan: Welch's 2-sample t-test of independence

In [3]:
# Define data groups
TradML_F1Score = [87.13, 82.5, 75.45]
DL_F1Score = [92.29, 91.41, 91.63] # with T5
#regular t-test
stats.ttest_ind(a=TradML_F1Score, b=DL_F1Score, alternative='less')

# Conduct Welch's t-Test with equal_var = False, because the data groups have unequal sizes
# stats.ttest_ind(a=TradML_F1Score, b=DL_F1Score, equal_var = False, alternative='less')

Ttest_indResult(statistic=-2.960425409528834, pvalue=0.020768445179274647)

### Hypothesis RH7
H0: Accuracy of predicting decomposed SMR will be the same as accuracy of recomposed SMR<br>
H1: Accuracy of predicting decomposed SMR will be less than as accuracy of recomposed SMR. <br>
Test plan: 2-sample t-test of independence

In [48]:
# Define data groups
Decomposed_Acc = [87.06,86.07,84.61]
Recomposed_Acc = [91.75,91.40,88.73]
# Conduct normal t-Test since the data groups have equal sizes
stats.ttest_ind(a=Decomposed_Acc, b=Recomposed_Acc, alternative='less')

Ttest_indResult(statistic=-3.9610795820964153, pvalue=0.008331288596383426)

### Hypothesis RH8
RH: Class imbalance will cause the GTxM Classifier algorithms to underperform through the GTxM continuum passes <br>
H0: Class imbalance will not affect the GTxM Classifier algorithms' performance through the GTxM continuum passes <br>
H1: Class imbalance will lead to lower GTxM Classifier algorithm performance through the GTxM continuum passes <br>
Test plan: Perform a one-sample t-test to see if the mean of an unknown population (Avg_Passes1to5_F1score) is lower than known value (Avg_Pass0_F1score). 

In [49]:
Avg_Pass0_F1score = 90.12
Avg_Passes1to5_F1score = [90.50, 91.60, 91.73, 92.52, 93.82]
# Conduct one-sample t-Test against known value
stats.ttest_1samp(a=Avg_Passes1to5_F1score, popmean=Avg_Pass0_F1score, alternative='less')

Ttest_1sampResult(statistic=3.4771504789325967, pvalue=0.9872911317060336)

### Hypothesis RH9
RH: SBERT algorithm generated labels will produce lower GTxM classifier performance results than labels generated by human operators after Intercoder Reliability<br>
H0: SBERT labels will produce the same GTxM classifier performance results as human operators labels<br>
H1: SBERT labels will produce lower GTxM classifier performance results than human operators labels<br>
Test plan: 2-sample t-test of independence

In [50]:
SBERT_F1scores = [70.9,72.38,45.96,76.63,76.47,82.26]
HO_ICR_F1scores = [79.42,76.12,44.37,76.84,81.75,83.49]
# Conduct normal t-Test since the data groups have equal sizes
stats.ttest_ind(a=SBERT_F1scores, b=HO_ICR_F1scores, alternative='less')

Ttest_indResult(statistic=-0.3655119556375282, pvalue=0.3611711079645545)

### Hypothesis RH10
RH: the CCL Intercoder Reliability human operator effort will be less than 2 human operator effort<br>
H0: Human Operator 1 + Human Operator 2 effort will be the same as twice Human Operator 1 effort<br>
H1: Human Operator 1 + Human Operator 2 effort will be less than twice Human Operator 1 effort<br>
Test plan: 2-sample t-test

In [51]:
HO1_effort = [407,425,220,251,122,121]
HO2_effort = [407,425,220,407,425,220]
# Conduct normal t-Test since the data groups have equal sizes
stats.ttest_ind(a=HO1_effort, b=HO2_effort, alternative='less')

Ttest_indResult(statistic=-1.3594754342588802, pvalue=0.10193012928010309)

### Hypothesis RH11
H0: There is no statistical difference in the cosine similarity score of “Social Stories” compared with "other Ground-truth" records <br>
H1: The cosine similarity scores of “Social Stories” differs significantly compared with "other Ground-truth" records<br>
Test plan: using the SBERT results for Pass 4, osine similarity scores for “Social Stories” (A) and "other Ground-truth" (B) SMRs, then perform a Kruskal-Wallis H to confirm the hypothesis.

In [52]:
df_pass4_GTD = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/py38/GTxM/data/GTxM_Pass4/GTxM_Pass4_GTD_New.csv', dtype=str)
len(df_pass4_GTD)

297

In [53]:
df_pass4_GTD.rename(columns={'RecID': 'TID'}, inplace=True)

In [54]:
df_pass4_GTD.head()

Unnamed: 0,TID,Label,Target
0,1151389038781390848,Human Rights,5
1,1177679699369050112,Entertainment,2
2,1179050498428682240,Politics,6
3,1179429715867787264,Human Rights,5
4,1180271642263658497,Politics,6


In [55]:
df_pass4_GTD_SS = df_pass4_GTD[df_pass4_GTD.Label == 'Social Stories']
len(df_pass4_GTD_SS)

27

In [56]:
df_pass4_GTD_Other = df_pass4_GTD[df_pass4_GTD.Label != 'Social Stories']
len(df_pass4_GTD_Other)

270

In [57]:
# Get the SBERT SMR and Cosine Similarity Score result for all the query strings -- QID is irrelevant and not imported
df_pass4_SBERT = pd.read_csv('D:/KOPro/PhD/Implementation/SourceCode/py38/GTxM/data/GTxM_Pass4/CGTsbert_kaggle_pass4.csv', usecols=['QID','TID','Score'], dtype=str)
len(df_pass4_SBERT)

230965

In [58]:
# Remove the self-referenced scores i.e. where the QueryID is the same as the matched TweetID (these are invalid scores)
df_pass4_SBERT = df_pass4_SBERT[df_pass4_SBERT.QID != df_pass4_SBERT.TID]
len(df_pass4_SBERT)

230609

In [59]:
# sort by TID (Asc), Score (Desc)
# df_pass4_SBERT = df_pass4_SBERT.sort_values(['TID','Score'], ascending=[True, False])
# drop duplicates and retain only each unique TID and its top score
# df_pass4_SBERT.drop_duplicates(subset='TID', keep='first', inplace=True)
# len(df_pass4_SBERT)

In [60]:
df_RH11_SStories = pd.merge(df_pass4_SBERT,df_pass4_GTD_SS, on='TID')
len(df_RH11_SStories)

29

In [61]:
df_RH11_SStories.head(20)

Unnamed: 0,QID,TID,Score,Label,Target
0,1218229901184073729,1182104046213185537,0.3426157832145691,Social Stories,10
1,1186729408536731648,1202165763093729281,0.4657312631607055,Social Stories,10
2,1194189805862735872,1218070897925423104,0.3652199506759643,Social Stories,10
3,1188591243657404418,1188591222413234176,0.4717740714550018,Social Stories,10
4,1188591243657404418,1188591222413234176,0.35311359167099,Social Stories,10
5,1188591243657404418,1188591222413234176,0.5234397053718567,Social Stories,10
6,1188591243657404418,1188591222413234176,0.662091076374054,Social Stories,10
7,1215067146386059264,1219304506703392768,0.3980175852775574,Social Stories,10
8,1191616329264041985,1193266932989992960,0.3240981698036194,Social Stories,10
9,1207489389535866881,1189656804386594821,0.4134630858898163,Social Stories,10


In [62]:
df_RH11_GTD_Other = pd.merge(df_pass4_SBERT,df_pass4_GTD_Other, on='TID')
len(df_RH11_GTD_Other)

471

In [63]:
df_RH11_GTD_Other.head(20)

Unnamed: 0,QID,TID,Score,Label,Target
0,1183587236039684096,1190587912817123328,0.3440412878990173,Sports,11
1,1183587236039684096,1211893717990133761,0.3005029559135437,Environmental,3
2,1183587236039684096,1180763964575289345,0.385413259267807,Entertainment,2
3,1183587236039684096,1180763964575289345,0.3061078488826751,Entertainment,2
4,1212015649020686336,1184578713725390852,0.6024719476699829,Politics,6
5,1189625485124354048,1184578713725390852,0.6697288751602173,Politics,6
6,1188591243657404418,1184578713725390852,0.4609363377094269,Politics,6
7,1212015649020686336,1214121238681051136,0.5697686076164246,Politics,6
8,1189539380932808704,1214121238681051136,0.5902812480926514,Politics,6
9,1216719991510327296,1214121238681051136,0.4926089346408844,Politics,6


In [64]:
df_RH11_SStories.to_csv('df_RH11_SStories_NoQID.csv')

In [65]:
df_RH11_GTD_Other.to_csv('df_RH11_GTD_Other_NoQID.csv')

#### Test using Kruskal Wallis H test
The Kruskal-Wallis H test (sometimes also called the "one-way ANOVA on ranks") is a rank-based nonparametric test that can be used to determine if there are statistically significant differences between two or more groups of an independent variable on a continuous or ordinal dependent variable. It is considered the nonparametric alternative to the one-way ANOVA, and an extension of the Mann-Whitney U test to allow the comparison of more than two independent groups.<br>
https://statistics.laerd.com/spss-tutorials/kruskal-wallis-h-test-using-spss-statistics.php

In [66]:
stats.kruskal(df_RH11_SStories['Score'], df_RH11_GTD_Other['Score'], axis=1)

KruskalResult(statistic=7.173019807727805, pvalue=0.0074008091024261695)

#### Test using Ztest from statsmodel

In [67]:
from statsmodels.stats.weightstats import ztest as ztest

In [68]:
A = df_RH11_SStories['Score'].tolist()
B = df_RH11_GTD_Other['Score'].tolist()

In [69]:
A = [float(i) for i in A]
B = [float(i) for i in B]

In [70]:
ztest(A, B, value=0, alternative='smaller')

(-2.393875319947415, 0.008335707416480622)

### Hypothesis RH12
H0: The GTxM classifier prediction performance to be the same with or without "Social Stories"<br>
H1: “Social Stories” will cause the GTxM classifier prediction performance to be lower when included than without it.

In [71]:
# Define data groups
With_SStories = [42.65,52.20,50.06]
Without_SStories = [68.28,69.99,64.55]
# Conduct normal t-Test since the data groups have equal sizes
stats.ttest_ind(a=With_SStories, b=Without_SStories, alternative='less')

Ttest_indResult(statistic=-5.83313914412357, pvalue=0.0021521575633912716)