In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data_path = 'data/labeled_data/inter_observer_agreement/ioa_labeled_2.csv'
ioa_df = pd.read_csv(data_path)
ioa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Document       100 non-null    object
 1   sentiment      98 non-null     object
 2   sentiment_ioa  98 non-null     object
 3   author         40 non-null     object
 4   author_ioa     35 non-null     object
 5   commnents      23 non-null     object
 6   commnents_ioa  2 non-null      object
dtypes: object(7)
memory usage: 5.6+ KB


In [3]:
# lower case all entries and strip white space
ioa_df = ioa_df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)
ioa_df.head(3)

Unnamed: 0,Document,sentiment,sentiment_ioa,author,author_ioa,commnents,commnents_ioa
0,tl dr expect any conversation surrounding aba ...,neutral,neutral,,other professional,opinion,
1,aba doesn t cure autism nor does it pretend to...,neutral,neutral,,,information,
2,thank you guys i m feeling a lot better alread...,neutral,neutral,aba professional,aba professional,suffering burnout,


In [4]:
# replace NaN in author and author_ioa with layperson
ioa_df['author'].fillna('layperson', inplace=True)
ioa_df['author_ioa'].fillna('layperson', inplace=True)
ioa_df.head(3)

Unnamed: 0,Document,sentiment,sentiment_ioa,author,author_ioa,commnents,commnents_ioa
0,tl dr expect any conversation surrounding aba ...,neutral,neutral,layperson,other professional,opinion,
1,aba doesn t cure autism nor does it pretend to...,neutral,neutral,layperson,layperson,information,
2,thank you guys i m feeling a lot better alread...,neutral,neutral,aba professional,aba professional,suffering burnout,


In [5]:
# inspect comments rows that are not null
ioa_df[(ioa_df['commnents'].notnull()) | (ioa_df['commnents_ioa'].notnull())]

# with regards to the relevance od posts to ABA the 2 coders are in complete agreement as they both found the same 2 entries to not be about aba
# these entries will be marked as 'not aba' for borth sentiment and author

Unnamed: 0,Document,sentiment,sentiment_ioa,author,author_ioa,commnents,commnents_ioa
0,tl dr expect any conversation surrounding aba ...,neutral,neutral,layperson,other professional,opinion,
1,aba doesn t cure autism nor does it pretend to...,neutral,neutral,layperson,layperson,information,
2,thank you guys i m feeling a lot better alread...,neutral,neutral,aba professional,aba professional,suffering burnout,
5,idk since that happened if it really did happe...,positive,neutral,layperson,layperson,defending aba practice,
27,a tragic case with a mentally ill mother using...,,,layperson,layperson,not aba,not aba
28,entering aba field hi everyone i'm new to this...,neutral,neutral,aba student,layperson,seeking information about aba,
29,i ve seen a lot of adults who used aba as chil...,neutral,neutral,layperson,parent,opinion,
32,it s difficult to navigate here you ll find pe...,neutral,neutral,parent,parent,offering opinions about aba,
35,they aren t even using old school aba techniqu...,neutral,neutral,layperson,layperson,information,
44,in the last some horrific things have been don...,neutral,neutral,layperson,layperson,information,


In [6]:
# sentiment has NaN only in those places where posts are not about ABA
# replace NaN for sentiment columns with 'not aba'
ioa_df['sentiment'].fillna('not_aba', inplace=True)
ioa_df['sentiment_ioa'].fillna('not_aba', inplace=True)
ioa_df.loc[ioa_df['sentiment'] == 'not_aba']

Unnamed: 0,Document,sentiment,sentiment_ioa,author,author_ioa,commnents,commnents_ioa
27,a tragic case with a mentally ill mother using...,not_aba,not_aba,layperson,layperson,not aba,not aba
74,i saw bcp plan and the only think i could thin...,not_aba,not_aba,layperson,layperson,not aba,not aba


In [7]:
# replace the 'layperson' values in author where sentiment is 'not_aba' , with 'not_aba'
ioa_df.loc[ioa_df['sentiment'] == 'not_aba'] = 'not_aba'
ioa_df.loc[ioa_df['sentiment'] == 'not_aba']

Unnamed: 0,Document,sentiment,sentiment_ioa,author,author_ioa,commnents,commnents_ioa
27,not_aba,not_aba,not_aba,not_aba,not_aba,not_aba,not_aba
74,not_aba,not_aba,not_aba,not_aba,not_aba,not_aba,not_aba


In [8]:
print(ioa_df['sentiment'].unique())
print(ioa_df['sentiment_ioa'].unique())

['neutral' 'positive' 'negative' 'not_aba' 'neutral?']
['neutral' 'pozitive' 'negative' 'not_aba' 'positive']


In [9]:
# correct misspelings
ioa_df['sentiment_ioa'].replace('pozitive', 'positive', inplace=True)
ioa_df['sentiment'].replace('neutral?', 'neutral', inplace=True)
print(ioa_df['sentiment'].unique())
print(ioa_df['sentiment_ioa'].unique())

['neutral' 'positive' 'negative' 'not_aba']
['neutral' 'positive' 'negative' 'not_aba']


In [10]:
ioa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Document       100 non-null    object
 1   sentiment      100 non-null    object
 2   sentiment_ioa  100 non-null    object
 3   author         100 non-null    object
 4   author_ioa     100 non-null    object
 5   commnents      23 non-null     object
 6   commnents_ioa  2 non-null      object
dtypes: object(7)
memory usage: 5.6+ KB


In [11]:
ioa_df[['sentiment', 'sentiment_ioa']].value_counts(ascending = True)

sentiment  sentiment_ioa
negative   neutral           1
not_aba    not_aba           2
neutral    negative          3
positive   neutral           3
neutral    positive          5
positive   positive          8
negative   negative         11
neutral    neutral          67
Name: count, dtype: int64

In [12]:
from sklearn.metrics import cohen_kappa_score
coder1 = ioa_df['sentiment'].tolist()
coder2 = ioa_df['sentiment_ioa'].tolist()
kappa_sentiment = cohen_kappa_score(coder1, coder2)
print(f"Cohen's kappa (IRR): {kappa_sentiment}")

Cohen's kappa (IRR): 0.7247706422018348


In [13]:
print(ioa_df['author'].unique())
print(ioa_df['author_ioa'].unique())

['layperson' 'aba professional' 'professional' 'parent' 'asd person'
 'not_aba' 'aba student' 'aba professional with asd' 'slp'
 'parent with asd child']
['other professional' 'layperson' 'aba professional'
 'other professional/parent' 'parent' 'asd' 'not_aba'
 'aba professional/asd']


In [14]:
# correct misspelings/ make labels uniform
ioa_df['author'].replace('aba student', 'layperson', inplace=True)
ioa_df['author'].replace('asd person', 'asd', inplace=True)
ioa_df['author'].replace('professional', 'other professional', inplace=True)
ioa_df['author'].replace('slp', 'other professional', inplace=True)
ioa_df['author'].replace('aba professional with asd', 'aba professional/asd', inplace=True)
ioa_df['author'].replace('parent with asd child', 'parent/asd', inplace=True)
print(sorted(ioa_df['author'].unique()))
print(sorted(ioa_df['author_ioa'].unique()))

['aba professional', 'aba professional/asd', 'asd', 'layperson', 'not_aba', 'other professional', 'parent', 'parent/asd']
['aba professional', 'aba professional/asd', 'asd', 'layperson', 'not_aba', 'other professional', 'other professional/parent', 'parent']


In [15]:
ioa_df[['author', 'author_ioa']].value_counts()

author                author_ioa               
layperson             layperson                    57
aba professional      aba professional             18
parent                parent                        8
aba professional      layperson                     4
asd                   asd                           2
not_aba               not_aba                       2
parent                layperson                     2
aba professional      other professional/parent     1
aba professional/asd  aba professional/asd          1
layperson             other professional            1
                      parent                        1
other professional    aba professional              1
                      other professional            1
parent/asd            parent                        1
Name: count, dtype: int64

In [16]:
from sklearn.metrics import cohen_kappa_score
coder1 = ioa_df['author'].tolist()
coder2 = ioa_df['author_ioa'].tolist()
kappa_author = cohen_kappa_score(coder1, coder2)
print(f"Cohen's kappa (IRR): {kappa_author}")

Cohen's kappa (IRR): 0.8081283795569509
