In [9]:
import pandas as pd
import numpy as np
import os
import string
import re

## Load Data

In [10]:
folder_path = "C:\\Users\\Reen\\Desktop\\web science\\crowdsourced_data\\submissions_fixed_anonymized"
data = []
for root,dirs,files in os.walk(folder_path):
    for file in files:
        filename = folder_path+"\\"+file
        df = pd.read_csv(filename)
        data.append(df)
final_df = pd.concat(data, ignore_index=True)
final_df.head()

Unnamed: 0,Question,Answer URL,Answer Label,Question Rating,Answer Quality,Factual,questionId
0,Does Bioptron VIP (Visible Incoherent Polarize...,https://skeptics.stackexchange.com/questions/5...,na,1.0,2.0,1.0,5017
1,Is Halal meat healthier to consume?,https://skeptics.stackexchange.com/questions/1...,no,2.0,1.0,1.0,18628
2,Does catnip treat headache and coughs and inso...,https://skeptics.stackexchange.com/questions/7...,na,1.0,2.0,1.0,7367
3,Is climate change currently good for the world...,https://skeptics.stackexchange.com/questions/1...,no,2.0,3.0,1.0,18136
4,Does cooking with extra virgin olive oil cause...,https://skeptics.stackexchange.com/questions/1...,no,2.0,2.0,1.0,19180


## Preprocessing

In [11]:
def preprocess_text(df):    

    # remove leading/trailing spaces
    df = df.str.strip()
    
    # convert to lowercase
    df = df.str.lower()
    
    #regex_pat = re.compile(r'http\S+"', flags=re.IGNORECASE)
    #df = df.str.replace(regex_pat,'')
    df = df.replace(to_replace ='http\S+', value = '', regex = True)
    
    # remove punctuation
    translator = str.maketrans('', '', string.punctuation) 
    df = df.str.translate(translator)
    
    # remove non-alphanumeric characters
    df = df.replace(to_replace ='\s*[^A-Za-z0-9]+\s*', value = ' ', regex = True)
    
    # remove digits
    translator = str.maketrans('', '', string.digits) 
    df = df.str.translate(translator)
    
    df = df.str.strip()
    
    return df

In [12]:
# convert to lower case, remove leading/trailing spaces
final_df['Question'] = final_df['Question'].astype(str).str.lower().str.strip()
final_df['Answer Label'] = final_df['Answer Label'].astype(str).str.lower().str.strip()

#### Unique Answer Labels

In [13]:
print(np.unique(final_df['Answer Label']))
final_df['Answer Label'] = final_df['Answer Label'].replace('nan','na')
final_df.head()

['na' 'nan' 'no' 'yes']


Unnamed: 0,Question,Answer URL,Answer Label,Question Rating,Answer Quality,Factual,questionId
0,does bioptron vip (visible incoherent polarize...,https://skeptics.stackexchange.com/questions/5...,na,1.0,2.0,1.0,5017
1,is halal meat healthier to consume?,https://skeptics.stackexchange.com/questions/1...,no,2.0,1.0,1.0,18628
2,does catnip treat headache and coughs and inso...,https://skeptics.stackexchange.com/questions/7...,na,1.0,2.0,1.0,7367
3,is climate change currently good for the world...,https://skeptics.stackexchange.com/questions/1...,no,2.0,3.0,1.0,18136
4,does cooking with extra virgin olive oil cause...,https://skeptics.stackexchange.com/questions/1...,no,2.0,2.0,1.0,19180


## Duplicate Questions

In [14]:
#duplicate questions
questions = final_df['questionId']
print(len(questions))
unique_questions = set(questions)
print(len(unique_questions))

2898
1066


In [15]:
questions = set(final_df['questionId'])
print(len(questions))

1066


In [16]:
df.columns

Index(['Question', 'Answer URL', 'Answer Label', 'Question Rating',
       'Answer Quality', 'Factual', 'questionId'],
      dtype='object')

### Majority Voting and Aggregation

In case of duplicate questions with tied answer labels, ties are broken arbitrarily, i.e.,
either one of the majority answer label is chosen randomly
This is done to prevent inducing bias, in case we fix the label to be chosen in case of ties. Moreover, 
if I annotate these again myself, it again would also induce bias, as I would be trying to shift it towards my own observations 

    

In [17]:
duplicates = final_df.groupby(['questionId'])
#duplicates.apply(lambda grp: grp.sum())

new_df = pd.DataFrame(columns = df.columns)

#break ties, single valued

tied_ans_labels_count = 0
tied_factuality_count = 0

old_count = 0
count = 0
erroneous_ques_ids = []

for group_name,group in duplicates:
    if(len(group)):
        
        
        unique_questions = set(group['Question'])
        if(len(unique_questions)>1):
            old_count+=1
                   
        group['Question'] = preprocess_text(group['Question'])
        unique_questions = set(group['Question'])
        
        if(len(unique_questions)>1):
            count+=1
            print("After preprocessing")
            print(unique_questions)
            print("\n")
            erroneous_ques_ids.append(group['questionId'].iloc[0])
        
        question = group['Question'].iloc[0]
        answer_url = group['Answer URL'].iloc[0]
        questionId = group['questionId'].iloc[0]
        
        answer_labels_ = group['Answer Label'].mode()
        tied_ans_labels_count+=1 #no of questions for which answer labels are tied
        answer_label = answer_labels_.sample(n=1).values[0]

        
        answer_quality_ = group['Answer Quality'].mean()
        answer_quality = answer_quality_
        
        question_ratings_ = group['Question Rating'].mean()
        question_rating = question_ratings_
        
        factual_labels_ = group['Factual'].mode()
        tied_factuality_count +=1 #no of questions which are annotated equally as being factual or non-factual
        factual = factual_labels_.sample(n=1).values[0]
        
        # add to new dataframe, only if labels are factual:
        if(factual==1):
            new_df = new_df.append({'Question':question,'Answer Label': answer_label, 'Factual': factual, 'Question Rating': question_rating, 'Answer Quality': answer_quality,'Answer URL': answer_url,'questionId':questionId},ignore_index=True)


new_df.to_csv("cleaned_data.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


After preprocessing
{'is perfect pitch something youre born with can blood group affect behavior', 'can blood group affect behavior'}


After preprocessing
{'does general relativity bring cosmic time which is the same for all framesofreference', 'does general relativity bring cosmic time'}


After preprocessing
{'does sunflower oil reduce ldl bad cholesterol more than olive oil', 'does one suv tank of ethanol take as much grain as could feed a person for a whole year does sunflower oil reduce ldl bad cholesterol more than olive oil'}




In [18]:
len(new_df)
count

3

In [11]:
print(len(new_df))

964


In [12]:
erroneous_ques_ids

[156, 42915, 44749]

## Inter Annotator Agreement

In [15]:
import krippendorff 

In [16]:
set(final_df['Factual'])

{0.0, 1.0}

In [17]:
question_df = pd.read_csv("cleaned_data.csv")
question_df.head()

Unnamed: 0.1,Unnamed: 0,Question,Answer URL,Answer Label,Question Rating,Answer Quality,Factual,questionId
0,0,does water have a memory as claimed in homeopathy,https://skeptics.stackexchange.com/questions/2#27,no,1.666667,2.333333,1.0,2
1,1,does chamomile help you to relax,https://skeptics.stackexchange.com/questions/3...,na,2.0,2.0,1.0,3
2,2,are there benefits to the eca stack for bodybu...,https://skeptics.stackexchange.com/questions/2...,yes,1.666667,2.0,1.0,22
3,3,can positive thinking provide an improved outc...,https://skeptics.stackexchange.com/questions/2...,yes,2.666667,2.666667,1.0,26
4,4,are vegetables good for me,https://skeptics.stackexchange.com/questions/3...,yes,2.0,2.333333,1.0,32


#### loading user df's

In [29]:
question_df = question_df.astype({'questionId': 'int64'})
question_ids = list(set(question_df['questionId']))
question_ids_dict = {v:k for k,v in enumerate(question_ids)}
label_dict = {'yes':2,'no':1,'na':0}
fact_dict = {1:2, 0:1}

# loading data 
folder_path = "C:\\Users\\Reen\\Desktop\\web science\\crowdsourced_data\\submissions_fixed_anonymized"

user_df_dict = {}

for root,dirs,files in os.walk(folder_path):
    for file in files:
        filename = folder_path+"\\"+file
        df = pd.read_csv(filename)
        df = df.drop(['Answer URL'],axis=1)
        
        #add clean question column
        #df['question'] = df['questionId'].map(questext_id_map)
        
        
        df['Answer Label'] = df['Answer Label'].astype(str).str.lower().str.strip()
        df['Answer Label'] = df['Answer Label'].replace('nan','na')
        
        #df = df.loc[df['questionId'].isin(question_ids)]
        df['Answer Label'] = df['Answer Label'].map(label_dict)
        
        #df['Factual'] = df['Factual'].map(fact_dict)
        
        key = file.replace('.csv','')
        key = int(key.replace('WS',''))
        
        user_df_dict[key] = df

In [32]:
qids = set(final_df['questionId'])
len(qids)

1066

In [33]:
answer_quality_ratings_all = []
question_rating_all = []
factual_all = []
answer_label_all = []

for user in user_df_dict.keys():
    
    #print(user)
    user_df = user_df_dict[user]
    
    #print(user_df.dtypes)
    answer_quality = []
    question_rating = []
    factual = []
    answer_label = []
        
    for qid in qids:
        
        if qid in list(set(user_df['questionId'])):
            answer_quality.append(user_df.loc[user_df['questionId'] == qid, 'Answer Quality'].iloc[0])
            question_rating.append(user_df.loc[user_df['questionId'] == qid,'Question Rating'].iloc[0])
            answer_label.append(user_df.loc[user_df['questionId'] == qid,'Answer Label'].iloc[0])
            factual.append(user_df.loc[user_df['questionId'] == qid,'Factual'].iloc[0])
        else:
            answer_quality.append(np.nan)
            question_rating.append(np.nan)
            answer_label.append(np.nan)
            factual.append(np.nan)
    #print(len([i for i in factual if np.isnan(i)]))
            
    answer_quality_ratings_all.append(answer_quality)
    question_rating_all.append(question_rating)
    factual_all.append(factual)
    answer_label_all.append(answer_label)



966
968
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966
966


In [34]:
krippendorff.alpha(answer_quality_ratings_all)

0.17461897185164588

In [35]:
krippendorff.alpha(question_rating_all)

0.06762494449406009

In [36]:
krippendorff.alpha(factual_all)

0.053187086092715274

In [37]:
krippendorff.alpha(answer_label_all)

0.3603173976259818

In [38]:
all_data = [answer_label_all,question_rating_all,answer_quality_ratings_all,factual_all]

In [39]:
alpha_vals = []
for i in range(len(all_data)):
    alpha_vals.append(krippendorff.alpha(all_data[i]))
    

In [40]:
alpha_vals

[0.3603173976259818,
 0.06762494449406009,
 0.17461897185164588,
 0.053187086092715274]