In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Load word vectors from a binary word2vec-format file.
model = KeyedVectors.load_word2vec_format('word_embeddings/model.bin', binary=True)

In [3]:
# Crawler outputs. Presumably procon.json holds the pro/con arguments and
# wiki.json the wiki articles -- TODO confirm against the crawler.
args_json = pd.read_json("../crawlers/procon/procon.json")
wiki_json = pd.read_json("../crawlers/procon/wiki.json")

#### Format DataFrame rows

In [4]:
def format_args_row(row):
    """Expand one record of the arguments JSON into a multi-row DataFrame.

    Every field of ``row`` is turned into a column. Fields holding several
    values produce one output row per value; the ``url`` and ``topic``
    columns are overwritten with their first value so that it is repeated
    on every output row.

    Parameters
    ----------
    row : pd.Series
        One record whose index contains at least ``url`` and ``topic``.

    Returns
    -------
    pd.DataFrame
        One column per field of ``row``.
    """
    expanded = row.apply(pd.Series).T
    # After the transpose only row 0 carries the per-record values; repeat
    # them down the whole column.
    for shared_col in ("url", "topic"):
        expanded[shared_col] = expanded[shared_col][0]
    return expanded

# Expand each JSON record into its own DataFrame, then stack all of them
# into a single frame with a fresh 0..n-1 index.
arg_df_series = args_json.apply(format_args_row, axis=1)
args_df = pd.concat(list(arg_df_series), ignore_index=True)

In [5]:
def format_wiki_row(row):
    """Expand one record of the wiki JSON into a multi-row DataFrame.

    Every field of ``row`` is turned into a column. Fields holding several
    values produce one output row per value; the ``source``, ``title``,
    ``topic`` and ``stance`` columns are overwritten with their first value
    so that it is repeated on every output row.

    Parameters
    ----------
    row : pd.Series
        One record whose index contains at least ``source``, ``title``,
        ``topic`` and ``stance``.

    Returns
    -------
    pd.DataFrame
        One column per field of ``row``.
    """
    expanded = row.apply(pd.Series).T
    # After the transpose only row 0 carries the per-record values; repeat
    # them down the whole column.
    for shared_col in ("source", "title", "topic", "stance"):
        expanded[shared_col] = expanded[shared_col][0]
    return expanded

# Expand each JSON record into its own DataFrame, then stack all of them
# into a single frame with a fresh 0..n-1 index.
wiki_df_series = wiki_json.apply(format_wiki_row, axis=1)
wiki_df = pd.concat(list(wiki_df_series), ignore_index=True)

In [28]:
# Display the flattened wiki frame (the rendered view elides the middle rows).
wiki_df

Unnamed: 0,source,topic,stance,title,content
0,conservapedia,Death Penalty,Pro,Capital punishment,"Capital punishment, also known as the death pe..."
1,conservapedia,Death Penalty,Pro,Capital punishment,There are several compelling justifications fo...
2,conservapedia,Death Penalty,Pro,Capital punishment,Opposition to the death penalty ignores its fo...
3,conservapedia,Death Penalty,Pro,Capital punishment,Although support for the death penalty has lar...
4,conservapedia,Death Penalty,Pro,Capital punishment,Contents
...,...,...,...,...,...
4694,conservapedia,Iraq War,Pro,Iraq War,America destroyed Iraq but the war crimes rema...
4695,conservapedia,Iraq War,Pro,Iraq War,"""Despite the failures that occurred in the tim..."
4696,conservapedia,Iraq War,Pro,Iraq War,
4697,conservapedia,Iraq War,Pro,Iraq War,Her dissatisfaction with politicians is only n...


### Preprocess Text

#### Create dataframe which will store the data

In [6]:
# No `on=` is given, so pandas joins on every column name the two frames
# share (inner join by default). Presumably that is only `topic`, pairing
# each argument with every paragraph of the same topic -- TODO confirm.
wmd_df = args_df.merge(wiki_df)

#### Lowercase and remove Stopwords

In [8]:
from nltk.corpus import stopwords
from nltk import download
# Fetch the NLTK stop-word corpus into the local nltk_data directory.
download('stopwords') 

# English stop-word list, read by preprocess_sentence as a module-level global.
stop_words = stopwords.words('english')

def preprocess_sentence(str_array, stopword_list=None):
    """Lowercase each text and drop its stop words.

    Tokens are obtained by splitting on whitespace. Each text is lowercased
    *before* the stop-word check, so capitalised stop words ("The", "We")
    are removed as well.

    Parameters
    ----------
    str_array : iterable
        Texts to clean. Entries that are not strings (e.g. None or NaN) are
        treated as empty text.
    stopword_list : iterable of str, optional
        Words to remove. Defaults to the module-level ``stop_words``.

    Returns
    -------
    tuple of (list of str, list of int)
        The cleaned texts (remaining tokens joined by single spaces) and,
        for each text, the number of tokens that remained.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Built once outside the loop: set membership is O(1), and lowercasing
    # the stop words keeps the comparison case-insensitive on both sides.
    stopword_set = {stopword.lower() for stopword in stopword_list}

    new_sentences = []
    sentences_size = []
    for text in str_array:
        if not isinstance(text, str):
            # Missing content has no tokens; it gets size 0 instead of
            # raising on .split().
            text = ""
        kept_words = [word for word in text.lower().split()
                      if word not in stopword_set]
        sentences_size.append(len(kept_words))
        new_sentences.append(" ".join(kept_words))

    return (new_sentences, sentences_size)

[nltk_data] Downloading package stopwords to /home/allan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# preprocess_sentence returns (cleaned_texts, token_counts); each pair is
# unpacked into a text column and a size column.
wmd_df['argument_processed'], wmd_df['argument_processed_size']  = preprocess_sentence(wmd_df.argument)
wmd_df['paragraph_processed'], wmd_df['paragraph_processed_size'] = preprocess_sentence(wmd_df.content)

In [10]:
#wmd_df = wmd_df.assign(argument_processed = preprocess_sentence(wmd_df.argument),
#                      content_processed = preprocess_sentence(wmd_df.content))

#### Add argument orientation  

In [11]:
def orientation(pro_con):
    """Map an argument label to its side.

    Returns "Pro" when the label contains the substring "Pro" and "Con"
    for every other label.
    """
    return "Pro" if "Pro" in pro_con else "Con"

# Only the pro_con column is needed, so map over that Series instead of
# building a row object for every row with DataFrame.apply(axis=1). This also
# yields an empty Series (not an empty DataFrame) when wmd_df has no rows.
wmd_df = wmd_df.assign(arg_orient=wmd_df.pro_con.map(orientation))

#### Remove rows in which either paragraph_processed or argument_processed is too short

Our goal is to compute the similarity between an idea (a paragraph) and an argument. However, some rows contain only a few words (sometimes just one), which is not enough to express an idea. We therefore define a minimum size, measured in words remaining after stop-word removal, and apply it to both paragraphs and arguments.

In [12]:
paragraph_min_size = 5

# Keep a row only when both its paragraph and its argument have strictly more
# than `paragraph_min_size` tokens left after preprocessing.
wmd_df = wmd_df[
    (wmd_df.paragraph_processed_size > paragraph_min_size)
    & (wmd_df.argument_processed_size > paragraph_min_size)
]

#### Compute the WMD between the arguments and paragraphs

In [13]:
#from tqdm import tqdm
#tqdm.pandas()

#model.init_sims(replace=True)

#wmd_df['wmd'] = wmd_df.progress_apply(lambda x: model.wmdistance(x.argument_processed.split(), x.paragraph_processed.split()), axis=1)

#### Similarity computation 

In [14]:
#wmd_df = wmd_df.assign(wmd_sim = (wmd_df.wmd - wmd_df.wmd.min()) / (wmd_df.wmd.max() - wmd_df.wmd.min()))

In [15]:
#wmd_df = wmd_df.rename(columns={"wmd_sim":"wmd_norm", "content":"paragraph", "content_processed":"paragraph_processed", "content_processed_size":"paragraph_processed_size"})

#paragraph_min_size = 15
#wmd_df = wmd_df.query("paragraph_processed_size > " + str(paragraph_min_size)).query("argument_processed_size > " + str(paragraph_min_size))

In [16]:
# The distance computation is commented out in this notebook; the previously
# saved result is loaded from disk instead, replacing the wmd_df built so far.
#wmd_df.to_csv('wmd_distances.csv', index = None, header=True)
wmd_df = pd.read_csv("wmd_distances.csv")
# Rename to the column names used by the analysis cells.
wmd_df = wmd_df.rename(columns={"wmd_sim":"wmd_norm", "stance":"source_stance"})

# Exclude the "Gay Marriage" topic from the analysis.
wmd_df = wmd_df[wmd_df.topic != "Gay Marriage"]

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
# Preview the first rows of the loaded distance table.
wmd_df.head()

Unnamed: 0,url,topic,pro_con,argument_topic,argument,argument_summary,source,source_stance,title,paragraph,argument_processed,argument_processed_size,paragraph_processed,paragraph_processed_size,arg_orient,wmd,wmd_norm
0,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Con,Euthanasia,Euthanasia Society lists Margaret Sanger as a ...,the right die matter personal choice. we able ...,62,euthanasia society lists margaret sanger member,6,Pro,1.14477,0.816794
1,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Con,Euthanasia,Euthanasia (from the Greek for good death) is ...,the right die matter personal choice. we able ...,62,euthanasia (from greek good death) politically...,39,Pro,0.923911,0.509788
2,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Con,Euthanasia,Most libertarians and many liberals support le...,the right die matter personal choice. we able ...,62,most libertarians many liberals support legali...,126,Pro,0.897518,0.473101
3,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Con,Euthanasia,The widely reported withdrawal of a feeding tu...,the right die matter personal choice. we able ...,62,the widely reported withdrawal feeding tube di...,41,Pro,0.949271,0.545041
4,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Con,Euthanasia,Of recent concern are erroneous reports (July ...,the right die matter personal choice. we able ...,62,of recent concern erroneous reports (july 2009...,48,Pro,1.005283,0.6229


#### Most similar (least distant) argument to each paragraph, by document

In [17]:
# Smallest wmd_norm per paragraph; the aggregated column is named "min".
most_sim_arg2paragraph = wmd_df.groupby(['paragraph']).wmd_norm.agg(['min']).reset_index()
# Join back on (paragraph, value) to recover the full row(s) that reach that
# minimum. Ties keep every matching row; exact duplicates are then dropped.
most_sim_arg2paragraph = most_sim_arg2paragraph.merge(wmd_df, left_on=["paragraph","min"], right_on=["paragraph","wmd_norm"]).drop_duplicates()

##### Number of paragraphs, by source and topic, most related to each orientation's argument 

In [18]:
# Count the closest-argument rows per (topic, source, argument orientation).
(
    most_sim_arg2paragraph
    .groupby(["topic", "source", "arg_orient"])
    .size()
    .reset_index(name='count')
)

Unnamed: 0,topic,source,arg_orient,count
0,Abortion,conservapedia,Con,110
1,Abortion,conservapedia,Pro,87
2,Abortion,rationalwiki,Con,15
3,Abortion,rationalwiki,Pro,24
4,Climate Change,conservapedia,Con,5
5,Climate Change,conservapedia,Pro,24
6,Climate Change,rationalwiki,Con,60
7,Climate Change,rationalwiki,Pro,122
8,Corporal Punishment,conservapedia,Con,6
9,Corporal Punishment,conservapedia,Pro,3


#### Most similar (least distant) paragraph to each argument

In [19]:
# Smallest wmd_norm per argument; the aggregated column is named "min".
most_sim_paragraph2arg = wmd_df.groupby(['argument']).wmd_norm.agg(['min']).reset_index()
# Join back on (argument, value) to recover the full row(s) that reach that
# minimum. Ties keep every matching row; exact duplicates are then dropped.
most_sim_paragraph2arg = most_sim_paragraph2arg.merge(wmd_df, left_on=["argument","min"], right_on=["argument","wmd_norm"]).drop_duplicates()
#most_sim_paragraph2arg = most_sim_paragraph2arg.rename(columns={"wmd_sim":"wmd_norm"})

In [20]:
# Replace the row-level table with its counts per (topic, source, argument
# orientation, source stance). The name is reused, so running this cell a
# second time would count the already-aggregated rows.
most_sim_paragraph2arg = (
    most_sim_paragraph2arg
    .groupby(["topic", "source", "arg_orient", "source_stance"])
    .size()
    .reset_index(name='count')
)
most_sim_paragraph2arg




Unnamed: 0,topic,source,arg_orient,source_stance,count
0,Abortion,conservapedia,Con,Con,15
1,Abortion,conservapedia,Pro,Con,14
2,Abortion,rationalwiki,Con,Pro,1
3,Abortion,rationalwiki,Pro,Pro,1
4,Climate Change,conservapedia,Con,Con-,5
5,Climate Change,conservapedia,Pro,Con-,2
6,Climate Change,rationalwiki,Con,Pro-,8
7,Climate Change,rationalwiki,Pro,Pro-,12
8,Corporal Punishment,rationalwiki,Con,Con,3
9,Corporal Punishment,rationalwiki,Pro,Con,3


### WEAT for documents (paragraph level, infer about the document's stance) 

In [21]:
# mean distance of each paragraph to each attribute (argument - con/pro) set
sim_paragraph_args = wmd_df.groupby(["paragraph","topic","title","source","arg_orient","source_stance"]).wmd_norm.agg(['mean']).reset_index()
sim_paragraph_args = sim_paragraph_args.rename(columns={"mean":"mean_dist"})

# difference of distance of each paragraph to the attribute sets
con_args = sim_paragraph_args[sim_paragraph_args.arg_orient == "Con"]
pro_args = sim_paragraph_args[sim_paragraph_args.arg_orient == "Pro"]
sim_paragraph_args = pro_args.merge(con_args, on=["paragraph","topic","title","source","source_stance"]).drop(columns=["arg_orient_x", "arg_orient_y"])
sim_paragraph_args = sim_paragraph_args.rename(columns={"mean_dist_x":"pro_dist","mean_dist_y":"con_dist"})
sim_paragraph_args["diff_dist"] = sim_paragraph_args.pro_dist - sim_paragraph_args.con_dist

In [22]:
# Per-source slices of the paragraph-level table for the "Abortion" topic.
cons_art = sim_paragraph_args.query('source == "conservapedia" and topic == "Abortion"')
ratio_art = sim_paragraph_args.query('source == "rationalwiki" and topic == "Abortion"')

In [23]:
from mlxtend.evaluate import permutation_test

# Sources are selected by name. Picking them by position in `unique()` depends
# on row order and can silently swap the two groups.
CONSERVAPEDIA_SOURCE = "conservapedia"
RATIONALWIKI_SOURCE = "rationalwiki"
# Fixed seed so the approximate (sampled) permutation test gives the same
# p-values on every run.
PERMUTATION_SEED = 42

sources = sim_paragraph_args.source.unique()
topics = sim_paragraph_args.topic.unique()

significance_test_array = []

for topic in topics:
    topic_rows = sim_paragraph_args[sim_paragraph_args.topic == topic]
    conservapedia = topic_rows[topic_rows.source == CONSERVAPEDIA_SOURCE]
    ratiowiki = topic_rows[topic_rows.source == RATIONALWIKI_SOURCE]

    # A topic covered by only one source cannot be compared, and `.iloc[0]`
    # below would raise on the empty group.
    if conservapedia.empty or ratiowiki.empty:
        continue

    # p-value for the difference between the two groups of diff_dist values.
    p_value = permutation_test(conservapedia.diff_dist, ratiowiki.diff_dist,
                               method='approximate', num_rounds=1000,
                               seed=PERMUTATION_SEED)

    cons_mean_diff = conservapedia.diff_dist.mean()
    rwiki_mean_diff = ratiowiki.diff_dist.mean()

    # diff_dist is pro_dist - con_dist, so the source with the larger mean is
    # labelled as the Con-leaning one.
    interpret = "cons-pro x rwiki-con"
    if cons_mean_diff > rwiki_mean_diff:
        interpret = "cons-con x rwiki-pro"

    # Stance labels recorded in the data, taken from the first row of each group.
    groundtruth = "cons-" + conservapedia.source_stance.iloc[0] + " x ratiowiki-" + ratiowiki.source_stance.iloc[0]

    significance_test_array.append([topic, p_value, interpret, groundtruth, cons_mean_diff, rwiki_mean_diff])

significance_test_df = pd.DataFrame(significance_test_array, columns=['topic', 'p_value', 'res_interpretation', 'ground_truth','cons_mean_dif', 'rwiki_mean_dif'])

In [24]:
# One row per topic: p-value, inferred labels, recorded stances and group means.
significance_test_df

Unnamed: 0,topic,p_value,res_interpretation,ground_truth,cons_mean_dif,rwiki_mean_dif
0,Health Care Form,0.405,cons-con x rwiki-pro,cons-Con x ratiowiki-Pro*,-0.009929,-0.012056
1,Prostitution,0.015,cons-con x rwiki-pro,cons-Con x ratiowiki-Pro*,0.015388,-0.001645
2,Iraq War,0.001,cons-pro x rwiki-con,cons-Pro x ratiowiki-Con,-0.006217,0.00085
3,Gun Control,0.463,cons-con x rwiki-pro,cons-Con x ratiowiki-Pro,0.02603,0.020387
4,Abortion,0.205,cons-pro x rwiki-con,cons-Con x ratiowiki-Pro,-0.009452,-0.005178
5,Death Penalty,0.047,cons-pro x rwiki-con,cons-Pro x ratiowiki-Con,0.0027,0.007061
6,Minimum Wage,0.514,cons-pro x rwiki-con,cons-Con x ratiowiki-Pro*,-0.009455,-0.005851
7,Climate Change,0.033,cons-con x rwiki-pro,cons-Con- x ratiowiki-Pro-,0.002936,-0.004877
8,Marijuana,0.0,cons-pro x rwiki-con,cons-Con x ratiowiki-Pro,0.011054,0.032291
9,Euthanasia,0.559,cons-pro x rwiki-con,cons-Con x ratiowiki-Pro*,0.013388,0.016243


### WEAT for documents (document level, infer about the source's stance)

###### Toy example 

In [None]:
# Toy source "con": 2 paragraphs (pa1, pa2) x 5 arguments (c1, c2, p1, p2, p3).
fake_data = {'content':["pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2"],
             'pro_con':["c1","c1","c2","c2","p1","p1","p2","p2","p3","p3"],
             'source':["con","con","con","con","con","con","con","con","con","con"],
             'topic':["A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

# Toy source "rat": 3 paragraphs (pa1, pa2, pa3) x 5 arguments.
fake_data_2 = {'content':["pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3"],
             'pro_con':["c1","c1","c1","c2","c2","c2","p1","p1","p1","p2","p2","p2","p3","p3","p3"],
             'source':["rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat"],
             'topic':["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

# Both toy sources in one table (10 "con" rows followed by 15 "rat" rows),
# with pro_con written as "Con n" / "Pro n".
fake_data_3 = {'content':["pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3"],
             'pro_con':["Con 1","Con 1","Con 2","Con 2","Pro 1","Pro 1","Pro 2","Pro 2","Pro 3","Pro 3","Con 1","Con 1","Con 1","Con 2","Con 2","Con 2","Pro 1","Pro 1","Pro 1","Pro 2","Pro 2","Pro 2","Pro 3","Pro 3","Pro 3"],
             'source':["con","con","con","con","con","con","con","con","con","con","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat"],
             'topic':["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6,0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

# Only the combined table is turned into a DataFrame here.
fake_wmd_df = pd.DataFrame(fake_data_3) 