In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Load the word vectors stored in binary word2vec format as KeyedVectors.
embeddings_path = 'word_embeddings/model.bin'
model = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

In [55]:
# Raw crawler dumps: wiki articles and the argument collection, one JSON file each.
wiki_json_path = "../crawlers/procon/wiki.json"
args_json_path = "../crawlers/procon/procon.json"

wiki_json = pd.read_json(wiki_json_path)
args_json = pd.read_json(args_json_path)

In [56]:
# Preview the first rows only instead of rendering the whole frame.
wiki_json.head(10)

Unnamed: 0,source,topic,title,content
0,conservapedia,Death Penalty,Capital punishment,"[Capital punishment, also known as the death p..."
1,conservapedia,Gun Control,Gun,[The gun is a mechanical weapon which uses exp...
2,conservapedia,Climate Change,Climate change,[3675 Days Until Final Destruction of the Plan...
3,conservapedia,Marijuana,Marijuana,"[Indoor Marijuana Grow (DEA photo), Marijuana ..."
4,conservapedia,Net Neutrality,Network neutrality,[Network neutrality (more frequently referred ...
5,conservapedia,ACLU,ACLU,"[Current logo of the ACLU., The American Civil..."
6,conservapedia,Abortion,Abortion,[An unborn child in the womb. Liberals believe...
7,conservapedia,Euthanasia,Euthanasia,[Euthanasia Society lists Margaret Sanger as a...
8,conservapedia,Corporal Punishment,Corporal punishment,[Corporal punishment is the infliction of phys...
9,conservapedia,GMOs,Genetically modified organism,[A genetically modified organism (GMO) is an o...


#### Format DataFrame rows

In [4]:
def format_args_row(row):
    """Expand one row of the arguments JSON into a frame with one line per list entry.

    Every field of ``row`` is turned into a Series and the result is
    transposed, so list-valued fields spread over several lines. The ``url``
    and ``topic`` fields only fill the first line after that expansion, so
    their first value is copied to every line.
    """
    expanded = row.apply(pd.Series).transpose()
    for scalar_col in ("url", "topic"):
        # Label 0 is the first expanded line; broadcast its value down the column.
        expanded[scalar_col] = expanded.loc[0, scalar_col]
    return expanded

# Expand every JSON row into its own frame, then stack the frames into one table
# with a fresh 0..n-1 index.
arg_df_series = args_json.apply(format_args_row, axis=1)
args_df = pd.concat(arg_df_series.tolist(), ignore_index=True)

In [5]:
def format_wiki_row(row):
    """Expand one row of the wiki JSON into a frame with one line per list entry.

    Every field of ``row`` is turned into a Series and the result is
    transposed, so list-valued fields spread over several lines. The
    ``source``, ``title`` and ``topic`` fields only fill the first line after
    that expansion, so their first value is copied to every line.
    """
    expanded = row.apply(pd.Series).transpose()
    for scalar_col in ("source", "title", "topic"):
        # Label 0 is the first expanded line; broadcast its value down the column.
        expanded[scalar_col] = expanded.loc[0, scalar_col]
    return expanded

# Expand every JSON row into its own frame, then stack the frames into one table
# with a fresh 0..n-1 index.
wiki_df_series = wiki_json.apply(format_wiki_row, axis=1)
wiki_df = pd.concat(wiki_df_series.tolist(), ignore_index=True)

### Preprocess Text

#### Create dataframe which will store the data

In [6]:
# Pair arguments with wiki rows; with no `on` given, pandas joins on the
# columns the two frames have in common.
wmd_df = pd.merge(args_df, wiki_df)

In [7]:
#wmd_df = wmd_df.head()

#### Lowercase and remove Stopwords

In [8]:
from nltk.corpus import stopwords
from nltk import download
# Make sure the NLTK stopword corpus is available before it is read below.
download('stopwords') 

# English stopword list consulted by preprocess_sentence.
stop_words = stopwords.words('english')

def preprocess_sentence(str_array, stopword_list=None):
    """Lowercase every text and drop its stopwords.

    Each word is lowercased *before* the stopword lookup, so capitalised
    stopwords ("The", "And", ...) are removed too.

    Parameters
    ----------
    str_array : iterable of str
        Texts to clean; each one is split on whitespace.
    stopword_list : iterable of str, optional
        Lowercase words to discard. When omitted, the module-level
        ``stop_words`` list is used.

    Returns
    -------
    tuple of (list of str, list of int)
        The cleaned texts (kept words joined by single spaces) and, in the
        same order, the number of words kept in each text.
    """
    # A set gives constant-time membership tests instead of scanning a list per word.
    stopword_set = set(stop_words if stopword_list is None else stopword_list)

    new_sentences = []
    sentences_size = []
    for text in str_array:
        kept_words = [
            word
            for word in (token.lower() for token in text.split())
            if word not in stopword_set
        ]
        sentences_size.append(len(kept_words))
        new_sentences.append(" ".join(kept_words))

    return (new_sentences, sentences_size)

[nltk_data] Downloading package stopwords to /home/allan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Clean both text columns; each call yields the cleaned texts and their word counts.
argument_texts, argument_sizes = preprocess_sentence(wmd_df.argument)
paragraph_texts, paragraph_sizes = preprocess_sentence(wmd_df.content)

wmd_df['argument_processed'] = argument_texts
wmd_df['argument_processed_size'] = argument_sizes
wmd_df['paragraph_processed'] = paragraph_texts
wmd_df['paragraph_processed_size'] = paragraph_sizes

In [10]:
#wmd_df = wmd_df.assign(argument_processed = preprocess_sentence(wmd_df.argument),
#                      content_processed = preprocess_sentence(wmd_df.content))

#### Add argument orientation  

In [11]:
def orientation(pro_con):
    """Return "Pro" when the label contains the substring "Pro", otherwise "Con"."""
    return "Pro" if "Pro" in pro_con else "Con"

# Only the pro_con column is needed, so map over that Series directly instead
# of building a full row object for every line with apply(axis=1).
wmd_df = wmd_df.assign(arg_orient=wmd_df["pro_con"].map(orientation))

#### Remove rows in which either paragraph_processed or argument_processed is too short

Our goal is to compute the similarity between an idea (a paragraph) and an argument. However, some rows contain only a few words (sometimes a single word), which is not enough to express an idea. We therefore define a minimum size, measured in words remaining after stopword removal, and keep only the rows whose paragraph and argument both exceed it.

In [12]:
paragraph_min_size = 5

# Keep a row only when both its paragraph and its argument have more than
# paragraph_min_size processed words.
long_enough = (
    (wmd_df.paragraph_processed_size > paragraph_min_size)
    & (wmd_df.argument_processed_size > paragraph_min_size)
)
wmd_df = wmd_df[long_enough]

#### Compute the WMD between the arguments and paragraphs

In [13]:
#from tqdm import tqdm
#tqdm.pandas()

#model.init_sims(replace=True)

#wmd_df['wmd'] = wmd_df.progress_apply(lambda x: model.wmdistance(x.argument_processed.split(), x.content_processed.split()), axis=1)

#### Similarity computation 

In [14]:
#wmd_df = wmd_df.assign(wmd_sim = (wmd_df.wmd - wmd_df.wmd.min()) / (wmd_df.wmd.max() - wmd_df.wmd.min()))

In [53]:
#wmd_df.to_csv ('wmd_distances.csv', index = None, header=True)
# Reload the saved distances and switch the content* column names to paragraph*.
column_renames = {
    "wmd_sim": "wmd_norm",
    "content": "paragraph",
    "content_processed": "paragraph_processed",
    "content_processed_size": "paragraph_processed_size",
}
wmd_df = pd.read_csv("wmd_distances.csv").rename(columns=column_renames)

#paragraph_min_size = 15
#wmd_df = wmd_df.query("paragraph_processed_size > " + str(paragraph_min_size)).query("argument_processed_size > " + str(paragraph_min_size))

#### Most similar (least distant) argument to each paragraph by document 

In [16]:
# Smallest normalised distance reached by each paragraph ...
most_sim_arg2paragraph = wmd_df.groupby('paragraph').wmd_norm.min().reset_index(name='min')
# ... joined back to the full rows that attain it, minus exact duplicate rows.
most_sim_arg2paragraph = pd.merge(
    most_sim_arg2paragraph,
    wmd_df,
    left_on=["paragraph", "min"],
    right_on=["paragraph", "wmd_norm"],
).drop_duplicates()

##### Number of paragraphs, by source and topic, most related to each orientation's argument 

In [17]:
# Rows per (topic, source, orientation) among the closest-argument matches.
most_sim_arg2paragraph.groupby(["topic", "source", "arg_orient"]).size().reset_index(name='count')

Unnamed: 0,topic,source,arg_orient,count
0,Abortion,conservapedia,Con,110
1,Abortion,conservapedia,Pro,87
2,Abortion,rationalwiki,Con,15
3,Abortion,rationalwiki,Pro,24
4,Climate Change,conservapedia,Con,5
...,...,...,...,...
56,School Vouchers,rationalwiki,Pro,8
57,Video Games,conservapedia,Con,44
58,Video Games,conservapedia,Pro,15
59,Video Games,rationalwiki,Con,51


#### Most similar (least distant) paragraph to each argument 

In [18]:
# Smallest normalised distance reached by each argument ...
most_sim_paragraph2arg = wmd_df.groupby('argument').wmd_norm.min().reset_index(name='min')
# ... joined back to the full rows that attain it, minus exact duplicate rows.
most_sim_paragraph2arg = pd.merge(
    most_sim_paragraph2arg,
    wmd_df,
    left_on=["argument", "min"],
    right_on=["argument", "wmd_norm"],
).drop_duplicates()

In [19]:
# Rows per (topic, source, orientation) among the closest-paragraph matches.
most_sim_paragraph2arg.groupby(["topic", "source", "arg_orient"]).size().reset_index(name='count')

Unnamed: 0,topic,source,arg_orient,count
0,Abortion,conservapedia,Con,15
1,Abortion,conservapedia,Pro,14
2,Abortion,rationalwiki,Con,1
3,Abortion,rationalwiki,Pro,1
4,Climate Change,conservapedia,Con,5
5,Climate Change,conservapedia,Pro,2
6,Climate Change,rationalwiki,Con,8
7,Climate Change,rationalwiki,Pro,12
8,Corporal Punishment,rationalwiki,Con,3
9,Corporal Punishment,rationalwiki,Pro,3


### WEAT for documents (paragraph level, infer about the document's stance) 

In [20]:
# Columns that identify one paragraph of one document.
paragraph_keys = ["paragraph", "topic", "title", "source"]

# Mean distance from each paragraph to each attribute set (Con / Pro arguments).
sim_paragraph_args = (
    wmd_df.groupby(paragraph_keys + ["arg_orient"])
    .wmd_norm.mean()
    .reset_index(name="mean_dist")
)

# One frame per attribute set.
con_args = sim_paragraph_args[sim_paragraph_args.arg_orient == "Con"]
pro_args = sim_paragraph_args[sim_paragraph_args.arg_orient == "Pro"]

# Put both mean distances side by side for paragraphs present in both sets,
# then take their difference (Pro distance minus Con distance).
sim_paragraph_args = pd.merge(
    pro_args.drop(columns="arg_orient").rename(columns={"mean_dist": "pro_dist"}),
    con_args.drop(columns="arg_orient").rename(columns={"mean_dist": "con_dist"}),
    on=paragraph_keys,
)
sim_paragraph_args["diff_dist"] = sim_paragraph_args["pro_dist"] - sim_paragraph_args["con_dist"]

In [21]:
# Abortion paragraphs, split by the wiki they come from.
abortion_rows = sim_paragraph_args[sim_paragraph_args.topic == "Abortion"]
cons_art = abortion_rows[abortion_rows.source == "conservapedia"]
ratio_art = abortion_rows[abortion_rows.source == "rationalwiki"]

In [50]:
from mlxtend.evaluate import permutation_test

# The 'approximate' method samples random permutations; a fixed seed keeps the
# p-values identical from one run to the next.
permutation_seed = 0

# The two sources are selected by name. Picking them by position in .unique()
# would swap the labels whenever the row order of sim_paragraph_args changes.
conservapedia_source = "conservapedia"
rationalwiki_source = "rationalwiki"

sources = sim_paragraph_args.source.unique()
topics = sim_paragraph_args.topic.unique()

significance_test_array = []

for topic in topics:
    topic_rows = sim_paragraph_args[sim_paragraph_args.topic == topic]
    conservapedia = topic_rows[topic_rows.source == conservapedia_source]
    ratiowiki = topic_rows[topic_rows.source == rationalwiki_source]

    # Two-sample permutation test on the per-paragraph distance differences.
    p_value = permutation_test(conservapedia.diff_dist, ratiowiki.diff_dist,
                               method='approximate', num_rounds=10000,
                               seed=permutation_seed)

    cons_mean = conservapedia.diff_dist.mean()
    rwiki_mean = ratiowiki.diff_dist.mean()

    # diff_dist is pro_dist - con_dist, so the source with the larger mean sits
    # relatively farther from the Pro arguments, i.e. leans Con.
    interpret = "cons-pro x rwiki-con"
    if cons_mean > rwiki_mean:
        interpret = "cons-con x rwiki-pro"

    significance_test_array.append([topic, p_value, interpret, cons_mean, rwiki_mean])

significance_test_df = pd.DataFrame(significance_test_array, columns=['topic', 'p_value', 'interpretation','cons_mean_dif', 'rwiki_mean_dif'])

In [51]:
# One row per topic: permutation-test p-value, interpretation label, and the
# mean diff_dist of each source.
significance_test_df

Unnamed: 0,topic,p_value,interpretation,cons_mean_dif,rwiki_mean_dif
0,Health Care Form,0.401,cons-con x rwiki-pro,-0.009929,-0.012056
1,Prostitution,0.0217,cons-con x rwiki-pro,0.014321,-0.001645
2,Iraq War,0.0004,cons-pro x rwiki-con,-0.006217,0.00085
3,Gun Control,0.4583,cons-con x rwiki-pro,0.02603,0.020539
4,Abortion,0.1971,cons-pro x rwiki-con,-0.009452,-0.005178
5,Death Penalty,0.05,cons-pro x rwiki-con,0.0027,0.006806
6,Minimum Wage,0.5324,cons-pro x rwiki-con,-0.009455,-0.005851
7,Climate Change,0.0325,cons-con x rwiki-pro,0.002961,-0.00486
8,Marijuana,0.0,cons-pro x rwiki-con,0.011099,0.032291
9,Video Games,0.0,cons-con x rwiki-pro,0.019632,0.009811


### WEAT for documents (document level, infer about the source's stance)

###### Toy example 

In [None]:
# Hand-made toy rows with the columns content, pro_con, source, topic and wmd_sim.

# Source "con": paragraphs pa1/pa2 against the argument labels c1, c2, p1, p2, p3.
# NOTE(review): not referenced again in the visible part of the notebook.
fake_data = {'content':["pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2"],
             'pro_con':["c1","c1","c2","c2","p1","p1","p2","p2","p3","p3"],
             'source':["con","con","con","con","con","con","con","con","con","con"],
             'topic':["A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

# Source "rat": paragraphs pa1/pa2/pa3 against the same five argument labels.
# NOTE(review): not referenced again in the visible part of the notebook.
fake_data_2 = {'content':["pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3"],
             'pro_con':["c1","c1","c1","c2","c2","c2","p1","p1","p1","p2","p2","p2","p3","p3","p3"],
             'source':["rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat"],
             'topic':["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

# Both sources in one table: the "con" rows first, then the "rat" rows, with
# the argument labels written as "Con n" / "Pro n".
fake_data_3 = {'content':["pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3"],
             'pro_con':["Con 1","Con 1","Con 2","Con 2","Pro 1","Pro 1","Pro 2","Pro 2","Pro 3","Pro 3","Con 1","Con 1","Con 1","Con 2","Con 2","Con 2","Pro 1","Pro 1","Pro 1","Pro 2","Pro 2","Pro 2","Pro 3","Pro 3","Pro 3"],
             'source':["con","con","con","con","con","con","con","con","con","con","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat"],
             'topic':["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6,0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

# Only the combined fixture is turned into a DataFrame here.
fake_wmd_df = pd.DataFrame(fake_data_3) 