In [1]:
import pandas as pd

# Data Import

## Documents preparation

### Arguments reading 

In [2]:
args_json = pd.read_json("crawlers/procon/procon.json")

def format_args_row(row):
    df = row.apply(pd.Series).T
    df.url = df.url[0]
    df.topic = df.topic[0]
    return(df)

arg_df_series = args_json.apply(format_args_row, axis=1)
args_df = pd.concat(arg_df_series.values.tolist(),ignore_index=True)

In [3]:
def orientation(pro_con):
    if "Pro" in pro_con:
        return "Pro"
    return "Con"

args_df = args_df.assign(argument_orient = args_df.apply(lambda row: orientation(row.pro_con), axis=1))

### Documents reading

#### Convert from json to dataframe and add the topic and the source stance information 

In [4]:
import glob
import json

def create_df(path_list, topic, source_stance):
    glob_data = []
    for path in path_list:
        for file in glob.glob(path):
            with open(file) as json_file:
                data = json.load(json_file)
                glob_data.append(data)

        df = pd.DataFrame(glob_data)
        df['text_topic'] = topic
        df['text_source_stance'] = source_stance
    return(df)

In [5]:
path_list = ["../../news-please-repo/data/2020/02/06/abort73.com/*.json"]
topic = "Abortion"
source_stance = "Con"

abort73 = create_df(path_list, topic, source_stance)

In [6]:
path_list = ["../../news-please-repo/data/2020/02/06/mpp.org/*.json","../../news-please-repo/data/2020/02/06/blog.mpp.org/*.json"]
topic = "Marijuana"
source_stance = "Pro"

mpp = create_df(path_list, topic, source_stance)

In [7]:
path_list = ["../../news-please-repo/data/2020/02/06/prochoiceamerica.org/*.json"]
topic = "Abortion"
source_stance = "Pro"

prochoice = create_df(path_list, topic, source_stance)

In [8]:
path_list = ["../../news-please-repo/data/2020/02/06/calmca.org/*.json"]
topic = "Marijuana"
source_stance = "Con"

calmca = create_df(path_list, topic, source_stance)

In [9]:
document_df = pd.concat([abort73, prochoice, mpp, calmca], ignore_index=True)

#### Filter the columns of dataframes to use 

In [10]:
columns = ['authors', 'date_modify', 'date_publish', 'title', 'source_domain', 'text', 'url', 'text_topic', 'text_source_stance']
document_df = pd.DataFrame(document_df, columns=columns)
document_df = document_df.rename(columns={"authors":"text_authors", "title":"text_title", "source_domain":"text_source_domain", "url":"text_url"})

### Preprocessing document's and argument's text

####  Removing rows which text contains None from the data

In [11]:
args_df = args_df[~args_df.argument.isna()]
document_df = document_df[~document_df.text.isna()]

#### Removing stopwords and measuring the resulting text length 

In [12]:
from nltk.corpus import stopwords
from nltk import download
download('stopwords') 

stop_words = stopwords.words('english')

def preprocess_sentence(str_array):
    new_sentences = []
    sentences_size = []
    for text in str_array:
        new_sentence = ""
        for word in text.split():
            if word not in stop_words:
                new_sentence += " " + word.lower()
        sentences_size.append(new_sentence.count(" "))
        new_sentences.append(new_sentence.strip())
        
    return(new_sentences, sentences_size)

[nltk_data] Downloading package stopwords to /home/allan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
args_df['argument_processed'], args_df['argument_processed_size']  = preprocess_sentence(args_df.argument)
document_df['text_processed'], document_df['text_processed_size'] = preprocess_sentence(document_df.text)

### Merge documents and arguments by topic 

In [14]:
args_docs = args_df.merge(document_df, left_on=['topic'], right_on=['text_topic'])

## Paragraphs preparation

In [15]:
wiki_json = pd.read_json("crawlers/procon/wiki.json")

In [16]:
# TODO: Add preprocess_sentence in the paragraphs processing
def format_wiki_row(row):
    df = row.apply(pd.Series).T
    df.source = df.source[0]
    df.title = df.title[0]
    df.topic = df.topic[0]
    df.stance = df.stance[0]
    df.url = df.url[0]
    return(df)

wiki_df_series = wiki_json.apply(format_wiki_row, axis=1)
wiki_df = pd.concat(wiki_df_series.values.tolist(),ignore_index=True)
wiki_df = wiki_df.rename(columns={"topic":"text_topic", "title":"text_title", "content":"text", "source":"text_source_domain", "stance":"text_source_stance", "url":"text_url"})

In [17]:
args_prg = args_df.merge(wiki_df, left_on=['topic'], right_on=['text_topic'])

# Distance computation

In [18]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [19]:
model = KeyedVectors.load_word2vec_format('data/model.bin', binary=True)
model.init_sims(replace=True)

In [20]:
from tqdm import tqdm
import math
tqdm.pandas()

def wmd_computation(df):
    df['wmd'] = df.progress_apply(lambda x: model.wmdistance(x.argument_processed.split(), x.text_processed.split()), axis=1)
    df = df.query("wmd != "+str(math.inf))
    df = df.assign(wmd_norm = (df.wmd - df.wmd.min()) / (df.wmd.max() - df.wmd.min()))
    return(df)

  from pandas import Panel


In [21]:
#args_prg_dist = wmd_computation(args_prg)

In [22]:
#args_prg_dist.to_csv("../data/args_prg_dist.csv", index_label=False)

In [23]:
args_prg_dist = pd.read_csv("data/args_prg_dist.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
#sampled_data = args_docs.sample(4000)
#args_docs_dist = wmd_computation(sampled_data)

In [25]:
#args_docs_dist.to_csv("../data/sampled_args_docs_dist.csv", index_label=False)

In [84]:
args_docs_dist = pd.read_csv("data/sampled_args_docs_dist.csv")

In [85]:
count = 0
idx = []
for txt in args_docs_dist.text:
    count += 1
    if ((topic == "Marijuana") and ("medic" in txt)):        
        idx.append(count)

In [86]:
medic_marijuana_dist = args_docs_dist.iloc[idx]

In [87]:
abortion_dist = args_docs_dist[args_docs_dist.text_topic == "Abortion"]

In [88]:
args_docs_dist = pd.concat([medic_marijuana_dist, abortion_dist])

# WEAT

In [89]:
from mlxtend.evaluate import permutation_test

def weat(dist_df):

    # mean distance of each text to each argument (argument - con/pro) set
    sim_text_args = dist_df.groupby(["text","text_topic","text_title","text_source_domain","argument_orient","text_source_stance"]).wmd_norm.agg(['mean']).reset_index()
    sim_text_args = sim_text_args.rename(columns={"mean":"mean_dist"})

    # difference of distance of each paragraph to the attribute sets
    con_args = sim_text_args[sim_text_args.argument_orient == "Con"]
    pro_args = sim_text_args[sim_text_args.argument_orient == "Pro"]
    sim_text_args = pro_args.merge(con_args, on=["text","text_topic","text_title","text_source_domain","text_source_stance"]).drop(columns=["argument_orient_x", "argument_orient_y"])
    sim_text_args = sim_text_args.rename(columns={"mean_dist_x":"pro_dist","mean_dist_y":"con_dist"})
    sim_text_args["diff_dist"] = sim_text_args.pro_dist - sim_text_args.con_dist

    topics = sim_text_args.text_topic.unique()

    significance_test_array = []

    for topic in topics:
        topic_sources = sim_text_args[sim_text_args.text_topic == topic]
        sources = topic_sources.text_source_domain.unique()
        
        for source in sources:
            source_a = topic_sources[(topic_sources.text_source_domain == sources[0])]
            source_b = topic_sources[(topic_sources.text_source_domain == sources[1])]

            p_value = permutation_test(source_b.diff_dist, source_a.diff_dist,
                                       method='approximate', num_rounds=1000)
            
            interpret = source_b.text_source_domain.iloc[0]+"-Pro x "+source_a.text_source_domain.iloc[0]+"-Con"
            if source_b.diff_dist.mean() > source_a.diff_dist.mean():
                interpret = source_b.text_source_domain.iloc[0]+"-Con x "+source_a.text_source_domain.iloc[0]+"-Pro"
            
            groundtruth = source_b.text_source_domain.iloc[0] + "-" + source_b.text_source_stance.iloc[0] + " x " + source_a.text_source_domain.iloc[0] +"-"+ source_a.text_source_stance.iloc[0]
    
        significance_test_array.append([topic, p_value, interpret, groundtruth, source_b.diff_dist.mean(), source_a.diff_dist.mean()])
    
    significance_test_df = pd.DataFrame(significance_test_array, columns=['topic', 'p_value', 'res_interpretation', 'ground_truth','cons_mean_dif', 'rwiki_mean_dif'])
    
    return(significance_test_df)

In [90]:
weat(args_prg_dist)

['rationalwiki' 'conservapedia']
['conservapedia' 'rationalwiki']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['conservapedia' 'rationalwiki']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['conservapedia' 'rationalwiki']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['rationalwiki' 'conservapedia']
['conservapedia' 'rationalwiki']


Unnamed: 0,topic,p_value,res_interpretation,ground_truth,cons_mean_dif,rwiki_mean_dif
0,Health Care Form,0.926,conservapedia-Pro x rationalwiki-Con,conservapedia-Con x rationalwiki-Pro*,-0.007664,-0.007526
1,Prostitution,0.014,rationalwiki-Pro x conservapedia-Con,rationalwiki-Pro* x conservapedia-Con,0.000779,0.011459
2,Iraq War,0.0,conservapedia-Pro x rationalwiki-Con,conservapedia-Pro x rationalwiki-Con,-0.004981,0.000782
3,Gun Control,0.879,conservapedia-Pro x rationalwiki-Con,conservapedia-Con x rationalwiki-Pro,0.010806,0.011382
4,Abortion,0.0,rationalwiki-Con x conservapedia-Pro,rationalwiki-Pro x conservapedia-Con,-0.001799,-0.007977
5,Video Games,0.0,conservapedia-Con x rationalwiki-Pro,conservapedia-Con- x rationalwiki-Pro-,0.013261,0.003191
6,Death Penalty,0.096,conservapedia-Pro x rationalwiki-Con,conservapedia-Pro x rationalwiki-Con,0.002502,0.004459
7,Marijuana,0.0,rationalwiki-Con x conservapedia-Pro,rationalwiki-Pro x conservapedia-Con,0.020878,0.004745
8,Minimum Wage,0.026,conservapedia-Pro x rationalwiki-Con,conservapedia-Con x rationalwiki-Pro*,-0.009562,-0.00526
9,Climate Change,0.178,conservapedia-Con x rationalwiki-Pro,conservapedia-Con- x rationalwiki-Pro-,-0.00031,-0.003468


In [91]:
weat(args_docs_dist)

['abort73.com' 'prochoiceamerica.org']
['mpp.org']


IndexError: index 1 is out of bounds for axis 0 with size 1