In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
model = KeyedVectors.load_word2vec_format('word_embeddings/model.bin', binary=True)

In [3]:
wiki_json = pd.read_json("../crawlers/procon/wiki.json")
args_json = pd.read_json("../crawlers/procon/procon.json")

#### Format DataFrame rows

In [4]:
def format_args_row(row):
    df = row.apply(pd.Series).T
    df.url = df.url[0]
    df.topic = df.topic[0]
    return(df)

arg_df_series = args_json.apply(format_args_row, axis=1)
args_df = pd.concat(arg_df_series.values.tolist(),ignore_index=True)

In [5]:
def format_wiki_row(row):
    df = row.apply(pd.Series).T
    df.source = df.source[0]
    df.title = df.title[0]
    df.topic = df.topic[0]
    return(df)

wiki_df_series = wiki_json.apply(format_wiki_row, axis=1)
wiki_df = pd.concat(wiki_df_series.values.tolist(),ignore_index=True)

### Preprocess Text

#### Create dataframe which will store the data

In [6]:
wmd_df = args_df.merge(wiki_df)

In [7]:
#wmd_df = wmd_df.head()

#### Lowercase and remove Stopwords

In [8]:
from nltk.corpus import stopwords
from nltk import download
download('stopwords') 

stop_words = stopwords.words('english')

def preprocess_sentence(str_array):
    new_sentences = []
    sentences_size = []
    for text in str_array:
        new_sentence = ""
        for word in text.split():
            if word not in stop_words:
                new_sentence += " " + word.lower()
        sentences_size.append(new_sentence.count(" "))
        new_sentences.append(new_sentence.strip())
        
    return(new_sentences, sentences_size)

[nltk_data] Downloading package stopwords to /home/allan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
wmd_df['argument_processed'], wmd_df['argument_processed_size']  = preprocess_sentence(wmd_df.argument)
wmd_df['content_processed'], wmd_df['content_processed_size'] = preprocess_sentence(wmd_df.content)

In [10]:
#wmd_df = wmd_df.assign(argument_processed = preprocess_sentence(wmd_df.argument),
#                      content_processed = preprocess_sentence(wmd_df.content))

#### Add argument orientation information  

In [11]:
def orientation(pro_con):
    if "Pro" in pro_con:
        return "Pro"
    return "Con"

wmd_df = wmd_df.assign(arg_orient = wmd_df.apply(lambda row: orientation(row.pro_con), axis=1))

#### Remove rows which either the content_processed or argument_processed is too short

Our goal is computing the similarity between an idea (paragraph) and an argument. However, some rows contain only a few words (sometimes only one word), which are not enough for expressing one idea. Thus, we define a variable for determining the minimum size of a paragraph.

In [12]:
paragraph_min_size = 5

wmd_df = wmd_df.query("content_processed_size > " + str(paragraph_min_size)).query("argument_processed_size > " + str(paragraph_min_size))

#### Compute the WMD between arguments and each paragraph

In [None]:
from tqdm import tqdm
tqdm.pandas()

model.init_sims(replace=True)

wmd_df['wmd'] = wmd_df.progress_apply(lambda x: model.wmdistance(x.argument_processed.split(), x.content_processed.split()), axis=1)
wmd_df.to_csv ('wmd_distances.csv', index = None, header=True)
#wmd_df = pd.read_csv("wmd_distances.csv")

  from pandas import Panel
 69%|██████▊   | 30484/44426 [16:45<06:37, 35.03it/s] 

#### Similarity computation 

In [None]:
wmd_df = wmd_df.assign(wmd_sim = (wmd_df.wmd - wmd_df.wmd.min()) / (wmd_df.wmd.max() - wmd_df.wmd.min()))

In [None]:
wmd_df

### WEAT for documents (name it better)

##### Similarity between paragraph and a pro/con set 

In [None]:
sim_paragraph_args = wmd_df.groupby(['topic','source','content',"arg_orient"]).wmd_sim.agg(['mean'])

In [None]:
sim_paragraph_args

In [None]:
sim_paragraph_args.query("topic == 'Abortion'").query("source == 'conservapedia'")

#### toy example 

In [None]:
fake_data = {'content':["pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2"],
             'pro_con':["c1","c1","c2","c2","p1","p1","p2","p2","p3","p3"],
             'source':["con","con","con","con","con","con","con","con","con","con"],
             'topic':["A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

fake_data_2 = {'content':["pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3"],
             'pro_con':["c1","c1","c1","c2","c2","c2","p1","p1","p1","p2","p2","p2","p3","p3","p3"],
             'source':["rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat"],
             'topic':["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

fake_data_3 = {'content':["pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3","pa1","pa2","pa3"],
             'pro_con':["Con 1","Con 1","Con 2","Con 2","Pro 1","Pro 1","Pro 2","Pro 2","Pro 3","Pro 3","Con 1","Con 1","Con 1","Con 2","Con 2","Con 2","Pro 1","Pro 1","Pro 1","Pro 2","Pro 2","Pro 2","Pro 3","Pro 3","Pro 3"],
             'source':["con","con","con","con","con","con","con","con","con","con","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat","rat"],
             'topic':["A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","A"],
             'wmd_sim':[0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6,0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6, 0.8, 0.8, 0.9, 0.7, 0.6]} 

fake_wmd_df = pd.DataFrame(fake_data_3) 