In [22]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Load word vectors from a binary word2vec-format file. The path is relative,
# so it is resolved against the notebook's working directory.
model = KeyedVectors.load_word2vec_format('word_embeddings/model.bin', binary=True)

In [3]:
# Read the two JSON dumps found under ../crawlers/procon into DataFrames.
# Both paths are relative to the notebook's working directory.
wiki_json = pd.read_json("../crawlers/procon/wiki.json")
args_json = pd.read_json("../crawlers/procon/procon.json")

#### Format DataFrame rows

In [4]:
def format_args_row(row):
    """Expand one row of the arguments table into a multi-row DataFrame.

    Each cell of ``row`` is turned into a ``pd.Series`` and the result is
    transposed, so every field becomes a column and every position inside the
    cells becomes a row. The ``url`` and ``topic`` values found at position 0
    are then broadcast to all rows.
    """
    expanded = row.apply(pd.Series).T
    for field in ("url", "topic"):
        # Assigning a scalar to a column fills every row with that value.
        expanded[field] = expanded[field][0]
    return expanded

# Expand every row of args_json into its own frame, then stack the per-row
# frames into a single table with a fresh 0..n-1 index.
arg_df_series = args_json.apply(format_args_row, axis=1)
args_df = pd.concat(
    arg_df_series.tolist(),
    ignore_index=True,
)

In [5]:
def format_wiki_row(row):
    """Expand one row of the wiki table into a multi-row DataFrame.

    Each cell of ``row`` is turned into a ``pd.Series`` and the result is
    transposed, so every field becomes a column and every position inside the
    cells becomes a row. The ``source``, ``title`` and ``topic`` values found
    at position 0 are then broadcast to all rows.
    """
    expanded = row.apply(pd.Series).T
    for field in ("source", "title", "topic"):
        # Assigning a scalar to a column fills every row with that value.
        expanded[field] = expanded[field][0]
    return expanded

# Expand every row of wiki_json into its own frame, then stack the per-row
# frames into a single table with a fresh 0..n-1 index.
wiki_df_series = wiki_json.apply(format_wiki_row, axis=1)
wiki_df = pd.concat(
    wiki_df_series.tolist(),
    ignore_index=True,
)

### Preprocess Text

#### Create the DataFrame that will store the data

In [104]:
# No `on=` is passed, so pandas joins on every column name the two frames have
# in common, using its default inner join.
# NOTE(review): presumably `topic` is the only shared column — confirm, and
# consider passing on="topic" explicitly.
wmd_df = args_df.merge(wiki_df)

In [105]:
#wmd_df = wmd_df.head()

#### Lowercase and remove Stopwords

In [106]:
from nltk.corpus import stopwords
from nltk import download
# Fetch the NLTK stopword corpus; the saved cell output shows it reports
# "already up-to-date" when the package is present locally.
download('stopwords') 

# English stopword list consulted by preprocess_sentence.
stop_words = stopwords.words('english')

def preprocess_sentence(str_array, stop_word_set=None):
    """Tokenise, lowercase and stopword-filter a collection of texts.

    Parameters
    ----------
    str_array : iterable of str
        Texts to process (for example a DataFrame column).
    stop_word_set : iterable of str, optional
        Lowercase words to drop. When omitted, the notebook-level
        ``stop_words`` list is used.

    Returns
    -------
    list of list of str
        One list of lowercase, whitespace-split tokens per input text, in
        input order.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    # A set makes each membership test O(1) instead of scanning a list.
    blocked = set(stop_word_set)

    new_sentences = []
    for text in str_array:
        # Lowercase BEFORE the stopword test: the stopword list is lowercase,
        # so testing the raw token let capitalised stopwords ("The", "And")
        # slip through the filter.
        tokens = [word.lower() for word in text.split()]
        new_sentences.append([token for token in tokens if token not in blocked])
    return new_sentences

[nltk_data] Downloading package stopwords to /home/allan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
# Add one token-list column per text column; the raw text columns are kept.
wmd_df = wmd_df.assign(
    argument_processed=preprocess_sentence(wmd_df["argument"]),
    content_processed=preprocess_sentence(wmd_df["content"]),
)

#### Compute the WMD between arguments and each paragraph

In [109]:
from tqdm import tqdm
# Register tqdm's progress_apply on pandas objects so the long row-wise apply
# below reports progress.
tqdm.pandas()

# Precompute normalised vectors, replacing the originals (replace=True), before
# any distance is computed.
# NOTE(review): init_sims is deprecated in gensim 4.x — confirm the installed
# version before re-running.
model.init_sims(replace=True)

#wmd_df['wmd'] = wmd_df.apply(lambda x: model.wmdistance(x.argument_processed, x.content_processed), axis=1)
# Word Mover's Distance between the argument tokens and the paragraph tokens of
# each row. The saved output shows 97673 rows taking about 31 minutes.
# NOTE(review): gensim documents that wmdistance returns inf when a document
# has no in-vocabulary words — check for inf before normalising this column.
wmd_df['wmd'] = wmd_df.progress_apply(lambda x: model.wmdistance(x.argument_processed, x.content_processed), axis=1)

  from pandas import Panel
100%|██████████| 97673/97673 [31:22<00:00, 51.88it/s]  


In [110]:
#wmd_df.to_csv ('wmd_distances.csv', index = None, header=True)

#### Similarity computation 

In [19]:
def similarity_computation(X):
    """Turn distances into similarities in [0, 1] via min-max scaling.

    The smallest finite distance maps to similarity 1 and the largest finite
    distance maps to similarity 0.

    Parameters
    ----------
    X : pandas.Series (or numpy array) of float
        Distances. May contain ``inf`` (gensim's ``wmdistance`` is documented
        to return ``inf`` when a document has no in-vocabulary words) and NaN.

    Returns
    -------
    Same type as ``X``
        ``1 - (X - min) / (max - min)`` computed over the finite values.
        Infinite distances get similarity 0, NaN stays NaN, and when all
        finite distances are equal every finite entry gets similarity 1.
    """
    # Scale against finite values only: a single inf would otherwise make the
    # range infinite, collapsing every finite similarity to 1 and turning the
    # inf entries into NaN (inf / inf).
    finite_values = X[np.isfinite(X)]
    if len(finite_values) == 0:
        lowest, span = 0.0, 0.0
    else:
        lowest = finite_values.min()
        span = finite_values.max() - lowest

    if span == 0:
        # Constant input: avoid 0 / 0 and treat every finite distance as the
        # minimum (normalised distance 0).
        norm_X = X - lowest
    else:
        norm_X = (X - lowest) / span

    sim_X = 1 - norm_X
    # inf distances produce -inf here; clip them to the lowest similarity.
    return sim_X.clip(0, 1)

# Attach the similarity score derived from the raw WMD column.
wmd_df = wmd_df.assign(
    wmd_sim=similarity_computation(wmd_df["wmd"]),
)

#### WEAT for documents (TODO: find a better name)

In [48]:
# Keep only the rows whose topic is exactly "Euthanasia".
# NOTE(review): df_filtered is not referenced again in the cells visible here.
df_filtered = wmd_df.query('topic == "Euthanasia"') 

In [89]:
# Toy fixtures for trying out the aggregation on a small, hand-checkable table.
# Every fixture cycles the same five similarity scores.

# Two paragraphs x five arguments, all from source "con".
fake_data = {
    'content': ["pa1", "pa2"] * 5,
    'pro_con': [label for label in ("c1", "c2", "p1", "p2", "p3") for _ in range(2)],
    'source': ["con"] * 10,
    'topic': ["A"] * 10,
    'wmd_sim': [0.8, 0.8, 0.9, 0.7, 0.6] * 2,
}

# Three paragraphs x five arguments, all from source "rat".
fake_data_2 = {
    'content': ["pa1", "pa2", "pa3"] * 5,
    'pro_con': [label for label in ("c1", "c2", "p1", "p2", "p3") for _ in range(3)],
    'source': ["rat"] * 15,
    'topic': ["A"] * 15,
    'wmd_sim': [0.8, 0.8, 0.9, 0.7, 0.6] * 3,
}

# Both sources stacked: the 10 "con" rows followed by the 15 "rat" rows, with
# argument labels spelled "Con N" / "Pro N".
fake_data_3 = {
    'content': ["pa1", "pa2"] * 5 + ["pa1", "pa2", "pa3"] * 5,
    'pro_con': (
        [label for label in ("Con 1", "Con 2", "Pro 1", "Pro 2", "Pro 3") for _ in range(2)]
        + [label for label in ("Con 1", "Con 2", "Pro 1", "Pro 2", "Pro 3") for _ in range(3)]
    ),
    'source': ["con"] * 10 + ["rat"] * 15,
    'topic': ["A"] * 25,
    'wmd_sim': [0.8, 0.8, 0.9, 0.7, 0.6] * 5,
}

fake_wmd_df = pd.DataFrame(fake_data_3)

In [97]:
def orientation(pro_con):
    """Classify an argument label as "Pro" or "Con".

    Returns "Pro" when the label contains the substring "Pro" (the check is
    case-sensitive); any other label is reported as "Con".
    """
    return "Pro" if "Pro" in pro_con else "Con"

# Derive the Pro/Con orientation of each argument from its label. Mapping over
# the single column avoids building a row object per record, as a row-wise
# apply would, and still yields an empty column when the frame has no rows.
fake_wmd_df["arg_orient"] = fake_wmd_df["pro_con"].map(orientation)


In [99]:
# Display the toy frame to check the derived arg_orient column.
# NOTE(review): the saved output lists only 10 rows although fake_data_3
# defines 25 — re-run this cell to refresh it.
fake_wmd_df

Unnamed: 0,content,pro_con,source,topic,wmd_sim,arg_orient
0,pa1,Con 1,con,A,0.8,Con
1,pa2,Con 1,con,A,0.8,Con
2,pa1,Con 2,con,A,0.9,Con
3,pa2,Con 2,con,A,0.7,Con
4,pa1,Pro 1,con,A,0.6,Pro
5,pa2,Pro 1,con,A,0.8,Pro
6,pa1,Pro 2,con,A,0.8,Pro
7,pa2,Pro 2,con,A,0.9,Pro
8,pa1,Pro 3,con,A,0.7,Pro
9,pa2,Pro 3,con,A,0.6,Pro


##### Similarity between a paragraph and a Pro (or Con) argument set

In [102]:
# Mean similarity of each paragraph to its Pro and to its Con arguments,
# computed separately for every topic and source.
(
    fake_wmd_df
    .groupby(['topic', 'source', 'content', "arg_orient"])["wmd_sim"]
    .agg(['mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean
topic,source,content,arg_orient,Unnamed: 4_level_1
A,con,pa1,Con,0.85
A,con,pa1,Pro,0.7
A,con,pa2,Con,0.75
A,con,pa2,Pro,0.766667
A,rat,pa1,Con,0.75
A,rat,pa1,Pro,0.766667
A,rat,pa2,Con,0.7
A,rat,pa2,Pro,0.8
A,rat,pa3,Con,0.85
A,rat,pa3,Pro,0.7
