In [1]:
import numpy
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
model = KeyedVectors.load_word2vec_format('word_embeddings/model.bin', binary=True)

In [3]:
wiki_json = pd.read_json("../crawlers/procon/wiki.json")
args_json = pd.read_json("../crawlers/procon/procon.json")

#### Format DataFrame rows

In [4]:
def format_args_row(row):
    df = row.apply(pd.Series).T
    df.url = df.url[0]
    df.topic = df.topic[0]
    return(df)

arg_df_series = args_json.apply(format_args_row, axis=1)
args_df = pd.concat(arg_df_series.values.tolist(),ignore_index=True)

In [5]:
def format_wiki_row(row):
    df = row.apply(pd.Series).T
    df.source = df.source[0]
    df.title = df.title[0]
    df.topic = df.topic[0]
    return(df)

wiki_df_series = wiki_json.apply(format_wiki_row, axis=1)
wiki_df = pd.concat(wiki_df_series.values.tolist(),ignore_index=True)

### Preprocess Text

#### Create dataframe which will store the data

In [6]:
wmd_df = args_df.merge(wiki_df)

In [7]:
wmd_df = wmd_df.head()

#### Lowercase and remove Stopwords

In [8]:
from nltk.corpus import stopwords
from nltk import download
download('stopwords') 

stop_words = stopwords.words('english')


def preprocess_sentence(str_array):
    new_sentences = []
    for text in str_array: #wmd_df.argument:
        new_sentence = []
        for word in text.split():
            if word not in stop_words:
                new_sentence.append(word.lower())
        new_sentences.append(new_sentence)
    return(new_sentences)

[nltk_data] Downloading package stopwords to /home/allan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
wmd_df = wmd_df.assign(argument_processed = preprocess_sentence(wmd_df.argument),
                      content_processed = preprocess_sentence(wmd_df.content))

#### Compute the WMD between arguments and each paragraph

In [10]:
wmd_df['wmd'] = wmd_df.apply(lambda x: model.wmdistance(x.argument_processed, x.content_processed), axis=1)

In [11]:
wmd_df

Unnamed: 0,url,topic,pro_con,argument_topic,argument,argument_summary,source,title,content,argument_processed,content_processed,wmd
0,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Euthanasia,Euthanasia Society lists Margaret Sanger as a ...,"[the, right, die, matter, personal, choice., w...","[euthanasia, society, lists, margaret, sanger,...",3.412289
1,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Euthanasia,Euthanasia (from the Greek for good death) is ...,"[the, right, die, matter, personal, choice., w...","[euthanasia, (from, greek, good, death), polit...",2.688866
2,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Euthanasia,Most libertarians and many liberals support le...,"[the, right, die, matter, personal, choice., w...","[most, libertarians, many, liberals, support, ...",2.668337
3,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Euthanasia,The widely reported withdrawal of a feeding tu...,"[the, right, die, matter, personal, choice., w...","[the, widely, reported, withdrawal, feeding, t...",2.767987
4,https://euthanasia.procon.org/view.resource.ph...,Euthanasia,Pro 1,Legalization,The right to die should be a matter of persona...,,conservapedia,Euthanasia,Of recent concern are erroneous reports (July ...,"[the, right, die, matter, personal, choice., w...","[of, recent, concern, erroneous, reports, (jul...",2.925021
