In [1]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import unicodedata
import nltk
import gensim
import math
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

In [18]:
# Read the three tab-separated question/answer files (the S10 file is decoded
# as ISO-8859-1) and stack them into a single frame.
frames = [
    pd.read_csv('./S08_question_answer_pairs.txt', sep='\t'),
    pd.read_csv('./S09_question_answer_pairs.txt', sep='\t'),
    pd.read_csv('./S10_question_answer_pairs.txt', sep='\t', encoding='ISO-8859-1'),
]
df1, df2, df3 = frames
df = pd.concat(frames)

def getArticleText(file):
    """Return the full text of one cleaned article.

    Args:
        file: Article file stem; the text is read from
            ``./text_data/<file>.txt.clean``.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    fpath = './text_data/'+file+'.txt.clean'
    try:
        # Try UTF-8 explicitly instead of the platform default encoding so the
        # result does not depend on the machine's locale.
        with open(fpath, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Fall back to ISO-8859-1, which can decode any byte sequence.
        with open(fpath, 'r', encoding='ISO-8859-1') as f:
            return f.read()

# Keep only rows that name an article file, load each article's text with runs
# of newlines rewritten as '. ', then discard the columns that are not used.
df = df.dropna(subset=['ArticleFile'])
df['ArticleText'] = df['ArticleFile'].apply(
    lambda name: re.sub(r'(\n)+', '. ', getArticleText(name))
)
df = df.drop(columns=['DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'ArticleFile'])

def cleanQuestion(text):
    """Normalise a question: lower-case it and strip punctuation.

    Args:
        text: Any value; it is passed through ``str()`` first, so a missing
            value such as ``float('nan')`` becomes the literal token ``nan``.

    Returns:
        The lower-cased words joined by single spaces, with every character
        that is neither a word character nor whitespace removed.
    """
    text = str(text).lower()
    return " ".join(re.sub(r'[^\w\s]', '', text).split())

def cleanAnswer(text):
    """Normalise an answer: lower-case it and strip punctuation.

    Args:
        text: Any value; it is passed through ``str()`` first, so a missing
            value such as ``float('nan')`` becomes the literal token ``nan``.

    Returns:
        The lower-cased words joined by single spaces, with every character
        that is neither a word character nor whitespace removed.
    """
    text = str(text).lower()
    return " ".join(re.sub(r'[^\w\s]', '', text).split())

def cleanText(text):
    """Normalise article text while keeping sentence delimiters.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        The lower-cased words joined by single spaces. Every character that is
        not a word character, whitespace, ``.`` or ``?`` is removed, so the
        periods and question marks remain in the result.
    """
    text = str(text).lower()
    return " ".join(re.sub(r'[^\w\s\.\?]', '', text).split())

# Normalise each text column with its dedicated cleaning function.
for column, cleaner in (
    ('Question', cleanQuestion),
    ('Answer', cleanAnswer),
    ('ArticleText', cleanText),
):
    df[column] = df[column].apply(cleaner)

In [19]:
# Build the training corpus: a list of token lists. Whenever the article title
# differs from the previously seen one, the article's non-empty '.'-separated
# sentences are added; the row's question and answer are always added.
# NOTE(review): only every second row is visited (step of 2) — presumably
# because each question is listed twice in the source files; confirm.
dataset = []
title = ""
for i in range(0, len(df), 2):
    row = df.iloc[i]
    this_title = row['ArticleTitle']
    if this_title != title:
        title = this_title
        dataset.extend(
            sentence.split()
            for sentence in row['ArticleText'].split(sep='.')
            if sentence != ''
        )
    dataset.append(row['Question'].split())
    dataset.append(row['Answer'].split())

In [63]:
# Create a Word2Vec model over the token lists in `dataset`: 100-dimensional
# vectors, a context window of 8, and min_count=0 so that no word is dropped
# for being rare.
# NOTE(review): gensim presumably already trains once inside the constructor
# when a corpus is passed — confirm against the gensim documentation.
model = gensim.models.Word2Vec(dataset, vector_size=100, window=8, min_count=0, sg=0, workers=8) # I have 8 cpu cores
# sg = {0, 1} – Training algorithm: 1 for skip-gram; otherwise CBOW

In [64]:
# Run 50 training epochs over the same corpus; the value returned by train()
# is echoed as this cell's output.
# compute_loss=True is requested, but no visible cell reads the loss afterwards.
model.train(dataset, total_examples=len(dataset), compute_loss=True, epochs=50)

(25701047, 32457850)

In [69]:
def get_embedding(sentence, wv=None):
    """Return the mean word vector of a sentence.

    Args:
        sentence: Whitespace-separated text.
        wv: Optional word-vector lookup supporting ``wv[word]`` (raising
            ``KeyError`` for unknown words) and a ``vector_size`` attribute.
            Defaults to the vectors of the notebook-level ``model``.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``wv.vector_size`` holding the
        average of the vectors of all words found in ``wv``. If no word is
        found, a zero vector of that length is returned.
    """
    if wv is None:
        wv = model.wv

    # Accumulate in an ndarray: with a plain list, ``+=`` would extend the
    # list rather than add element-wise, and ``/=`` would raise TypeError.
    pos_sum = np.zeros(wv.vector_size, dtype=float)
    num = 0
    for word in sentence.split():
        try:
            embed = wv[word]
        except KeyError:
            # Out-of-vocabulary words do not contribute to the average.
            continue
        pos_sum += embed
        num += 1

    if num:
        pos_sum /= num
    return pos_sum

def get_answer(question, answer_para):
    """Pick the candidate sentence closest to the question in embedding space.

    Both the question and every candidate have stop words removed and are
    embedded with ``get_embedding``; closeness is the Euclidean norm of the
    difference of the two embeddings.

    Args:
        question: The question text.
        answer_para: Indexable collection of candidate sentences.

    Returns:
        The element of ``answer_para`` with the smallest distance; on ties the
        earliest candidate is kept.
    """
    query_vec = get_embedding(rem_stop(question))

    best_idx = 0
    best_dist = math.inf
    for idx, candidate in enumerate(answer_para):
        candidate_vec = get_embedding(rem_stop(candidate))
        dist = np.linalg.norm(query_vec - candidate_vec)
        if not (dist < best_dist):
            continue
        best_idx, best_dist = idx, dist
    return answer_para[best_idx]

def rem_stop(sentence, stop_words=None):
    """Remove stop words from a sentence.

    Args:
        sentence: Whitespace-separated text.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            English stop-word list.

    Returns:
        The remaining words, in their original order, joined by single spaces
        (an empty string if nothing remains).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Load the list once per call and use a set, instead of re-reading the
    # corpus and scanning it linearly for every token.
    stop_words = frozenset(stop_words)
    return ' '.join(token for token in sentence.split() if token not in stop_words)

def get_answer_cosine(question, answer_para):
    """Pick the candidate sentence most similar to the question.

    Both the question and every candidate have stop words removed and are
    embedded with ``get_embedding``; similarity is the cosine similarity of
    the two embeddings.

    Args:
        question: The question text.
        answer_para: Sequence of candidate sentences.

    Returns:
        The element of ``answer_para`` with the highest cosine similarity (the
        earliest one on ties), or ``None`` when ``answer_para`` is empty.
    """
    if len(answer_para) == 0:
        # Preserve the previous result for an empty candidate list.
        return None

    # cosine_similarity expects 2-D inputs, hence the added leading axis.
    question_embedding = np.expand_dims(get_embedding(rem_stop(question)), 0)
    max_similarity = -math.inf
    answer = 0
    for i, candidate in enumerate(answer_para):
        answer_embedding = np.expand_dims(get_embedding(rem_stop(candidate)), 0)
        # The result is a 1x1 matrix; take its single entry as a scalar.
        similarity = cosine_similarity(question_embedding, answer_embedding)[0, 0]
        if similarity > max_similarity:
            answer = i
            max_similarity = similarity
    # Return only after every candidate has been scored; returning inside the
    # loop would always yield the first candidate.
    return answer_para[answer]

In [70]:
# Select one example row and split its article into non-empty candidate
# sentences on '.'.
index = 296
my_text = df.iloc[index]['ArticleText']
temp_sentences = my_text.split(sep='.')
sentences = [piece for piece in temp_sentences if piece != '']
my_question = df.iloc[index]['Question']

In [71]:
print(my_question) # Actual question
print(rem_stop(my_question)) # Question with stop words removed
print(df.iloc[index]['Answer']) # Actual answer

what was the consitution act formerly called
consitution act formerly called
british north america act


In [68]:
# Print the sentence chosen by each retrieval strategy, separated by blank lines.
# NOTE(review): this cell's recorded execution count is lower than that of the
# cell defining these functions, so the saved output may come from an earlier
# version of them — re-run after a kernel restart to confirm.
print(get_answer(my_question, sentences)) # Our model's prediction using euclidean distance
print("\n")
print(get_answer_cosine(my_question, sentences)) # Our model's prediction using cosine similarity

4 4
1 1
16 16
13 13
5 5
12 12
10 10
13 13
24 24
16 16
9 9
21 21
2 2
4 4
6 6
15 15
12 12
8 8
15 15
15 15
13 13
14 14
9 9
9 9
6 6
10 10
15 15
12 12
27 27
18 18
6 6
9 9
20 20
7 7
11 11
11 11
7 7
21 21
14 14
7 7
2 2
8 8
31 31
13 13
9 9
10 10
19 19
9 9
8 8
7 7
10 10
5 5
8 8
7 7
13 13
8 8
9 9
8 8
27 27
11 11
19 19
10 10
26 26
21 21
15 15
3 3
12 12
26 26
18 18
15 15
8 8
14 14
16 16
13 13
7 7
15 15
18 18
8 8
20 20
12 12
13 13
7 7
12 12
9 9
7 7
7 7
14 14
2 2
1 1
14 14
7 7
5 5
7 7
7 7
13 13
15 15
23 23
19 19
10 10
18 18
13 13
11 11
3 3
24 24
10 10
8 8
22 22
20 20
16 16
3 3
22 22
17 17
26 26
11 11
20 20
11 11
17 17
13 13
17 17
18 18
6 6
14 14
7 7
12 12
16 16
1 1
8 8
16 16
9 9
14 14
9 9
8 8
21 21
3 3
16 16
15 15
20 20
12 12
9 9
14 14
2 2
5 5
22 22
11 11
10 10
24 24
7 7
5 5
9 9
12 12
8 8
22 22
13 13
3 3
16 16
33 33
12 12
16 16
5 5
1 1
3 3
16 16
7 7
8 8
12 12
11 11
22 22
9 9
9 9
3 3
20 20
17 17
15 15
11 11
22 22
4 4
6 6
4 4
6 6
5 5
4 4
8 8
29 29
12 12
8 8
18 18
14 14
8 8
6 6
5 5
14 14
19 19
8 8
17 1