In [18]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Load the three tab-separated question/answer dumps (sets S08, S09 and S10).
df1 = pd.read_csv(
    './data/S08_question_answer_pairs.txt',
    sep='\t',
)
df2 = pd.read_csv(
    './data/S09_question_answer_pairs.txt',
    sep='\t',
)
# S10 is decoded as ISO-8859-1 instead of the default encoding
# (presumably it is not valid UTF-8 -- TODO confirm).
df3 = pd.read_csv(
    './data/S10_question_answer_pairs.txt',
    sep='\t',
    encoding='ISO-8859-1',
)

In [20]:
# Preview the first rows of the S08 frame to check the column layout.
df1.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.,easy,easy,S08_set3_a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.,medium,medium,S08_set3_a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.,medium,medium,S08_set3_a4


In [21]:
# Stack the three yearly frames into one. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the equivalent call and, like
# append, keeps each frame's original row labels (so labels repeat).
all_data = pd.concat([df1, df2, df3])
# NOTE(review): if info() reports a column that is not part of the expected
# schema, one of the input files likely contains non-data lines -- verify.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1735 entries, 0 to 10
Data columns (total 7 columns):
 #   Column                                                   Non-Null Count  Dtype 
---  ------                                                   --------------  ----- 
 0   ArticleTitle                                             1715 non-null   object
 1   Question                                                 1696 non-null   object
 2   Answer                                                   1475 non-null   object
 3   DifficultyFromQuestioner                                 1224 non-null   object
 4   DifficultyFromAnswerer                                   1473 non-null   object
 5   ArticleFile                                              1713 non-null   object
 6   The destination name is too long (785), reducing to 236  20 non-null     object
dtypes: object(7)
memory usage: 108.4+ KB


In [22]:
# Prefix every question with its article title (underscores turned into
# spaces), then keep only the two columns used from here on.
all_data = all_data.assign(
    Question=lambda frame: frame['ArticleTitle'].str.replace('_', ' ') + ' ' + frame['Question']
)[['Question', 'Answer']]
all_data.shape

(1735, 2)

In [23]:
# Show the first ten title-prefixed questions.
all_data["Question"].head(10)

0    Abraham Lincoln Was Abraham Lincoln the sixtee...
1    Abraham Lincoln Was Abraham Lincoln the sixtee...
2    Abraham Lincoln Did Lincoln sign the National ...
3    Abraham Lincoln Did Lincoln sign the National ...
4     Abraham Lincoln Did his mother die of pneumonia?
5     Abraham Lincoln Did his mother die of pneumonia?
6    Abraham Lincoln How many long was Lincoln's fo...
7    Abraham Lincoln How many long was Lincoln's fo...
8    Abraham Lincoln When did Lincoln begin his pol...
9    Abraham Lincoln When did Lincoln begin his pol...
Name: Question, dtype: object

In [24]:
# Keep only the first row for each distinct question text.
# NOTE(review): this runs before the dropna() in a later cell, so a question
# whose first copy has a missing Answer is lost even when a later copy has one
# -- consider filtering missing answers first.
all_data = all_data.drop_duplicates(subset=['Question'], keep='first')
all_data.head(10)

Unnamed: 0,Question,Answer
0,Abraham Lincoln Was Abraham Lincoln the sixtee...,yes
2,Abraham Lincoln Did Lincoln sign the National ...,yes
4,Abraham Lincoln Did his mother die of pneumonia?,no
6,Abraham Lincoln How many long was Lincoln's fo...,18 months
8,Abraham Lincoln When did Lincoln begin his pol...,1832
10,Abraham Lincoln What did The Legal Tender Act ...,"the United States Note, the first paper curren..."
12,Abraham Lincoln Who suggested Lincoln grow a b...,11-year-old Grace Bedell
14,Abraham Lincoln When did the Gettysburg addres...,1776
16,Abraham Lincoln Did Lincoln beat John C. Breck...,yes
18,Abraham Lincoln Was Abraham Lincoln the first ...,No


In [25]:
# Row/column count after de-duplicating on Question.
all_data.shape

(1034, 2)

In [26]:
# Discard every row in which the question or the answer is missing.
all_data = all_data.dropna(axis='index', how='any')
all_data.shape

(901, 2)

In [27]:
stopwords_list = stopwords.words('english')
# Set copies for constant-time membership tests; the list above is kept
# unchanged for any other cell that reads it.
_stopword_set = frozenset(stopwords_list)
_punctuation_chars = frozenset(string.punctuation)

lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by pos_tag -> WordNet part-of-speech
# constant. Any other tag falls back to NOUN inside my_tokenizer.
_TAG_PREFIX_TO_WORDNET = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}


def my_tokenizer(doc):
    """Tokenize `doc` into lemmas, dropping stop words and punctuation.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``
    before any filtering, so the tagger sees the complete token sequence.
    Each surviving token is lemmatized with the WordNet part of speech
    derived from the first letter of its tag (NOUN when the tag starts
    with none of J, V, N, R).

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lemmas of the retained tokens, in their original order.
    """
    lemmas = []
    for token, tag in pos_tag(word_tokenize(doc)):
        # Stop words are compared case-insensitively.
        if token.lower() in _stopword_set:
            continue
        # Drop tokens made up solely of punctuation characters. The previous
        # `token in string.punctuation` was a substring test, so multi-character
        # tokens such as "``", "''", "..." or "--" were not filtered out.
        if all(ch in _punctuation_chars for ch in token):
            continue
        pos = _TAG_PREFIX_TO_WORDNET.get(tag[:1], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, pos))

    return lemmas

In [28]:
# token_pattern=None: the built-in regex tokenizer is never used once a custom
# tokenizer is supplied, and leaving the pattern at its default makes
# scikit-learn emit a UserWarning about the unused parameter.
tfidf_vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, token_pattern=None)
# One TF-IDF row per question, in the same order as the rows of all_data;
# the column is iterated directly, so no intermediate tuple copy is needed.
tfidf_matrix = tfidf_vectorizer.fit_transform(all_data['Question'])
print(tfidf_matrix.shape)

(901, 1944)


In [29]:
def ask_question(question, min_similarity=0.0):
    """Print the stored question most similar to `question` and its answer.

    The query is vectorized with the fitted ``tfidf_vectorizer`` and compared
    by cosine similarity against every row of ``tfidf_matrix``. Row ``i`` of
    the matrix is looked up as positional row ``i`` of ``all_data``.

    Parameters
    ----------
    question : str
        Free-text question to look up.
    min_similarity : float, optional
        Smallest cosine similarity accepted as a match. The default of 0.0
        accepts any match with a strictly positive similarity.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    query_vect = tfidf_vectorizer.transform([question])
    scores = cosine_similarity(query_vect, tfidf_matrix).ravel()

    print('Your question:', question)

    # No stored questions at all: argmax would raise on an empty array.
    if scores.size == 0:
        print('No sufficiently similar question found.')
        return

    best_idx = int(np.argmax(scores))
    best_score = scores[best_idx]

    # When the query shares no term with the fitted vocabulary every score is
    # 0 and argmax returns row 0, which would present an unrelated question as
    # the "closest" one. Report the absence of a match instead.
    if best_score <= 0 or best_score < min_similarity:
        print('No sufficiently similar question found.')
        return

    best_row = all_data.iloc[best_idx]
    print('Closest question found:', best_row['Question'])
    print('Similarity: {:.2%}'.format(best_score))
    print('Answer:', best_row['Answer'])

In [30]:
# Probe: a question about the start of Lincoln's political career, phrased as a statement.
ask_question('When Abraham Lincoln started his political career')

Your question: When Abraham Lincoln started his political career
Closest question found: Abraham Lincoln Did Lincoln start his political career in 1832?
Similarity: 87.47%
Answer: Yes


In [31]:
# Probe: the first name is spelled "Nicola" here; check how the lookup copes with that spelling.
ask_question('Where was Nicola Tesla born')

Your question: Where was Nicola Tesla born
Closest question found: Nikola Tesla Are there at least two films describing Tesla 's life ?
Similarity: 41.74%
Answer: Yes.


In [32]:
# Probe: a question that presumably has no counterpart in the corpus -- TODO confirm.
ask_question('Can whales fly')

Your question: Can whales fly
Closest question found: turtle Are turtles a part of the Flying Spaghetti Monster?
Similarity: 41.42%
Answer: *shrug*


In [33]:
# Probe: a question about Alessandro Volta; check whether the match is about him or merely shares the word "Volta".
ask_question('Was Alessandro Volta taught in public schools?')

Your question: Was Alessandro Volta taught in public schools?
Closest question found: Ghana What is the weather like at Lake Volta?
Similarity: 31.84%
Answer: Warm and comparatively dry


In [34]:
# Probe: a question about crime rates in Brazil.
ask_question('How high are crime rates in Brazil')

Your question: How high are crime rates in Brazil
Closest question found: Canada What is Canada's national unemployment rate?
Similarity: 36.77%
Answer: While as of October 2007, Canada's national unemployment rate of 5.9% is its lowest in 33 years. Provincial unemployment rates vary from a low of 3.6% in Alberta to a high of 14.6% in Newfoundland and Labrador. 
