### KT-ASSISTANT ###
### NLP Project Group 8 ###

In [2]:
import sklearn as sk
import numpy as np
import pandas as pd
import json
from torch.utils.data import Dataset, DataLoader


class BQDataset():
    def __init__(self, path):
        self.dataset = open(path,encoding="utf-8")

        self.dataset = [json.loads(instance) for instance in self.dataset ]

        self.passages = []
        self.questions = []
        self.answers = []
        self.titles = []

        for inst in self.dataset:
            self.passages.append(inst["passage"])
            self.questions.append(inst["question"])
            self.answers.append(inst["answer"])
            self.titles.append(inst["title"])

    def get_dataset(self):
        return self.dataset

    def get_split(self):
        return self.passages, self.questions, self.answers

bqd = BQDataset("datasets/train.jsonl")
dataset = bqd.get_dataset()

### Preprocessing

In [3]:
def clean(text, stem_words=True):
    import re    # for regular expressions
    from string import punctuation
    from nltk.stem import SnowballStemmer    #if you are brave enough to do stemming
    from nltk.corpus import stopwords      #if you want to remove stopwords
    
    if type(text) != str or text=='':
        return ''

    text = re.sub("\'s", " ", text) # we have cases like "Sam is" or "Sam's" (i.e. his) these two cases aren't separable, I choose to compromise are kill "'s" directly
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)

    ### YOUR CODE HERE
    text = re.sub("can't", "cannot", text, flags=re.IGNORECASE)
    text = re.sub("don't", "do not", text, flags=re.IGNORECASE)
    text = re.sub("won't", "will not", text, flags=re.IGNORECASE)
    text = re.sub("shouldn't", "should not", text, flags=re.IGNORECASE)
    text = re.sub("couldn't", "could not", text, flags=re.IGNORECASE)
    text = re.sub("isn't", "is not", text, flags=re.IGNORECASE)
    text = re.sub("wasn't", "was not", text, flags=re.IGNORECASE)
    text = re.sub("weren't", "were not", text, flags=re.IGNORECASE)
    text = re.sub("haven't", "have not", text, flags=re.IGNORECASE)
    text = re.sub("hasn't", "has not", text, flags=re.IGNORECASE)

    text = re.sub(r"[0-9]-[0-9]", " minus ", text)
    text = re.sub("-", " ", text)

    digit_letters = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    for i in range(len(digit_letters)):
        regex = rf"(?<=\b){str(i)}(?=\b)"
        text = re.sub(regex, digit_letters[i], text)
    
    # remove comma between numbers, i.e. 15,000 -> 15000
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    
    # Return a list of words
    return text


In [4]:
def tokenize(text):

    # Split by whitespace
    text = text.split(" ")

    return text

In [5]:
# Apply pre-processing to data

for row in dataset:

    row["question"] = clean(row["question"])
    row["question"] = tokenize(row["question"])

    row["passage"] = clean(row["passage"])
    row["passage"] = tokenize(row["passage"])

In [6]:
print(dataset[0])

{'question': ['do', 'iran', 'and', 'afghanistan', 'speak', 'the', 'same', 'language'], 'title': 'Persian language', 'answer': True, 'passage': ['Persian', '(/ˈpɜːrʒən,', '', 'ʃən/),', 'also', 'known', 'by', 'its', 'endonym', 'Farsi', '(فارسی', 'fārsi', '(fɒːɾˈsiː)', '(', 'listen)),', 'is', 'one', 'of', 'the', 'Western', 'Iranian', 'languages', 'within', 'the', 'Indo', 'Iranian', 'branch', 'of', 'the', 'Indo', 'European', 'language', 'family.', 'It', 'is', 'primarily', 'spoken', 'in', 'Iran,', 'Afghanistan', '(officially', 'known', 'as', 'Dari', 'since', '1958),', 'and', 'Tajikistan', '(officially', 'known', 'as', 'Tajiki', 'since', 'the', 'Soviet', 'era),', 'and', 'some', 'other', 'regions', 'which', 'historically', 'were', 'Persianate', 'societies', 'and', 'considered', 'part', 'of', 'Greater', 'Iran.', 'It', 'is', 'written', 'in', 'the', 'Persian', 'alphabet,', 'a', 'modified', 'variant', 'of', 'the', 'Arabic', 'script,', 'which', 'itself', 'evolved', 'from', 'the', 'Aramaic', 'alpha

### Baseline Model

In [5]:
from gensim.models import Word2Vec

embedding_size = 100

passages, questions, answers = bqd.get_split()

sentences = passages.extend(questions)

s_ = []
for sentence in sentences:

    s_.append(sentence.split(" "))

sentences = s_

model = Word2Vec(sentences=sentences,vector_size=embedding_size, window= 5, min_count= 1, workers= 4)

model.train(sentences,total_examples=len(sentences),epochs=10)



ModuleNotFoundError: No module named 'gensim'

In [None]:
from parse import *
from query import QueryProcessor
import operator

def main():
	qp = QueryParser(filename='../text/queries.txt')
	cp = CorpusParser(filename='../text/corpus.txt')
	qp.parse()
	queries = qp.get_queries()
	cp.parse()
	corpus = cp.get_corpus()
	proc = QueryProcessor(queries, corpus)
	results = proc.run()
	qid = 0
	for result in results:
		sorted_x = sorted(result.items(), key=operator.itemgetter(1))
		sorted_x.reverse()
		index = 0
		for i in sorted_x[:100]:
			tmp = (qid, i[0], index, i[1])
			print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
			index += 1
		qid += 1


if __name__ == '__main__':
	main()