# École Polytechnique de Montréal
# Département Génie Informatique et Génie Logiciel



## Équipe et contributions 
Veuillez indiquer la contribution effective de chaque membre de l'équipe en pourcentage, en précisant les modules ou les questions sur lesquels chaque membre a travaillé.

Cedric Sadeu (1869737): 1/3

Mamoudou Sacko (1924187): 1/3

Oumayma Messoussi (2016797): 1/3

# Librairies externes

In [46]:
import io
import os
import nltk
import time
import sklearn
import zipfile
import requests
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
from collections import Counter, defaultdict
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import euclidean, cosine
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import TruncatedSVD

# Fetch the NLTK resources used by the Preprocess class below:
# the English stop-word list and the WordNet data for the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mamoudou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mamoudou\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import warnings
# Silence deprecation, future and user warnings for the rest of the session
# so that library notices do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

## Lecture

In [47]:
def read_data(path: str) -> Tuple[List[int], List[str]]:
    """Load the paragraph corpus from a CSV file.

    Args:
        path: Location of a CSV file containing ``id`` and ``paragraph``
            columns.

    Returns:
        A pair ``(ids, paragraphs)`` holding the two columns as plain lists,
        in file order.
    """
    frame = pd.read_csv(path)
    return frame["id"].tolist(), frame["paragraph"].tolist()

def read_questions(path: str) -> Tuple[List[int], List[str], List[int], List[str]]:
    """Load the question set from a CSV file.

    Args:
        path: Location of a CSV file containing the columns ``id``,
            ``question``, ``paragraph_id`` and ``answer``.

    Returns:
        A 4-tuple ``(ids, questions, paragraph_ids, answers)``; each element
        is the corresponding column converted to a list, in file order.
    """
    frame = pd.read_csv(path)
    wanted = ("id", "question", "paragraph_id", "answer")
    return tuple(frame[column].tolist() for column in wanted)

def read_questions_vectors(path: str) -> Tuple[List[int], List[str], list]:
    """Load questions together with their stored vector representation.

    Args:
        path: Location of a CSV file containing the columns ``id``,
            ``question`` and ``question_vector``.

    Returns:
        A 3-tuple ``(ids, questions, questions_vectors)``; each element is
        the corresponding column converted to a list, in file order. The
        vector column is returned exactly as pandas parsed it (no decoding
        is performed here).
    """
    data = pd.read_csv(path)
    ids = data["id"].tolist()
    questions = data["question"].tolist()
    questions_vectors = data["question_vector"].tolist()
    return ids, questions, questions_vectors

def save_to_csv(path: str, corpus):
    """Write the first rows of ``corpus`` to a CSV file in the output folder.

    Args:
        path: File name, joined onto the module-level ``output_path``.
        corpus: Mapping of column name to column values; the key order
            defines the column order.

    Note:
        Only the rows returned by ``DataFrame.head()`` (the first five by
        default) are written, with a header row and without the index.
    """
    column_order = list(corpus.keys())
    preview = pd.DataFrame(corpus, columns=column_order).head()
    destination = os.path.join(output_path, path)
    preview.to_csv(destination, index=False, header=True)


In [48]:
# Input and output folders; output_path is read by save_to_csv().
data_path = "data"
output_path = "output"

# train_data is the (ids, paragraphs) pair returned by read_data();
# train_ids is the (ids, questions, paragraph_ids, answers) tuple returned
# by read_questions().
train_data = read_data(os.path.join(data_path, "corpus.csv"))
train_ids = read_questions(os.path.join(data_path, "train_ids.csv"))


# Collapse every run of whitespace to a single space and lowercase the text.
paragraphs = [" ".join(sentence.split()).lower() for sentence in train_data[1]]
questions = [" ".join(sentence.split()).lower() for sentence in train_ids[1]]


## Prétraitement

In [49]:
class Preprocess(object):
    """Tokenise, filter and optionally lemmatise a collection of documents.

    Attributes are set in ``__init__``:
      * ``stopwords`` - set of NLTK English stop words removed from the text.
      * ``lemmatize`` - whether ``preprocess_pipeline`` lemmatises the tokens.
    """

    def __init__(self, lemmatize=True):
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.lemmatize = lemmatize

    def preprocess_pipeline(self, data):
        """Clean every document of ``data`` and return one token list each.

        Lemmatisation is applied after cleaning when ``self.lemmatize`` is
        truthy.
        """
        tokens_per_doc = self._clean_doc(data)
        return self._lemmatize(tokens_per_doc) if self.lemmatize else tokens_per_doc

    def _clean_doc(self, data):
        """Split each document on ``\\w+`` and keep only useful tokens.

        A token is kept (lowercased) when its lowercase form is not a stop
        word, it is longer than one character and it is purely alphabetic.
        """
        splitter = nltk.tokenize.RegexpTokenizer(r"\w+")
        cleaned = []
        for document in data:
            kept = []
            for raw in splitter.tokenize(document):
                lowered = raw.lower()
                if lowered in self.stopwords:
                    continue
                if len(raw) <= 1 or not raw.isalpha():
                    continue
                kept.append(lowered)
            cleaned.append(kept)
        return cleaned

    def _lemmatize(self, data):
        """Replace every token by its WordNet lemma, document by document."""
        to_lemma = nltk.stem.WordNetLemmatizer().lemmatize
        return [list(map(to_lemma, document)) for document in data]

    def convert_to_reviews(self, tokenized_reviews):
        """Join each token list back into a single space-separated string."""
        return [" ".join(tokens) for tokens in tokenized_reviews]

In [50]:
# Preprocessor with the default setting (lemmatisation enabled).
pre = Preprocess()

# One list of cleaned tokens per paragraph / question.
paragraphs_tokenized = pre.preprocess_pipeline(paragraphs)
questions_tokenized = pre.preprocess_pipeline(questions)

# Re-join the tokens into space-separated strings, the input format expected
# by the vectorizer helpers defined below.
paragraphs_text = [" ".join(sentence) for sentence in paragraphs_tokenized]
questions_text = [" ".join(sentence) for sentence in questions_tokenized]



In [98]:
def buildVocab(X) -> Dict[str, int]:
    """Build the term -> column-index vocabulary of a text collection.

    Args:
        X: Iterable of documents (strings). The text is used as-is
            (``lowercase=False``), so callers should normalise case first.

    Returns:
        The ``vocabulary_`` mapping learned by a ``CountVectorizer`` fitted
        on ``X``.
    """
    # min_df=1 keeps every term that occurs in at least one document, which
    # is what the former integer min_df=0 did; recent scikit-learn releases
    # reject an integer min_df below 1 during parameter validation.
    vectorizer = CountVectorizer(min_df=1, lowercase=False)
    vectorizer.fit(X)
    return vectorizer.vocabulary_

def getTfIdfReprentation(vocab, data, feature = 5) -> object:
    """Return the dense TF-IDF matrix of ``data`` over a fixed vocabulary.

    Args:
        vocab: Vocabulary handed to ``TfidfVectorizer`` (term -> index
            mapping or iterable of terms); it fixes the matrix columns.
        data: Iterable of documents (strings).
        feature: Unused; kept so existing call sites keep working.

    Returns:
        The dense matrix produced by ``todense()``, one row per document and
        one column per vocabulary term.
    """
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    data_tfidf = vectorizer.fit_transform(data)
    # The feature names were fetched but never used; the call is dropped
    # because get_feature_names() no longer exists in scikit-learn >= 1.2.
    return data_tfidf.todense()

def getTfIdfEmbedded(vocab, data, feature = 5) -> object:
    """Return the TF-IDF matrix of ``data`` as a labelled DataFrame.

    Args:
        vocab: Vocabulary handed to ``TfidfVectorizer`` (term -> index
            mapping or iterable of terms); it fixes the columns.
        data: Iterable of documents (strings).
        feature: Unused; kept so existing call sites keep working.

    Returns:
        A ``pandas.DataFrame`` with one row per document and one column per
        vocabulary term, so ``df[term]`` is that term's TF-IDF weight in
        every document.
    """
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    data_tfidf = vectorizer.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = vectorizer.get_feature_names()

    return pd.DataFrame(data_tfidf.toarray(), columns=features)


def get_doc_embedded(X, vocab, embeddings) -> np.ndarray:
    """Embed each document as the mean of the vectors of its known words.

    Args:
        X: Sequence of documents; each is a whitespace-separated string.
        vocab: Container of known words (anything supporting ``in``).
        embeddings: Mapping ``word -> vector`` where every vector has
            ``len(embeddings)`` components (for example the DataFrame
            returned by ``getTfIdfEmbedded``, whose columns are indexed by
            word and whose length is its number of rows).

    Returns:
        Array of shape ``(len(X), len(embeddings))``. A document that
        contains no word of ``vocab`` gets an all-zero row instead of the
        NaN row a 0/0 division would produce.
    """
    dim = len(embeddings)
    X_embedded = np.zeros((len(X), dim), dtype=float)

    for i, doc in enumerate(X):
        known = [word for word in doc.split() if word in vocab]
        if not known:
            # Nothing to average: keep the zero row rather than dividing by 0.
            continue
        vec = np.zeros(dim, dtype=float)
        for word in known:
            vec += np.asarray(embeddings[word], dtype=float)
        X_embedded[i] = vec / len(known)
    return X_embedded


def getMedian(corpus):
    """Return a length threshold taken from the sorted document lengths.

    The lengths ``len(doc)`` are sorted in ascending order and the element
    at index ``floor(2 * n / 3)`` is returned, i.e. roughly the two-thirds
    quantile rather than a true median, despite the function name.

    Raises:
        IndexError: If ``corpus`` is empty.
    """
    lengths = sorted(len(doc) for doc in corpus)
    return lengths[(len(lengths) * 2) // 3]

  
def sklearn_svd(df, k):
    """Reduce ``df`` to ``k`` components with a truncated SVD.

    Args:
        df: Matrix-like input accepted by ``TruncatedSVD.fit_transform``.
        k: Number of components to keep.

    Returns:
        The transformed data returned by ``fit_transform``.
    """
    reducer = TruncatedSVD(n_components=k)
    return reducer.fit_transform(df)

In [53]:
def voisins(word, df, n, distfunc=cosine):
    """Return the ``n`` entries of ``df`` that are nearest to ``word``.

    Args:
        word: Query vector; a 1-D array or a single-row 2-D array.
        df: Mapping ``key -> vector`` (dict or DataFrame) of candidates.
        n: Number of neighbours to return.
        distfunc: Distance function, either ``scipy.spatial.distance.cosine``
            or ``scipy.spatial.distance.euclidean``.

    Returns:
        A pair ``(keys, distances)`` of lists ordered from the nearest
        candidate to the farthest, truncated to ``n`` items.

    Raises:
        AssertionError: If ``distfunc`` is not one of the supported metrics.
    """
    assert distfunc.__name__ == 'cosine' or distfunc.__name__ == 'euclidean', "distance metric not supported"

    # Flatten the inputs: the SciPy distance functions expect 1-D vectors,
    # while get_doc_embedded() yields a (1, d) array for a single document.
    query = np.asarray(word).ravel()
    distances = {
        key: distfunc(query, np.asarray(df[key]).ravel())
        for key in df
    }

    # Both supported metrics are distances (SciPy's cosine is 1 - similarity),
    # so the nearest neighbours always have the smallest values.
    ranked = sorted(distances.items(), key=lambda item: item[1])[:n]

    return [key for key, _ in ranked], [dist for _, dist in ranked]

In [113]:
# Query to look up: run it through the same preprocessing as the corpus.
new_question = ["Did von Neumann rule hidden "]
new_question_tokenized = pre.preprocess_pipeline(new_question)
new_question_text = [" ".join(sentence) for sentence in new_question_tokenized]

# NOTE(review): questions_vocab is not defined in the visible cells;
# presumably it comes from buildVocab(questions_text) - confirm.
embeddings = getTfIdfEmbedded(questions_vocab, questions_text) 
new_question_tfidf = get_doc_embedded(new_question_text, questions_vocab, embeddings)


# NOTE(review): `questions` is rebound to five literal questions here while
# questions_text was derived from the earlier `questions`; the pairing in the
# loop below assumes both have the same order and length - verify.
questions = ["Who leaders the sub-divisions of offices or divisions?", "Besides using 3kV DC what other power type is used in the former Soviet Union countries?", "	How many other cities had populations larger than 40,000 by 1500?", "Did von Neumann rule hidden variable theories?", "	The OSHA claimed that the Tajik government censored what?"]
questions_tfidf = get_doc_embedded(questions_text, questions_vocab, embeddings)


# Optional dimensionality reduction, currently disabled:
#questions_tfidf_r = sklearn_svd(questions_tfidf, len(embeddings))
# Map each raw question string to its embedded vector.
dic_questions = {}
for i, ids in enumerate(questions) :
    dic_questions[questions[i]] = questions_tfidf[i]


print(questions_tfidf)
print(new_question_tfidf)

# Three candidate questions ranked by cosine distance to the query.
print(voisins(new_question_tfidf, dic_questions, 3, distfunc=cosine))


[[0.52915026 0.         0.         0.         0.        ]
 [0.         0.31622777 0.         0.         0.        ]
 [0.         0.         0.5        0.         0.        ]
 [0.         0.         0.         0.40824829 0.        ]
 [0.         0.         0.         0.         0.4472136 ]]
[[0.         0.         0.         0.40824829 0.        ]]
(['Did von Neumann rule hidden variable theories?', 'Who leaders the sub-divisions of offices or divisions?', 'Besides using 3kV DC what other power type is used in the former Soviet Union countries?'], [0.0, 1.0, 1.0])
