In [None]:
import time
import torch
import os
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline


In [None]:
# Random seed for the notebook; meant for the sampling / shuffling steps.
ran_state = 42

# Every path is derived from the directory the kernel is running in.
current_dir = os.getcwd()
data_dir = f'{current_dir}/dataset'
dl_folder = f'{current_dir}/dataset/dl'
destination_folder = f'{current_dir}/results'
filenames = [
    'kant.txt', 
    'aristotle.txt', 
    'plato.txt', 
    'hume.txt',
    'nietzsche.txt'
    ]

# Full paths of the raw text files. The list used to be built and thrown
# away (a bare expression in the middle of a cell displays nothing), so it is
# now bound to a name.
file_paths = [os.path.join(data_dir, file) for file in filenames]
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
csv_file = 'sentences.csv'
data_csv = os.path.join(data_dir, csv_file)
# Shuffle all rows. The shuffle is seeded with the notebook-wide `ran_state`
# so that "Restart & Run All" always selects the same subset below.
philo_df = pd.read_csv(data_csv).sample(frac=1, random_state=ran_state)
# philo_df = philo_df.loc[philo_df.author=='Nietzsche']
# Keep an untouched copy of the full shuffled corpus.
philo_all_df = philo_df.copy()
# Work on the first third of the shuffled rows only.
philo_df = philo_df.iloc[:len(philo_df)//3]
# reset_index() without drop=True keeps the pre-shuffle row number in an
# 'index' column.
philo_df = philo_df.reset_index()
philo_df.head()

In [None]:
# Boolean frame marking the missing values of every column.
df_nan = philo_df.isna()
# print(df_nan.sum())

# print(philo_df.describe())

# Approximate sentence length by counting spaces. The column must be called
# 'word_count' because that is the name the plot below reads; it used to be
# written as 'word_counter', which made the groupby raise a KeyError.
philo_df['word_count'] = philo_df['sentence'].apply(lambda x: x.count(' '))


# Mean sentence length per author.
philo_df.groupby('author')['word_count'].mean().plot.bar()
plt.show()
# Number of non-null labels per author.
philo_df.groupby('author')['label'].count().plot.bar()
plt.show()

# Number of rows per author (nulls included).
philo_df.groupby('author')['label'].size().plot.bar()
plt.show()

philo_df.groupby('author')['label'].mean()

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

first_n_words = 200

def trim_string(x):
    """Return the first `first_n_words` whitespace-separated words of `x`,
    re-joined with single spaces."""
    # maxsplit leaves everything past the limit in one trailing chunk,
    # which is discarded right after.
    words = x.split(maxsplit=first_n_words)
    del words[first_n_words:]
    return ' '.join(words)

# All patterns are raw strings so the backslashes reach the regex engine
# untouched; the previous non-raw literals contained invalid escape
# sequences such as "\[" and "\|", which Python warns about.
# The compiled patterns themselves are unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators turned into spaces
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # anything outside the allowed set
PATTERN_S = re.compile(r"'s")                          # possessive 's
PATTERN_RN = re.compile(r"\r\n")                       # CR LF line breaks
PATTERN_PUNC = re.compile(r"[^\w\s]")                  # punctuation
# NLTK's English stop-word list, held as a set for membership tests.
STOPWORDS = set(stopwords.words('english'))
# Despite the name this is a length in characters: clean_text keeps a token
# only when len(token) >= MIN_WORDS.
MIN_WORDS = 2

def clean_text(text):
    """
        Normalise one sentence before vectorisation.

        The text is lower-cased; possessive 's, CR/LF pairs, punctuation,
        other symbols and digits are replaced by spaces; English stop words
        and tokens shorter than MIN_WORDS characters are dropped.

        text: a string

        return: the cleaned string (possibly empty)
    """
    text = text.lower()
    text = PATTERN_S.sub(' ', text)
    text = PATTERN_RN.sub(' ', text)
    text = PATTERN_PUNC.sub(' ', text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    # The former `text.replace('x', ' ')` step is gone on purpose: it blanked
    # every letter "x" and so split words such as "existence" into "e istence".
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r"\d+", " ", text)
    # Drop stop words and tokens shorter than MIN_WORDS characters.
    kept = [w for w in text.split() if w not in STOPWORDS and len(w) >= MIN_WORDS]
    return " ".join(kept)


In [None]:
from nltk.stem.porter import *

# for tokenizer
import nltk 
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

# Processing the data
# Keep only rows whose sentence has at least 3 characters. A missing sentence
# compares as False here and is dropped as well, so clean_text below never
# receives a NaN (the previous `< 3` drop let NaN rows through). A filtered
# copy replaces the in-place drop.
philo_df = philo_df[philo_df['sentence'].str.len() >= 3].copy()

# Normalise the text; 'sentence' is then overwritten with the cleaned text
# restricted to ASCII letters and digits.
philo_df['clean_sentence'] = philo_df['sentence'].apply(clean_text)
philo_df['sentence'] = philo_df['clean_sentence'].apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))

# trim 
# philo_df['trim_sentence'] = philo_df['sentence'].apply(trim_string)

# tokenized
philo_df['token_sentence'] = philo_df['clean_sentence'].apply(word_tokenize)

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# NOTE: despite its name, `stemmer` is a WordNet lemmatizer; the name is kept
# because it is a notebook-level variable.
stemmer = WordNetLemmatizer()
philo_df["token_sentence"] = philo_df['token_sentence'].apply(lambda x: [stemmer.lemmatize(w) for w in x])

# Two-column view (cleaned sentence + label).
mini_philo_df = philo_df[['sentence', 'label']]


In [None]:
philo_df

In [None]:
import spacy
# !python -m spacy download en_core_web_lg

nlp = spacy.load("en_core_web_lg")

# Run the corpus through spaCy with nlp.pipe, which processes the texts as a
# batched stream instead of one nlp() call per row. The explicit index keeps
# each Doc on its own row even though the frame index is no longer contiguous.
spacy_docs = list(nlp.pipe(philo_df['sentence'].str.lower()))
philo_df['spacy_sentence'] = pd.Series(spacy_docs, index=philo_df.index, dtype=object)

# One vector per sentence, taken from Doc.vector. The former
# `philo_df['spacy_vec'] = philo_df['spacy_vec'].to_numpy()` line assigned the
# column to itself and has been removed.
philo_df['spacy_vec'] = philo_df['spacy_sentence'].apply(lambda doc: np.array(doc.vector))

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

def decompose_embed(X):
    """Reduce `X` to two dimensions with PCA.

    X: the data handed to PCA.fit_transform (presumably one embedding per row).
    return: whatever fit_transform returns for a 2-component PCA.
    """
    return PCA(n_components=2).fit_transform(X)
    

def show_embedding(X, aut, sentence):
    """Scatter-plot a 2-D projection of embeddings, coloured by author.

    X: array with at least two columns; column 0 is plotted on x, column 1 on y.
    aut: one author label per row of X, used for the point colour.
    sentence: one text per row of X, shown as the hover label of each point.
    """
    # Plot the argument X. The previous body read a global named `y` and
    # therefore only worked when the caller happened to define one.
    fig = px.scatter(x=X[:, 0], y=X[:, 1], color=aut, hover_name=sentence)
    fig.update_layout(title='Word embedding')

    fig.show()
    


In [None]:
philo_df

In [None]:
# Show up to 500 characters per cell so long sentences stay readable.
pd.set_option('display.max_colwidth', 500)


# Version 1: TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Fit a TF-IDF model (English stop words removed) on the cleaned sentences.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_mat = vectorizer.fit_transform(philo_df['sentence'].to_numpy())
# Pairwise cosine similarity of every sentence with every other one
# (Y defaults to X when omitted).
# NOTE(review): this is an n_sentences x n_sentences matrix and no function
# visible in this notebook reads it - confirm it is still needed.
cosine_sim_mat = cosine_similarity(tfidf_mat)

print(tfidf_mat.shape)

In [None]:
# Plot a small random sample: at most 100 sentences (fewer when the frame is
# smaller), drawn with the notebook-wide seed instead of a second literal 42.
sub_df = philo_df.sample(n=min(100, len(philo_df)), random_state=ran_state)
# Stack the per-row vectors into one 2-D matrix; they are already numpy
# arrays, so the earlier re-wrapping of the column was redundant.
X = np.stack(sub_df['spacy_vec'].to_numpy())
y = decompose_embed(X)
aut_list = sub_df.author.values
sent_list = sub_df.sentence.values
print(aut_list)
show_embedding(y, aut_list, sent_list)

In [None]:
## Recommendation

# V1: TF-IDF recommendation matching

In [None]:
# print(tfidf_mat.shape) 
# print(cosine_sim_mat.shape)

def get_recommendations_v1(sentence, series, top_n=3):
    """Recommend corpus sentences for `sentence` using TF-IDF similarity.

    Every token of the query is vectorised on its own and matched to its most
    similar corpus sentence; the matches are then ranked by that similarity.

    sentence: the query text.
    series: unused; kept so existing calls keep working.
    top_n: maximum number of recommendations (default 3, as before).
    return: numpy array of row positions in philo_df, best match first.
    """
    tokens = [str(tok) for tok in nlp(sentence)]
    if not tokens:
        # Nothing to match; cosine_similarity rejects an empty query.
        return np.array([], dtype=int)

    # using tfidf
    vec = vectorizer.transform(tokens)
    sim = cosine_similarity(vec, tfidf_mat)  # one row per query token
    best_doc = sim.argmax(axis=1)            # best corpus row for each token
    best_score = sim.max(axis=1)             # and how similar it is

    # Rank the tokens by similarity score. The previous code sorted the
    # document indices themselves and treated index 0 as "no match".
    order = np.argsort(best_score)[::-1]
    order = order[best_score[order] > 0]     # drop tokens with no match at all
    # Remove duplicates while preserving the ranking.
    ranked_docs = list(dict.fromkeys(best_doc[order].tolist()))
    best_index = np.array(ranked_docs[:top_n], dtype=int)

    print(philo_df[['sentence', 'author']].iloc[best_index])
    return best_index


mat = get_recommendations_v1('Can I eat a hot soup tonight?', philo_df['sentence'])


# V2 spacy vectors

In [None]:
def get_recommendations_v2(sentence, series, top_n=3):
    """Recommend corpus sentences for `sentence` using spaCy vectors.

    Every query token vector is matched to the most similar sentence vector in
    philo_df['spacy_vec']; the matches are then ranked by that similarity.

    sentence: the query text.
    series: unused; kept so existing calls keep working.
    top_n: maximum number of recommendations (default 3, as before).
    return: numpy array of row positions in philo_df, best match first.
    """
    vec = np.array([tok.vector for tok in nlp(sentence)])
    if vec.size == 0:
        # Nothing to match; cosine_similarity rejects an empty query.
        return np.array([], dtype=int)

    data_vec = np.stack(philo_df['spacy_vec'].to_numpy())
    sim = cosine_similarity(vec, data_vec)   # one row per query token
    best_doc = sim.argmax(axis=1)            # best corpus row for each token
    best_score = sim.max(axis=1)             # and how similar it is

    # Rank the tokens by similarity score. The previous code sorted the
    # document indices themselves and treated index 0 as "no match".
    order = np.argsort(best_score)[::-1]
    order = order[best_score[order] > 0]     # drop tokens with no match at all
    # Remove duplicates while preserving the ranking.
    ranked_docs = list(dict.fromkeys(best_doc[order].tolist()))
    best_index = np.array(ranked_docs[:top_n], dtype=int)

    print(philo_df[['sentence', 'author']].iloc[best_index])
    return best_index


mat = get_recommendations_v2('Can I eat a hot soup tonight?', philo_df['sentence'])


# V3 Word2Vec

In [None]:
# Version #3 with word2vec gensim
# missing keys (common words)

from gensim.models import KeyedVectors
import gensim.downloader
import gensim.downloader as api

import nltk
nltk.download('punkt')

# Names of the pretrained models known to gensim's downloader.
print(list(api.info()['models']))

# Alternative that was tried: 'word2vec-google-news-300'.
# Load the 'glove-wiki-gigaword-300' vectors through the downloader.
glove_vectors = api.load('glove-wiki-gigaword-300')

In [None]:
from gensim.models import Word2Vec

# gensim expects every sentence as a list of *string* tokens. Feeding spaCy
# Docs made Token objects the vocabulary keys, and feeding raw strings to
# build_vocab/train made gensim iterate over single characters.
w2v_corpus = [[tok.text for tok in doc] for doc in philo_df['spacy_sentence']]

word2vec_model = Word2Vec(vector_size=300, window=5, min_count=1, workers=2)
word2vec_model.build_vocab(w2v_corpus)
print(word2vec_model.wv)
# word2vec_model.intersect_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', lockf=0.0,binary=True)
# total_examples must be the number of sentences; build_vocab stores it in
# corpus_count (it used to be hard-coded to 2).
word2vec_model.train(w2v_corpus, total_examples=word2vec_model.corpus_count, epochs=2)
print(word2vec_model.wv)
word2vec_model.save('my_gensim_word2vec.bin')

In [None]:
# Access vectors for specific words with a keyed lookup:
# philo_df['gensim_vec'] = philo_df['spacy_sentence'].apply(lambda x: [word2vec_model.wv[elt.text] for elt in x])
# Preview only: printing the whole corpus and the whole vocabulary floods the
# notebook output.
print(philo_df.spacy_sentence.values[:5])
print(len(word2vec_model.wv.key_to_index), 'vocabulary entries, e.g.',
      list(word2vec_model.wv.key_to_index)[:20])
# word2vec_model = model.wv.get_vecattr("rock", "count")  # 👍
# word2vec_model = len(model.wv)  # 

In [None]:
def get_recommendations_v3(sentence, series, top_n=3):
    """Recommend corpus sentences for `sentence` using the Word2Vec model.

    Each corpus sentence is represented by the mean Word2Vec vector of its
    tokens, so query and corpus live in the same vector space (the previous
    body compared against the spaCy vectors and looked words up in `model`,
    which is not defined when this cell runs). Every known query token is
    matched to its most similar sentence and matches are ranked by similarity.

    sentence: the query text.
    series: unused; kept so existing calls keep working.
    top_n: maximum number of recommendations (default 3, as before).
    return: numpy array of row positions in philo_df, best match first.
    """
    wv = word2vec_model.wv

    def mean_vector(words):
        # Average of the known word vectors; all zeros when none is known.
        known = [wv[w] for w in words if w in wv]
        if not known:
            return np.zeros(wv.vector_size, dtype=np.float32)
        return np.mean(known, axis=0)

    # The corpus was lower-cased before tokenisation, so do the same here and
    # skip words the model has never seen (a plain lookup raises KeyError).
    query_words = [tok.text for tok in nlp(sentence.lower()) if tok.text in wv]
    if not query_words:
        print('no query word is in the Word2Vec vocabulary')
        return np.array([], dtype=int)

    vec = np.array([wv[w] for w in query_words])
    data_vec = np.array([mean_vector([tok.text for tok in doc])
                         for doc in philo_df['spacy_sentence']])
    sim = cosine_similarity(vec, data_vec)   # one row per query word
    best_doc = sim.argmax(axis=1)            # best corpus row for each word
    best_score = sim.max(axis=1)             # and how similar it is

    # Rank by similarity score, not by document index.
    order = np.argsort(best_score)[::-1]
    order = order[best_score[order] > 0]
    # Remove duplicates while preserving the ranking.
    ranked_docs = list(dict.fromkeys(best_doc[order].tolist()))
    best_index = np.array(ranked_docs[:top_n], dtype=int)

    print(philo_df[['sentence', 'author']].iloc[best_index])
    return best_index

mat = get_recommendations_v3('I can\'t wait seeing you again', philo_df['sentence'])


In [None]:
# Version 4 Doc2Vec
# import
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# TaggedDocument needs a list of string tokens. Passing the spaCy Doc itself
# made Token objects the vocabulary, so string queries never matched.
# The tag is the row position, which is what .iloc expects later on.
tagged_data = [
    TaggedDocument([tok.text for tok in doc], [i])
    for i, doc in enumerate(philo_df['spacy_sentence'].values)
]

# Training Doc2Vec
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)


In [None]:
print(tagged_data[0])
# `dv` is the gensim 4 name of the document vectors (`docvecs` is deprecated).
print(model.dv)

def get_recommendations_v4(sentence, series, top_n=3):
    """Recommend corpus sentences for `sentence` using the Doc2Vec model.

    sentence: the query text.
    series: unused; kept so existing calls keep working.
    top_n: number of recommendations (default 3, as before).
    return: numpy array of row positions in philo_df, best match first.
    """
    # Tokenise the query the way the corpus was tokenised (lower-cased text
    # through spaCy); a plain split() keeps punctuation glued to the words.
    words = [tok.text for tok in nlp(sentence.lower())]
    vec = model.infer_vector(words)
    results = model.dv.most_similar(positive=[vec], topn=top_n)
    # The tags are the row positions assigned when tagged_data was built.
    best_index = np.array([tag for tag, _score in results], dtype=int)
    print(philo_df[['sentence', 'author']].iloc[best_index])
    return best_index

# Trailing semicolon: the result is already printed inside the function.
get_recommendations_v4('what is virtue?', philo_df['sentence']);


In [None]:
# Version 5
from sentence_transformers import SentenceTransformer
# Load the pretrained 'bert-base-nli-mean-tokens' SentenceTransformer model.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Encode the whole corpus in one batched call instead of one encode() per row.
# The explicit index keeps every vector on its own row.
corpus_texts = [doc.text for doc in philo_df['spacy_sentence']]
philo_df['bert_vec'] = pd.Series(list(sbert_model.encode(corpus_texts)),
                                 index=philo_df.index, dtype=object)

def get_recommendations_v5(sentence, series, top_n=3):
    """Recommend corpus sentences for `sentence` using Sentence-BERT vectors.

    sentence: the query text.
    series: unused; kept so existing calls keep working.
    top_n: number of recommendations (default 3).
    return: numpy array of row positions in philo_df, best match first.
    """
    # Use sbert_model (the plain `model` name belongs to other cells) and keep
    # the query 2-D, as cosine_similarity requires.
    vec = sbert_model.encode([sentence])
    data_vec = np.stack(philo_df['bert_vec'].to_numpy())
    scores = cosine_similarity(vec, data_vec)[0]
    # There is a single query vector, so rank all sentences by their score.
    best_index = np.argsort(scores)[::-1][:top_n]
    print(philo_df[['sentence', 'author']].iloc[best_index])
    return best_index


# This cell used to redefine get_recommendations_v4; the old name is still
# bound here so that existing calls resolve exactly as they did before.
get_recommendations_v4 = get_recommendations_v5


In [None]:
# V6 Torch bert transformers
import torch
# Fetch the 'bert-base-uncased' tokenizer (vocabulary) and model (weights and
# configuration) through torch.hub, in that order.
# Note: this rebinds `model`, the name an earlier cell uses for Doc2Vec.
tokenizer, model = (
    torch.hub.load('huggingface/pytorch-transformers', kind, 'bert-base-uncased')
    for kind in ('tokenizer', 'model')
)
# model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
# assert model.config.output_attentions == True


In [None]:
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
# Encode both sentences as one pair, with the special tokens added.
indexed_tokens = tokenizer.encode(text_1, text_2, add_special_tokens=True)

# Segment ids were never defined, which made this cell fail with a NameError.
# Derive them from the encoding: 0 up to and including the first separator
# token (sentence A), 1 for everything after it (sentence B).
first_sep = indexed_tokens.index(tokenizer.sep_token_id)
segments_ids = [0] * (first_sep + 1) + [1] * (len(indexed_tokens) - first_sep - 1)

# Convert inputs to PyTorch tensors
segments_tensors = torch.tensor([segments_ids])
tokens_tensor = torch.tensor([indexed_tokens])

# Reuse the 'bert-base-uncased' model loaded in the previous cell. Reloading
# 'bert-base-cased' here paired a cased model with the uncased tokenizer's ids.
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
# Take the first output by position instead of tuple-unpacking the result.
encoded_layers = outputs[0]

In [None]:
# Version 6: InferSent
# thank you https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/

In [None]:
# Download the InferSent encoder and the GloVe vectors it relies on.
# `mkdir -p` and `unzip -n` let the cell be re-run: an existing directory is
# not an error and files that are already extracted are left alone.
! mkdir -p encoder
! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl

! mkdir -p GloVe
! curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
! unzip -n GloVe/glove.840B.300d.zip -d GloVe/

In [None]:
from models import InferSent
import torch

V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
model_infersen = InferSent(params_model)
# map_location='cpu' lets the checkpoint load on machines without a GPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model_infersen.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))

# Same relative location the download cell unzips into. The previous
# '/content/...' path only exists when the working directory is /content.
W2V_PATH = 'GloVe/glove.840B.300d.txt'
model_infersen.set_w2v_path(W2V_PATH)

In [None]:
# InferSent works on plain strings, so hand it the text of each spaCy Doc
# rather than the Doc objects themselves.
infersen_sentences = [doc.text for doc in philo_df['spacy_sentence']]
model_infersen.build_vocab(infersen_sentences, tokenize=True)
# Encode with the InferSent model in one batched call. The previous line
# called `model.encode`, but `model` is the BERT model from an earlier cell.
infersen_mat = model_infersen.encode(infersen_sentences, tokenize=True)

In [None]:
def get_recommendations_v6(sentence, series, top_n=3):
    """Recommend corpus sentences for `sentence` using InferSent embeddings.

    sentence: the query text.
    series: unused; kept so existing calls keep working.
    top_n: number of recommendations (default 3).
    return: numpy array of row positions in philo_df, best match first.
    """
    # Keep the query 2-D (one row), as cosine_similarity requires.
    vec = np.asarray(model_infersen.encode([sentence], tokenize=True)).reshape(1, -1)
    scores = cosine_similarity(vec, infersen_mat)[0]
    # There is a single query vector, so rank all sentences by their score.
    best_index = np.argsort(scores)[::-1][:top_n]
    print(philo_df[['sentence', 'author']].iloc[best_index])
    return best_index


# This cell used to redefine get_recommendations_v4; the old name is still
# bound here so that existing calls resolve exactly as they did before.
get_recommendations_v4 = get_recommendations_v6