In [1]:
from gensim.models import FastText
import pandas as pd
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer
import numpy as np
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
## read the data
opted_data = pd.read_pickle('datasets/Opted/opted.pkl')
wordnet_data = pd.read_pickle('datasets/WordNet/wordnet.pkl')
combined_data = pd.concat([opted_data, wordnet_data])

In [3]:
fastext_df = combined_data[['word', 'definition']]
fastext_df['modified_sentence'] = combined_data['word'] + " " + combined_data['definition']


In [4]:
def preprocess_text(sentence, stemmer):
        # Remove all the special characters
        sentence = re.sub(r'\W', ' ', str(sentence))

        # remove all single characters
        sentence = re.sub(r'\s+[a-zA-Z]\s+', ' ', sentence)

        # Remove single characters from the start
        sentence = re.sub(r'\^[a-zA-Z]\s+', ' ', sentence)

        # Substituting multiple spaces with single space
        sentence = re.sub(r'\s+', ' ', sentence, flags=re.I)

        # Removing prefixed 'b'
        sentence = re.sub(r'^b\s+', '', sentence)

        # Converting to Lowercase
        sentence = sentence.lower()

        # Lemmatization
        tokens = sentence.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        ##tokens = [word for word in tokens if len(word) > 3]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

In [5]:
# preprocess and tokenize the data
stemmer = WordNetLemmatizer()
fastext_df['preprocessed_sentence'] = fastext_df.apply(lambda x: preprocess_text(x.modified_sentence, stemmer)\
                                                       , axis=1)
preprocessed_sentence = fastext_df.preprocessed_sentence.tolist()
preprocessed_sentence = [x for x in preprocessed_sentence if x != '']

In [6]:
word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in preprocessed_sentence]

In [7]:
## train fastext

embedding_size = 60
window_size = 40
min_word = 1

model = FastText(word_tokenized_corpus,
                      vector_size=embedding_size,
                      window=window_size,
                      min_count=min_word)  # instantiate

In [8]:
## train test split
from sklearn.model_selection import train_test_split

y = fastext_df['word'].tolist()
X = fastext_df['definition'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=224)

print('{} training samples'.format(len(X_train)))
print('{} validation samples'.format(len(X_test)))

233436 training samples
58360 validation samples


In [9]:
## top 10 accuracy
top10_accuracy_df = pd.DataFrame()
for index in range(0, len(X_test)):
    sent = preprocess_text(X_test[index], stemmer)
    tokenized_sentence = word_punctuation_tokenizer.tokenize(sent)
    try:
        prediction_tuples = model.wv.most_similar(positive=tokenized_sentence)
        predictions = []
        for prediction, score in prediction_tuples:
            predictions.append(prediction)
        if y_test[index] in predictions:
            top10_accuracy_df.loc[index, 'in_top_10'] = 1
        else:
            top10_accuracy_df.loc[index, 'in_top_10'] = 0  
    except:
        top10_accuracy_df.loc[index, 'in_top_10'] = np.nan 

In [10]:
top10_accuracy_df.dropna(inplace = True)

In [12]:
top10_accuracy_df.sum()/top10_accuracy_df.shape[0]

in_top_10    0.015123
dtype: float64

In [20]:
## top 100 accuracy


top100_accuracy_df = pd.DataFrame()
for index in range(0, len(X_test)):
    sent = preprocess_text(X_test[index], stemmer)
    tokenized_sentence = word_punctuation_tokenizer.tokenize(sent)
    try:
        prediction_tuples = model.wv.most_similar(positive=tokenized_sentence, topn = 100)
        predictions = []
        for prediction, score in prediction_tuples:
            predictions.append(prediction)
        if y_test[index] in predictions:
            top100_accuracy_df.loc[index, 'in_top_100'] = 1
        else:
            top100_accuracy_df.loc[index, 'in_top_100'] = 0  
    except:
        top100_accuracy_df.loc[index, 'in_top_100'] = np.nan 
top100_accuracy_df.dropna(inplace = True)

In [21]:
top100_accuracy_df.sum()/top100_accuracy_df.shape[0]

in_top_100    0.042883
dtype: float64

In [22]:
## top 1 accuracy
top1_accuracy_df = pd.DataFrame()
for index in range(0, len(X_test)):
    sent = preprocess_text(X_test[index], stemmer)
    tokenized_sentence = word_punctuation_tokenizer.tokenize(sent)
    try:
        prediction_tuples = model.wv.most_similar(positive=tokenized_sentence, topn = 1)
        predictions = []
        for prediction, score in prediction_tuples:
            predictions.append(prediction)
        if y_test[index] in predictions:
            top1_accuracy_df.loc[index, 'in_top_1'] = 1
        else:
            top1_accuracy_df.loc[index, 'in_top_1'] = 0  
    except:
        top1_accuracy_df.loc[index, 'in_top_1'] = np.nan 
top1_accuracy_df.dropna(inplace = True)
top1_accuracy_df.sum()/top1_accuracy_df.shape[0]

in_top_1    0.004441
dtype: float64