# EE461P Term Project: Compare Embedding Coverage

### Embeddings
External data sources are not allowed for this competition. We are, though, providing a number of word embeddings along with the dataset that can be used in the models. These are as follows:

GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/

glove.840B.300d - https://nlp.stanford.edu/projects/glove/

paragram_300_sl999 - https://cogcomp.org/page/resource_view/106

wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score, train_test_split, cross_val_predict, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, ExtraTreesClassifier
import xgboost as xgb
from sklearn import preprocessing
from nltk.corpus import stopwords
import string
import pprint
import operator 
from tqdm import tqdm

%matplotlib inline

In [2]:
# Load the data
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

train_qid = train['qid']
test_qid  = test['qid']
y_train = train['target']

train.drop("qid", axis = 1, inplace = True)
test.drop("qid", axis = 1, inplace = True)

train.head()

Unnamed: 0,question_text,target
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0


## Data Preprocessing:

In [3]:
# Concat training and set set for data exploration and feature engineering
all_data = pd.DataFrame(pd.concat((train['question_text'], test['question_text'])))

# manipulate all_data for any feature engineering, unless you need to compare distributions of train vs test data.
print(all_data.shape) 

(1362492, 1)


### Functions for checking coverage

In [4]:
# Counts occurance of words in dataset vocabulary
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [5]:
# checks the intersection between vocabulary and the embeddings
# returns a list of words that can be used to improve preprocessing
def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.3%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.3%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [6]:
from gensim.models import KeyedVectors

# loads embedding from file
def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    if file == '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin':
        embeddings_index = KeyedVectors.load_word2vec_format(file, binary=True)
    elif file == '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec':
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file) if len(o)>100)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
    return embeddings_index

### Load embeddings

In [7]:
glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
print("Extracting GloVe embedding")
embed_glove = load_embed(glove)

Extracting GloVe embedding


In [8]:
fasttext = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
print("Extracting FastText embedding")
embed_fasttext = load_embed(fasttext)

Extracting FastText embedding


In [9]:
para = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
print("Extracting Paragram embedding")
embed_para = load_embed(para)

Extracting Paragram embedding


### Create default vocab and check base coverage

In [10]:
sentences = all_data["question_text"]
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

{'How': 273144, 'did': 34918, 'Quebec': 102, 'nationalists': 97, 'see': 9397}


In [11]:
print("Glove Coverage: ")
oov_glove = check_coverage(vocab, embed_glove)
print("Fasttext Coverage: ")
oov_fasttext = check_coverage(vocab, embed_fasttext)
print("Paragram Coverage: ")
oov_para = check_coverage(vocab, embed_para)

Glove Coverage: 
Found embeddings for 32.773% of vocab
Found embeddings for  88.149% of all text
Fasttext Coverage: 
Found embeddings for 29.774% of vocab
Found embeddings for  87.658% of all text
Paragram Coverage: 
Found embeddings for 19.369% of vocab
Found embeddings for  72.205% of all text


### Clean Contractions

In [12]:
contraction_mapping = {"ain't": "is not", "aren't": "are not",
                       "can't": "cannot", "'cause": "because", 
                       "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", 
                       "don't": "do not", "hadn't": "had not", 
                       "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", 
                       "how'd": "how did", "how'd'y": "how do you", 
                       "how'll": "how will", "how's": "how is",  
                       "I'd": "I would", "I'd've": "I would have", 
                       "I'll": "I will", "I'll've": "I will have",
                       "I'm": "I am", "I've": "I have", "i'd": "i would", 
                       "i'd've": "i would have", "i'll": "i will",  
                       "i'll've": "i will have","i'm": "i am",
                       "i've": "i have", "isn't": "is not", "it'd": "it would", 
                       "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have","it's": "it is", 
                       "let's": "let us", "ma'am": "madam", 
                       "mayn't": "may not", "might've": "might have",
                       "mightn't": "might not","mightn't've": "might not have", 
                       "must've": "must have", "mustn't": "must not", 
                       "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", 
                       "oughtn't": "ought not", "oughtn't've": "ought not have",
                       "shan't": "shall not", "sha'n't": "shall not", 
                       "shan't've": "shall not have", "she'd": "she would",
                       "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", 
                       "should've": "should have", "shouldn't": "should not",
                       "shouldn't've": "should not have", "so've": "so have",
                       "so's": "so as", "this's": "this is",
                       "that'd": "that would", "that'd've": "that would have", 
                       "that's": "that is", "there'd": "there would", 
                       "there'd've": "there would have", "there's": "there is", 
                       "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will",
                       "they'll've": "they will have", "they're": "they are", 
                       "they've": "they have", "to've": "to have", 
                       "wasn't": "was not", "we'd": "we would", 
                       "we'd've": "we would have", "we'll": "we will", 
                       "we'll've": "we will have", "we're": "we are", 
                       "we've": "we have", "weren't": "were not", 
                       "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", 
                       "what've": "what have", "when's": "when is", 
                       "when've": "when have", "where'd": "where did", 
                       "where's": "where is", "where've": "where have", 
                       "who'll": "who will", "who'll've": "who will have", 
                       "who's": "who is", "who've": "who have", 
                       "why's": "why is", "why've": "why have", 
                       "will've": "will have", "won't": "will not", 
                       "won't've": "will not have", "would've": "would have", 
                       "wouldn't": "would not", "wouldn't've": "would not have", 
                       "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have",
                       "y'all're": "you all are","y'all've": "you all have",
                       "you'd": "you would", "you'd've": "you would have", 
                       "you'll": "you will", "you'll've": "you will have", 
                       "you're": "you are", "you've": "you have" }

In [13]:
def known_contractions(embed):
    known = []
    for contract in contraction_mapping:
        if contract in embed:
            known.append(contract)
    return known

In [14]:
print("- Known Contractions -")
print("   Glove :")
print(known_contractions(embed_glove))

- Known Contractions -
   Glove :
["can't", "'cause", "didn't", "doesn't", "don't", "I'd", "I'll", "I'm", "I've", "it's", "ma'am", "o'clock", "that's", "you'll", "you're"]


In [15]:
print("- Known Contractions -")
print("   Fasttext :")
print(known_contractions(embed_fasttext))

- Known Contractions -
   Fasttext :
[]


In [16]:
print("- Known Contractions -")
print("   Paragram :")
print(known_contractions(embed_para))

- Known Contractions -
   Paragram :
["can't", "'cause", "didn't", "doesn't", "don't", "i'd", "i'll", "i'm", "i've", "it's", "ma'am", "o'clock", "that's", "you'll", "you're"]


In [17]:
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

In [18]:
#Make lower Case (not sure if this is needed)
all_data['lowered_text'] = all_data['question_text'].apply(lambda x: x.lower())

In [19]:
# Map contractions to words
all_data['question_text'] = all_data['question_text'].apply(lambda x: clean_contractions(x, contraction_mapping))

In [20]:
vocab = build_vocab(all_data['question_text'])

print("Glove Coverage: ")
oov_glove = check_coverage(vocab, embed_glove)
print("Fasttext Coverage: ")
oov_fasttext = check_coverage(vocab, embed_fasttext)
print("Paragram Coverage: ")
oov_para = check_coverage(vocab, embed_para)

Glove Coverage: 
Found embeddings for 32.915% of vocab
Found embeddings for  88.392% of all text
Fasttext Coverage: 
Found embeddings for 29.905% of vocab
Found embeddings for  88.229% of all text
Paragram Coverage: 
Found embeddings for 19.451% of vocab
Found embeddings for  72.501% of all text


### Remove non-printable

In [21]:
def remove_non_printable(sentence):
    
    # remove non printable characters
    output = ''.join([x for x in sentence if x in string.printable])
    
    return output

In [22]:
all_data["question_text"] = all_data["question_text"].apply(lambda x: remove_non_printable(x))

In [23]:
vocab = build_vocab(all_data["question_text"])

print("Glove Coverage: ")
oov_glove = check_coverage(vocab, embed_glove)
print("Fasttext Coverage: ")
oov_fasttext = check_coverage(vocab, embed_fasttext)
print("Paragram Coverage: ")
oov_para = check_coverage(vocab, embed_para)

Glove Coverage: 
Found embeddings for 33.589% of vocab
Found embeddings for  88.449% of all text
Fasttext Coverage: 
Found embeddings for 30.290% of vocab
Found embeddings for  88.269% of all text
Paragram Coverage: 
Found embeddings for 19.850% of vocab
Found embeddings for  72.542% of all text


### Clean Punctuation

In [24]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

In [25]:
def unknown_punct(embed, punct):
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown

print("Glove Unknown Punctuation:")
print(unknown_punct(embed_glove, punct))
print()
print("Fasttext Unknown Punctuation:")
print(unknown_punct(embed_fasttext, punct))
print()
print("Paragram Unknown Punctuation:")
print(unknown_punct(embed_para, punct))

Glove Unknown Punctuation:
“ ” ’ ∞ θ ÷ α • à − β ∅ ³ π ‘ ₹ ´ ° £ € × ™ √ ² — – 

Fasttext Unknown Punctuation:
_ ` 

Paragram Unknown Punctuation:
“ ” ’ ∞ θ ÷ α • à − β ∅ ³ π ‘ ₹ ´ ° £ € × ™ √ ² — – 


In [26]:
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "",
                 "€": "e", "™": "tm", "√": " sqrt ", "×": "x", 
                 "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", 
                 '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 
                 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', 
                 '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }

In [27]:
def clean_special_chars(text, punct, mapping):
    # replace special characters with mapping
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    # space out punctuation
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    # Other special characters that I have to deal with in last
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''} 
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

In [28]:
all_data['question_text'] = all_data['question_text'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))

In [29]:
vocab = build_vocab(all_data["question_text"])

print("Glove Coverage: ")
oov_glove = check_coverage(vocab, embed_glove)
print("Fasttext Coverage: ")
oov_fasttext = check_coverage(vocab, embed_fasttext)
print("Paragram Coverage: ")
oov_para = check_coverage(vocab, embed_para)

Glove Coverage: 
Found embeddings for 75.141% of vocab
Found embeddings for  99.595% of all text
Fasttext Coverage: 
Found embeddings for 67.063% of vocab
Found embeddings for  99.441% of all text
Paragram Coverage: 
Found embeddings for 41.175% of vocab
Found embeddings for  83.334% of all text


### Correct Mispellings

In [30]:
mispell_dict = {'colour': 'color',
                'centre': 'center', 
                'favourite': 'favorite', 
                'travelling': 'traveling',
                'counselling': 'counseling', 
                'theatre': 'theater', 
                'cancelled': 'canceled', 
                'labour': 'labor', 
                'organisation': 'organization', 
                'wwii': 'world war two', 
                'citicise': 'criticize', 
                'youtu ': 'youtube ', 
                'Qoura': 'Quora', 
                'sallary': 'salary', 
                'Whta': 'What', 
                'narcisist': 'narcissist', 
                'howdo': 'how do', 
                'whatare': 'what are',
                'howcan': 'how can', 
                'howmuch': 'how much', 
                'howmany': 'how many', 
                'whydo': 'why do', 
                'doI': 'do I', 
                'theBest': 'the best', 
                'howdoes': 'how does', 
                'mastrubation': 'masturbation', 
                'mastrubate': 'masturbate',
                "mastrubating": 'masturbating', 
                'pennis': 'penis', 
                'Etherium': 'Ethereum', 
                'narcissit': 'narcissist',
                'bigdata': 'big data', 
                '2k17': '2017', 
                '2k18': '2018', 
                'qouta': 'quota', 
                'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', 
                "whst": 'what', 
                'watsapp': 'social medium', 
                'demonitisation': 'demonetization', 
                'demonitization': 'demonetization', 
                'demonetisation': 'demonetization', 
                'pokémon': 'pokemon',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'behaviour': 'behavior',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium',
                'litre': 'liter',
                'flavour': 'flavor',
                'humour': 'humor',
                'neighbour': 'neighbor',
                'apologise': 'aplogize',
                'organise': 'organize',
                'recognise': 'recognize',
                'analyse': 'analyze',
                'travelled': 'traveled',
                'travelling': 'traveling',
                'traveller': 'traveler',
                'fuelled': 'fueled',
                'fuelling': 'fueling',
                'defence': 'defense',
                'licence': 'license',
                'offence': 'offense',
                'pretence': 'pretense',
                'analog': 'analogue',
                'catalog': 'catalogue',
                'dialog': 'dialogue'}

In [31]:
def correct_spelling(x, dic):
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

In [32]:
all_data['question_text'] = all_data['question_text'].apply(lambda x: correct_spelling(x, mispell_dict))

In [33]:
vocab = build_vocab(all_data["question_text"])

print("Glove Coverage: ")
oov_glove = check_coverage(vocab, embed_glove)
print("Fasttext Coverage: ")
oov_fasttext = check_coverage(vocab, embed_fasttext)
print("Paragram Coverage: ")
oov_para = check_coverage(vocab, embed_para)

Glove Coverage: 
Found embeddings for 75.127% of vocab
Found embeddings for  99.595% of all text
Fasttext Coverage: 
Found embeddings for 67.050% of vocab
Found embeddings for  99.441% of all text
Paragram Coverage: 
Found embeddings for 41.141% of vocab
Found embeddings for  83.333% of all text


### Remove stopwords, one character words

In [34]:
import re
def remove_extra(sentence, wordsToRemove):
    
    # remove stop words
    resultwords  = [word for word in sentence.split() if word.lower() not in wordsToRemove]
    result = ' '.join(resultwords)
    
    #remove one character long words
    output =  re.sub(r"\b[a-zA-Z]\b", "", result)
    
    return output

In [35]:
to_remove = ['a','to','of','and']
all_data["question_text"] = all_data["question_text"].apply(lambda x: remove_extra(x, to_remove))

In [None]:
vocab = build_vocab(all_data["question_text"])

print("Glove Coverage: ")
oov_glove = check_coverage(vocab, embed_glove)
print("Fasttext Coverage: ")
oov_fasttext = check_coverage(vocab, embed_fasttext)
print("Paragram Coverage: ")
oov_para = check_coverage(vocab, embed_para)

## Featue Engineering

## Functions:

In [None]:
# Will run 5-fold cross validation evaluated with the F1 Score
# model: model to cross-validate 
# train_set: training set being used to validate model
def f1_cv(model, train_set):
    kf = KFold(5, shuffle=True, random_state=0).get_n_splits(train_set)
    f1_score = cross_val_score(model, train_set, y_train, scoring="f1", cv = kf)
    return f1_score.mean()

In [None]:
# Generates a submission file from predictions of the given model on the test set
def gen_sub(name, model):
    y_pred = model.predict(X_test)
    solution = pd.DataFrame({"qid":test_qid, "prediction":y_pred})
    solution.to_csv(name +".csv", index = False)

## Models:

In [None]:
# split the data back into original train and test set
X_train = all_data[:train_qid.size]
X_test = all_data[train_qid.size:]
y_train = y_train

In [None]:
# first model code here...