# Day 2 - Exercise 3 - FastText

## Necessary imports

In order to handle the data properly we have to import the data and the modules we need:

In [5]:
# modules
import pandas as pd
import numpy as np
import re
import nltk
from gensim.models import Word2Vec
from gensim.models.fasttext import FastText

First of all, you need to download the data set "tweets.csv" from the GitHub repository https://github.com/assenmacher-mat/nlp_notebooks.

__If you are running this notebook on colab ( https://colab.research.google.com/ ), you also need to run the next chunk in order to upload the data to colab.  
Choose it in the upload window and in it will be available on colab from now on.__  
(If you are running this notebook locally on your machine, you can skip the execution of this chunk)

In [124]:
from google.colab import files
uploaded = files.upload()

### Import the data set and perform pre-processing

__If you are running this notebook locally on your machine, you might need to adjust the path (depending on where you've saved the data).__  
(If you are running this notebook on colab, you can can leave the path unchanged)

In [6]:
tweet_data = pd.read_csv("trump.csv")
tweets_raw = [tweet for tweet in list(tweet_data.text)]
tweets = [doc.lower() for doc in tweets_raw]
tweets = [re.sub(r"https://.*|“|”|@", "", doc) for doc in tweets]
tweets = [re.sub(r"[\)\(\.\,;:!?\+\-\_\#\'\*\§\$\%\&]", "", doc) for doc in tweets]
tweets = [nltk.word_tokenize(doc) for doc in tweets]

# After all, we can finally start with the modeling part!  
(If you want to have a look at the help page, just execute the following chunk)

In [1]:
help(FastText)

### First, we determine the number of CPUs that are available on our machine  
(The more cores are available, the faster we can train our model)

In [7]:
import multiprocessing
cpus = multiprocessing.cpu_count()
print(cpus)

8


### Set up the model parameters

In [8]:
ft_model = FastText(sg = 0, cbow_mean = 1, size = 100, alpha = 0.025, min_alpha = 0.0001, min_n = 3, max_n = 5,
                    window = 5, min_count = 5, sample = 0.001, negative = 5, workers = cpus - 1)

### Initialize the model with our twitter data:

In [9]:
ft_model.build_vocab(sentences = tweets, update = False, progress_per = 10000)

### Train our model:  
(Hint: If you want to compre the runtime of the model for different number of cores or epochs, just put "%timeit" in front of the command  
 in the next chunk. You will then get an evaluation of how long the process takes.)

In [10]:
%timeit ft_model.train(sentences = tweets, total_examples = ft_model.corpus_count, epochs = 100)

11.6 s ± 310 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Train a word2vec model with the same model specifications (skip-gram, vector size, window, etc.)

In [11]:
w2v_model = Word2Vec(sg = 0, cbow_mean = 1, size = 100, alpha = 0.025, min_alpha = 0.0001, 
                     window = 5, min_count = 5, sample = 0.001, negative = 5, workers = cpus - 1)
w2v_model.build_vocab(sentences = tweets, update = False, progress_per = 10000)
w2v_model.train(sentences = tweets, total_examples = w2v_model.corpus_count, epochs = 100)

(7088596, 10534100)

### Check, whether the word "example" does occur in your model's vocabulary

In [12]:
"example" in ft_model.wv.vocab

False

### Try to query your word2vec model for a vector representation of this word

In [13]:
w2v_model.wv["example"]

KeyError: "word 'example' not in vocabulary"

### Now try to query your fastText model for a vector representation of this word

In [14]:
ft_model.wv["example"]

array([ 1.0486553e+00,  1.6520401e+00,  2.1554410e+00,  1.1430428e+00,
        1.6350437e+00, -2.8443673e+00,  3.1840882e+00,  1.6516072e+00,
        1.7214676e+00, -2.5208766e+00,  1.3167694e+00, -2.6342528e+00,
       -2.3123934e+00, -2.2165632e-01, -6.5016836e-01, -9.7515565e-01,
        2.1689889e+00,  1.9777322e+00, -7.1687174e-01,  7.2892499e-01,
        1.5738661e-03, -6.3561291e-01,  2.0284667e+00,  2.1132912e-01,
       -1.2740127e+00, -3.0367064e+00, -1.0260161e+00,  2.2499128e-01,
        6.3597220e-01, -7.5686151e-01, -1.0237803e+00,  3.9474463e+00,
       -1.8808597e+00, -7.2630548e-01, -1.4895096e+00, -2.0428224e+00,
       -6.9836181e-01,  3.1248198e+00, -1.0949135e-01,  4.1144320e-01,
       -9.1552299e-01, -9.9182791e-01,  4.2851403e-01, -4.1289307e-02,
        1.3214664e+00, -8.6032259e-01, -2.4536220e-02,  1.2301015e+00,
        6.5257055e-01, -1.3914090e+00, -1.9630507e+00,  1.6500744e+00,
       -2.0449514e+00,  6.4786309e-01, -1.2088287e+00, -1.9017792e+00,
      

### Print the most word with the most similar vector representations

In [24]:
ft_model.wv.most_similar(positive = ["example"])

[('simple', 0.5714014768600464),
 ('exact', 0.4678761959075928),
 ('exactly', 0.46437159180641174),
 ('texas', 0.31973931193351746),
 ('india', 0.30366888642311096),
 ('ohr', 0.2940193712711334),
 ('having', 0.29206737875938416),
 ('aspect', 0.2863953709602356),
 ('justices', 0.2860548198223114),
 ('inside', 0.28579258918762207)]

### Check, whether the word "democrats" does occur in your model's vocabulary

In [16]:
"democrats" in ft_model.wv.vocab

True

### Try to query your word2vec model for a vector representation of this word

In [17]:
w2v_model.wv["democrats"]

array([ 0.56760174,  1.0978279 , -0.44636375, -1.370232  ,  0.6363281 ,
       -0.6500159 ,  1.0603571 ,  3.1058614 ,  2.14952   , -0.6802631 ,
       -1.0683842 , -0.8041357 , -2.0587215 , -1.1914525 , -1.0351582 ,
       -4.352592  ,  0.11410102,  1.0767232 ,  1.997807  ,  1.2732118 ,
        0.50725424, -0.74185264,  1.2317436 , -0.60479367, -0.09600411,
       -0.62712747,  0.8109986 , -1.9294683 , -2.543967  , -1.3310887 ,
        1.2558452 , -0.47315744, -1.4003739 ,  2.3724196 ,  1.880108  ,
        0.14799374, -2.268063  ,  2.5570033 , -0.8379476 ,  0.38235664,
       -0.41780508,  0.63027084,  0.606201  ,  0.92862767,  0.18029945,
        1.3708873 ,  2.6459687 ,  1.7692064 ,  2.7067606 , -0.01189727,
       -0.15033926,  0.9917862 , -4.098683  ,  3.9586165 ,  0.5021969 ,
       -3.3960874 ,  2.5581117 , -4.6729093 ,  0.5639138 ,  0.6061467 ,
       -1.182844  ,  2.1353543 , -1.5632311 ,  2.1959524 , -1.0714059 ,
        0.28672642, -0.28899705,  1.641964  ,  0.34226125,  0.20

### Now try to query your fastText model for a vector representation of this word

In [18]:
ft_model.wv["democrats"]

array([-0.8398814 , -1.8945438 ,  1.1597911 , -0.81199354,  0.6732897 ,
       -1.401981  , -0.438043  , -0.23753382, -0.65049314,  0.20491856,
       -0.11132   ,  1.7340851 , -0.33728626,  0.4092576 , -0.0639997 ,
        1.2539299 , -2.389616  ,  0.8441182 ,  0.6235202 ,  0.57921076,
        1.2038391 ,  2.2296095 ,  0.3568454 , -0.93572885,  1.2726535 ,
       -0.39360413,  0.88321304,  0.56413466, -0.18699293, -1.6253219 ,
       -0.8681686 , -1.036386  , -1.2211905 , -0.15744247, -1.3042277 ,
        0.50965905,  0.36817193,  0.20597057,  1.7013777 ,  0.22775795,
       -0.8392609 , -0.25008392, -0.07426292, -1.6004959 , -2.1281245 ,
        0.31867576,  0.46663764, -1.0829184 , -1.9937471 ,  0.04136777,
       -0.6111922 ,  0.50176954, -0.3769376 , -1.9965129 ,  1.1369956 ,
        0.09730179, -0.6319924 ,  0.82005835, -0.5232716 , -0.7630313 ,
       -1.0930935 ,  1.9585453 ,  2.076092  ,  0.48397627,  0.14669396,
       -2.3992023 ,  0.90727973,  0.4698415 , -0.26430663,  0.48

### Print the most word with the most similar vector representations (for each of the two models)

In [19]:
w2v_model.wv.most_similar(positive = ["democrats"])

[('dems', 0.7657046318054199),
 ('they', 0.479423463344574),
 ('democrat', 0.44336768984794617),
 ('we', 0.4397535026073456),
 ('losers', 0.3709675669670105),
 ('people', 0.33017468452453613),
 ('fix', 0.3238281011581421),
 ('others', 0.32353824377059937),
 ('you', 0.31534522771835327),
 ('actions', 0.31037071347236633)]

In [20]:
ft_model.wv.most_similar(positive = ["democrats"])

[('dems', 0.6397151350975037),
 ('democrat', 0.5384020209312439),
 ('they', 0.496374249458313),
 ('we', 0.4879452586174011),
 ('democracy', 0.44930195808410645),
 ('democratic', 0.34020522236824036),
 ('you', 0.32630443572998047),
 ('people', 0.31717851758003235),
 ('drugs', 0.2760796546936035),
 ('committees', 0.268619567155838)]

### Do you recognize any systematic differences?

### Explore the possibilities the model by e.g. switching from skip-gram to cbow, trying different n-gram ranges, chosing a larger embedding size, etc.