# FastText Deep NLP 

### Import FastText

In [1]:
import fasttext

### Create Dataset 

In [3]:
from datasets import load_dataset

# Load the "imdb" dataset; the cells below index the result by split name
# ("train" / "test") and read its "text" and "label" columns.
dataset = load_dataset("imdb")

Reusing dataset imdb (/Users/quentinlehelloco/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Slice each split once and read both columns from that single result,
# instead of materializing the whole split a second time for the labels.
train_split = dataset["train"][:]
x_train = train_split["text"]
y_train = train_split["label"]

test_split = dataset["test"][:]
x_test = test_split["text"]
y_test = test_split["label"]

In [5]:
# Peek at the raw text of the first training review.
x_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [14]:
# Label of the first training review (add_label treats 1 as positive).
y_train[0]

1

### Format dataset for FastText usage 

In [30]:
import pandas as pd
import numpy as np

In [31]:
# One row per training document: the review text next to its numeric label.
df = pd.DataFrame(
    {
        "Value": x_train,
        "Positive": y_train,
    }
)
df.head()

Unnamed: 0,Value,Positive
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


#### FastText expects each training line to start with its label, so we prepend "\__label\__positive" or "\__label\__negative" to every document

In [32]:
def add_label(s, p):
    """
    Prepend the FastText label token to a document.

    Input:
        s : document text
        p : numeric label; 1 means positive, anything else is negative

    Output:
        The document prefixed with "__label__positive " or "__label__negative "
    """
    prefix = "__label__positive " if p == 1 else "__label__negative "
    return prefix + s

In [33]:
# Prefix every document with its label token.
# This reads and overwrites df["Value"], so running the cell a second time
# prepends the label again.
df["Value"] = [
    add_label(text, label)
    for text, label in zip(df["Value"], df["Positive"])
]

In [37]:
# Display the frame to check that each document now starts with its label token.
df

Unnamed: 0,Value,Positive
0,__label__positive Bromwell High is a cartoon c...,1
1,__label__positive Homelessness (or Houselessne...,1
2,__label__positive Brilliant over-acting by Les...,1
3,__label__positive This is easily the most unde...,1
4,__label__positive This is not the typical Mel ...,1
...,...,...
24995,__label__negative Towards the end of the movie...,0
24996,__label__negative This is the kind of movie th...,0
24997,__label__negative I saw 'Descent' last night a...,0
24998,__label__negative Some films that you pick up ...,0


### Shuffle data

In [40]:
# Shuffle the rows so the two classes are interleaved (the unshuffled frame
# displays only positives at its head and only negatives at its tail).
# A fixed random_state keeps the shuffle, and therefore the training file,
# reproducible across kernel restarts; the value itself is arbitrary.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,Value,Positive
0,__label__positive What is so taboo about love?...,1
1,"__label__positive well, this is an Ivan Reitma...",1
2,__label__positive I tracked the trip two years...,1
3,__label__negative The movie that shoots scenes...,0
4,__label__positive I saw this in the theater an...,1
...,...,...
24995,__label__positive Strange yet emotionally dist...,1
24996,__label__negative I saw this at the premiere i...,0
24997,__label__positive This is by far one of my fav...,1
24998,__label__positive Ghosts That Still Walk is on...,1


### Write the data to a file for FastText input

In [42]:
# Write one labelled document per line.
# The file is opened in "w" mode so that re-running the cell overwrites it;
# append mode would add another full copy of the corpus on every run and
# silently change what the model is trained on. UTF-8 is set explicitly
# rather than relying on the platform default encoding.
with open("FastText_input.txt", "w", encoding="utf-8") as f:
    f.writelines(document + "\n" for document in df["Value"])

### Create model 

In [43]:
# Train a supervised classifier on the labelled file; only the input path is
# passed, so every hyperparameter keeps the library default.
model = fasttext.train_supervised("FastText_input.txt")

Read 5M words
Number of words:  281132
Number of labels: 2
Progress: 100.0% words/sec/thread: 2515036 lr:  0.000000 avg.loss:  0.428858 ETA:   0h 0m 0s


### Save model

In [44]:
# Persist the trained model to disk.
# NOTE(review): the file name says "cooking" but this model is trained on
# IMDB reviews; it looks like a leftover name, so consider renaming.
model.save_model("model_cooking.bin")

### Test model predictions

In [47]:
# Sanity check on a clearly positive sentence.
model.predict("I loved this movie !")

(('__label__positive',), array([0.99996102]))

In [48]:
# Sanity check on a clearly negative sentence.
model.predict("I hated this movie !")

(('__label__negative',), array([0.97816825]))

In [52]:
# Harder case: a negated positive verb. The saved output labels it positive,
# which suggests the model does not pick up the negation (to be confirmed).
model.predict("I did not enjoyed it as I should have")

(('__label__positive',), array([0.98001492]))

In [53]:
# Harder case: positive and negative words in the same sentence.
model.predict("I really enjoyed how bad the actors played")

(('__label__negative',), array([0.81813389]))

### Test our accuracy

In [65]:
def test(x_test, y_test, model):
    """
    Calculate accuracy of predictions for a model
    
    Input:
        x_test : list of sentences to predict
        y_test : list of labels for x_test sentences
        model : model to predict from
        
    Output:
        Accuracy (ratio of correct predictions)
    """
    number_doc = len(x_test)
    correct_pred = 0
    
    for i in range(number_doc):
        pred = model.predict(x_test[i])
        
        if pred[0][0] == '__label__negative' and y_test[i] == 0 or \
            pred[0][0] == '__label__positive' and y_test[i] == 1:
                correct_pred += 1
                
    return correct_pred / number_doc
    

In [67]:
%%time
# Accuracy of the trained model on the held-out test split.
test(x_test, y_test, model)

CPU times: user 1.61 s, sys: 11.3 ms, total: 1.62 s
Wall time: 1.7 s


0.86004