# FastText Deep NLP 

### Import FastText

In [70]:
import fasttext

### Create Dataset 

In [71]:
from datasets import load_dataset

# Load the "imdb" dataset through the Hugging Face `datasets` library
dataset = load_dataset("imdb")

Reusing dataset imdb (/Users/quentinlehelloco/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Materialise each split once and read both columns from the result,
# instead of slicing the whole split a second time for the labels.
train_columns = dataset["train"][:]
x_train = train_columns["text"]
y_train = train_columns["label"]

test_columns = dataset["test"][:]
x_test = test_columns["text"]
y_test = test_columns["label"]

In [5]:
x_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [14]:
y_train[0]

1

### Format dataset for FastText usage 

In [103]:
import pandas as pd
import numpy as np

In [104]:
# One row per training review: 'Value' holds the text, 'Positive' its label
df = pd.DataFrame({'Value':x_train, 'Positive':y_train})
df.head()

Unnamed: 0,Value,Positive
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


#### FastText expects every document to start with its label, so we prepend "\__label\__positive" or "\__label\__negative" to each document

In [105]:
def add_label(s, p):
    """
    Prefix a document with its FastText label.

    Input:
        s : text of the document
        p : label of the document (1 means positive, anything else negative)

    Output:
        The document on a single line, preceded by "__label__positive " or
        "__label__negative "
    """
    # FastText reads one example per line: a line break inside the document
    # would end the example early and leave the remainder without a label.
    s = s.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    prefix = "__label__positive " if p == 1 else "__label__negative "
    return prefix + s

In [106]:
# Prefix each review with the FastText label matching its 'Positive' value
df["Value"] = [
    add_label(text, label)
    for text, label in zip(df["Value"], df["Positive"])
]

In [107]:
df

Unnamed: 0,Value,Positive
0,__label__positive Bromwell High is a cartoon c...,1
1,__label__positive Homelessness (or Houselessne...,1
2,__label__positive Brilliant over-acting by Les...,1
3,__label__positive This is easily the most unde...,1
4,__label__positive This is not the typical Mel ...,1
...,...,...
24995,__label__negative Towards the end of the movie...,0
24996,__label__negative This is the kind of movie th...,0
24997,__label__negative I saw 'Descent' last night a...,0
24998,__label__negative Some films that you pick up ...,0


### Shuffle data

In [108]:
# A fixed seed keeps the shuffle, and therefore the train/validation split
# taken from it further down, identical from one run to the next.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,Value,Positive
0,__label__positive This is simply a classic fil...,1
1,__label__negative Wow. Some movies just leave ...,0
2,__label__negative As an ex-teacher(!) I must c...,0
3,"__label__positive Robert Jannuci,Luca Venantin...",1
4,__label__positive I would never have thought I...,1
...,...,...
24995,__label__positive This project was originally ...,1
24996,"__label__positive When I first watched this, w...",1
24997,__label__positive Madhur Bhandarkar goes all o...,1
24998,__label__negative I rented this thinking it mi...,0


### Write the data to a file for FastText input

In [42]:
# FastText expects UTF-8 text, so the encoding is set explicitly instead of
# relying on the platform default. The file is only written, so "w" is enough.
with open("FastText_input.txt", 'w', encoding="utf-8") as f:
    for values in df["Value"]:
        f.write(values + "\n")

### Create model 

In [109]:
# Train a supervised classifier on the labelled file, with FastText's default hyperparameters
model = fasttext.train_supervised("FastText_input.txt")

Read 5M words
Number of words:  281132
Number of labels: 2
Progress: 100.0% words/sec/thread: 2805986 lr:  0.000000 avg.loss:  0.427186 ETA:   0h 0m 0s 67.4% words/sec/thread: 2791510 lr:  0.032629 avg.loss:  0.485156 ETA:   0h 0m 1s


### Save model

In [110]:
model.save_model("model.bin")

### Test model predictions

In [111]:
model.predict("I loved this movie !")

(('__label__positive',), array([0.99994802]))

In [112]:
model.predict("I hated this movie !")

(('__label__negative',), array([0.98021179]))

In [113]:
model.predict("I did not enjoyed it as I should have")

(('__label__positive',), array([0.97754908]))

In [114]:
model.predict("I really enjoyed how bad the actors played")

(('__label__negative',), array([0.86012012]))

### Test our accuracy

In [115]:
def pred_tests(x_test, y_test, model):
    """
    Predict the label of every document with a model

    Input:
        x_test : list of sentences to predict
        y_test : list of labels for x_test sentences (not used here, kept so
                 that existing calls keep working)
        model : model to predict from; model.predict(text) must return a
                tuple whose first element is the sequence of predicted labels

    Output:
        List of predictions (0 for '__label__negative', 1 otherwise), in the
        same order as x_test
    """

    preds = []

    for doc in x_test:
        # FastText predicts one line at a time and rejects text containing a
        # line break, so line breaks are flattened to spaces first.
        single_line = doc.replace("\r\n", " ").replace("\n", " ")
        labels = model.predict(single_line)[0]

        # Anything that is not the negative label is counted as positive
        if labels[0] == '__label__negative':
            preds.append(0)
        else:
            preds.append(1)

    return preds

In [116]:
%%time
# Predicted labels (0 / 1) of the baseline model for every test review
y_pred = pred_tests(x_test, y_test, model)

CPU times: user 1.59 s, sys: 65 ms, total: 1.66 s
Wall time: 1.66 s


In [117]:
from sklearn.metrics import classification_report

In [118]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86     12500
           1       0.86      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



### Hyperparameter tuning

#### Creating validation and training datasets for hyperparameter tuning

In [119]:
def df_to_FastText_file(value_column:pd.Series, file_name:str):
    """
    Create a file with FastText Format from a DataFrame column

    Input:
        value_column : column of a DataFrame (or any iterable of strings)
                       containing formatted values, one document per item
        file_name : name of the file to create (overwritten if it exists)

    Output: None
    """

    # The file is only written, so "w" is enough. FastText expects UTF-8
    # text, so the encoding must not depend on the platform default.
    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(values + "\n" for values in value_column)

In [120]:
# The first 2000 shuffled rows become the validation file (FastText format)
val_values = df["Value"].iloc[:2000]
df_to_FastText_file(val_values, "FastText_val.txt")

In [123]:
# Every row after the first 2000 goes to the training file (FastText format)
train_values = df["Value"].iloc[2000:]
df_to_FastText_file(train_values, "FastText_train.txt")

#### Use FastText's automatic hyperparameter tuning (autotune)

In [125]:
# Let FastText search for hyperparameters: candidates are scored on the validation file.
# autotuneDuration is the search budget (in seconds, per the FastText autotune documentation).
tuned_model = fasttext.train_supervised(input='FastText_train.txt', autotuneValidationFile='FastText_val.txt', autotuneDuration=60)


Aborting autotune...

Training again with best arguments
Read 5M words
Number of words:  266902
Number of labels: 2
Progress: 100.0% words/sec/thread:  352928 lr:  0.000000 avg.loss:  0.039604 ETA:   0h 0m 0s  0.2% words/sec/thread:  269543 lr:  0.442348 avg.loss:  0.694788 ETA:   0h10m42s  0.2% words/sec/thread:  272075 lr:  0.442197 avg.loss:  0.694770 ETA:   0h10m36s  2.6% words/sec/thread:  341562 lr:  0.431503 avg.loss:  0.462021 ETA:   0h 8m14s  3.4% words/sec/thread:  345684 lr:  0.427993 avg.loss:  0.414653 ETA:   0h 8m 4s  4.4% words/sec/thread:  348186 lr:  0.423841 avg.loss:  0.379316 ETA:   0h 7m56s  4.6% words/sec/thread:  348538 lr:  0.422895 avg.loss:  0.372985 ETA:   0h 7m54s  8.2% words/sec/thread:  350429 lr:  0.406847 avg.loss:  0.292642 ETA:   0h 7m34s  9.4% words/sec/thread:  351392 lr:  0.401656 avg.loss:  0.274278 ETA:   0h 7m27s 11.0% words/sec/thread:  352534 lr:  0.394387 avg.loss:  0.252639 ETA:   0h 7m17s 13.2% words/sec/thread:  353232 lr:  0.384831 avg.lo

In [127]:
%%time
# Same evaluation as for the baseline model, this time with the autotuned model
y_pred_tuned = pred_tests(x_test, y_test, tuned_model)
print(classification_report(y_test, y_pred_tuned))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87     12500
           1       0.87      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000

CPU times: user 6.47 s, sys: 199 ms, total: 6.67 s
Wall time: 7.1 s


### Testing preprocessing methods

In [132]:
def full_predict(x_train:list, y_train:list ,x_test:list, y_test:list, file_name:str, random_state=None):
    """
    Predict samples from x_test using FastText trained model with x_train samples.

    The training samples are labelled, shuffled and written to file_name,
    a FastText model is trained on that file with default hyperparameters,
    and a classification report for the test samples is printed.

    Input:
        x_train: list of samples (train)
        y_train: list of labels (train)
        x_test: list of samples (test)
        y_test: list of labels (test)
        file_name: name of file created for FastText input formatting
        random_state: optional seed for the shuffle of the training samples;
                      None (default) keeps the shuffle random as before

    Output:
        Tuple (model, y_pred): the trained model and the list of predicted
        labels for x_test
    """
    # Create dataframe
    train_df = pd.DataFrame({'Value':x_train, 'Positive':y_train})

    # Format samples: prefix every document with its FastText label
    train_df["Value"] = [
        add_label(text, label)
        for text, label in zip(train_df["Value"], train_df["Positive"])
    ]

    # Shuffle data (reproducible when random_state is given)
    train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Create file. FastText expects UTF-8 text, so the encoding is explicit
    # rather than left to the platform default.
    with open(file_name, 'w', encoding="utf-8") as f:
        for values in train_df["Value"]:
            f.write(values + "\n")

    # Train model
    print("Training model")
    model = fasttext.train_supervised(file_name)

    # Predict test
    print("Predicting test samples")
    y_pred = pred_tests(x_test, y_test, model)

    # Print scores
    print(classification_report(y_test, y_pred))

    return (model, y_pred)

In [134]:
%%time
# Run the whole pipeline (format, shuffle, write file, train, evaluate) on the raw data
test_model, test_pred = full_predict(x_train, y_train, x_test, y_test, "TEST_func.txt")

Training model


Read 5M words
Number of words:  281132
Number of labels: 2
Progress: 100.0% words/sec/thread: 2345249 lr:  0.000000 avg.loss:  0.428306 ETA:   0h 0m 0s 71.9% words/sec/thread: 2306561 lr:  0.028140 avg.loss:  0.475197 ETA:   0h 0m 1s


Predicting test samples
              precision    recall  f1-score   support

           0       0.86      0.86      0.86     12500
           1       0.86      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

