## using fast-text to process text data classification

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import re
from nltk.corpus import stopwords
import keras.preprocessing.text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import keras
# from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, SpatialDropout1D, Embedding, LSTM
from keras.callbacks import ModelCheckpoint,EarlyStopping

import fasttext

In [4]:
# download fasttext
! git clone https://github.com/facebookresearch/fastText.git
! cd /kaggle/working/fastText; make

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.22 MiB | 15.89 MiB/s, done.
Resolving deltas: 100% (2417/2417), done.
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/args.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/autotune.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/matrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/dictionary.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/loss.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/productquantizer.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/densematrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/quantmatrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBU

In [5]:
train = pd.read_csv("/kaggle/input/uw-cs480-fall20/train.csv")
test = pd.read_csv("/kaggle/input/uw-cs480-fall20/test.csv")

data = pd.DataFrame(pd.concat([train, test]))

# text processing

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 

    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

train['TextDescription'] = train['noisyTextDescription'].apply(clean_text)
train['TextDescription'] = train['TextDescription'].str.replace('\d+', '')

test['TextDescription'] = test['noisyTextDescription'].apply(clean_text)
test['TextDescription'] = test['TextDescription'].str.replace('\d+', '')

In [6]:
# text processing to the fast-text format

train["label"] = "__label__" + train["category"].str.replace(" ", "_")
train_fast = pd.DataFrame(train, columns = ["label", "TextDescription"])
train_fast.to_csv(r'train_fast.txt', index=False, sep=' ', header=False)

In [7]:
# separate the train and val data (5-fold)

val_size = int(len(train)/5)

train1 = train_fast[val_size:]
train1.to_csv("train1.txt",sep=' ',index = None, header=None)
val1 = train_fast[:val_size]
val1.to_csv("val1.txt",sep=' ',index = None, header=None)

train2 = pd.concat([train_fast.iloc[:val_size,], train_fast.iloc[val_size*2:]])
train2.to_csv("train2.txt",sep=' ',index = None, header=None)
val2 = train_fast[val_size:val_size*2]
val2.to_csv("val2.txt",sep=' ',index = None, header=None)

train3 = pd.concat([train_fast.iloc[:val_size*2,], train_fast.iloc[val_size*3:]])
train3.to_csv("train3.txt",sep=' ',index = None, header=None)
val3 = train_fast[val_size*2:val_size*3]
val3.to_csv("val3.txt",sep=' ',index = None, header=None)

train4 = pd.concat([train_fast.iloc[:val_size*3,], train_fast.iloc[val_size*4:]])
train4.to_csv("train4.txt",sep=' ',index = None, header=None)
val4 = train_fast[val_size*3:val_size*4]
val4.to_csv("val4.txt",sep=' ',index = None, header=None)

train5 = train_fast.iloc[:val_size*4,]
train5.to_csv("train5.txt",sep=' ',index = None, header=None)
val5 = train_fast[val_size*4:]
val5.to_csv("val5.txt",sep=' ',index = None, header=None)

In [8]:
label_list = train["label"].unique()
label_list.sort()

In [9]:
def train_pred_fast(k, val):
    model = fasttext.train_supervised(input="train"+str(k)+".txt", lr = 0.6, epoch = 25)
    _,p,r = model.test("val"+str(k)+".txt")
    print(p)
    dictionary = {}
    for l in label_list:
        dictionary[l] = []
    val = val.reset_index()
    for index in range(len(val)):
        label_array, prednum_array = model.predict(val["TextDescription"][index], k=27)
        for i, label in enumerate(label_array):
            dictionary[label].append(prednum_array[i])
    val_pred_df = pd.DataFrame(dictionary)
    return val_pred_df

val_pred_df_1 = train_pred_fast(1, val1)
val_pred_df_2 = train_pred_fast(2, val2)
val_pred_df_3 = train_pred_fast(3, val3)
val_pred_df_4 = train_pred_fast(4, val4)
val_pred_df_5 = train_pred_fast(5, val5)

# the output is the val precision for 5 fold

0.903121387283237
0.9065895953757226
0.8991907514450868
0.91121387283237
0.907326091980587


In [10]:
# stacking the result for the second stage training

train_pred_fast_text = pd.concat([val_pred_df_1, val_pred_df_2, val_pred_df_3, val_pred_df_4, val_pred_df_5])
train_pred_fast_text.to_csv("train_pred_fast_text.csv", index = False)

In [11]:
# using the full train data 

model = fasttext.train_supervised(input="train_fast.txt", epoch = 25,lr=0.6)

dictionary = {}
for l in label_list:
    dictionary[l] = []


for index in range(len(test)):
    label_array, prednum_array = model.predict(test["TextDescription"][index], k=27)
    for i, label in enumerate(label_array):
        dictionary[label].append(prednum_array[i])

# save the result for the second stage training
test_df = pd.DataFrame(dictionary)
test_df.to_csv("test_pred_fast_text.csv", index = False)

In [13]:
model.save_model("fast_text_model.bin")

Help on _FastText in module fasttext.FastText object:

class _FastText(builtins.object)
 |  _FastText(model_path=None, args=None)
 |  
 |  This class defines the API to inspect models and should not be used to
 |  create objects. It will be returned by functions such as load_model or
 |  train.
 |  
 |  In general this API assumes to be given only unicode for Python2 and the
 |  Python3 equvalent called str for any string-like arguments. All unicode
 |  strings are then encoded as UTF-8 and fed to the fastText C++ API.
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, word)
 |  
 |  __getitem__(self, word)
 |  
 |  __init__(self, model_path=None, args=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  get_analogies(self, wordA, wordB, wordC, k=10, on_unicode_error='strict')
 |  
 |  get_dimension(self)
 |      Get the dimension (size) of a lookup vector (hidden layer).
 |  
 |  get_input_matrix(self)
 |      Get a reference to the full input 