In [32]:
import numpy as np
import pandas as pd
import re
import nltk

import config as c
from utils import save_file, load_file
from processing import token_index
from dataset import TextDataset

from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from nltk.tokenize import word_tokenize

# Processing GloVe Embeddings

In [2]:
# Read the raw embedding text file into a list of lines; a later cell splits
# each line into a leading token followed by its vector components.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (GloVe text files are expected to be UTF-8 and
# contain non-ASCII tokens).
with open(c.glove_vector_path, "rt", encoding="utf-8") as f:
    emb = f.readlines()

In [3]:
# Number of lines read from the embedding file (each line yields one vocabulary entry below).
len(emb)

400000

In [4]:
# Separate every embedding line into its token and its vector components.
# Each line is split only once (the original split it twice); the components
# stay as strings here and are converted to float32 in the next cell.
vocab = []
embed = []
for line in emb:
    fields = line.split()
    vocab.append(fields[0])   # first whitespace-delimited field is the token
    embed.append(fields[1:])  # remaining fields are the vector values

In [5]:
# Turn the list of string components into a 2-D float32 matrix
# (one row per vocabulary entry).
embed = np.asarray(embed, dtype=np.float32)

In [6]:
# Reserve the first two vocabulary slots for the special tokens:
# index 0 -> "<pad>", index 1 -> "<unk>".
# NOTE: re-running this cell prepends the tokens again.
vocab = ["<pad>", "<unk>", *vocab]

In [7]:
# Prepend two rows to the embedding matrix so it lines up with the two special
# tokens placed at the front of the vocabulary: row 0 is an all-ones vector and
# row 1 is the mean of all loaded vectors.
# The row width is taken from the loaded matrix instead of being hard-coded to
# 50, so embedding files of any dimensionality stack without a shape error.
# NOTE(review): padding rows are more commonly all-zeros; ones are kept here to
# preserve the existing behaviour — confirm this is intentional.
embed = np.vstack(
    [
        np.ones(embed.shape[1], dtype=np.float32),
        np.mean(embed, axis=0),
        embed,
    ]
)

In [8]:
# Persist the extended embedding matrix and the vocabulary list through the
# project's save_file helper, at the paths configured in `config`.
save_file(c.embedding_path, embed)
save_file(c.vocab_path, vocab)

# Processing the Text Data

In [9]:
# Load the raw dataset from the CSV file configured as c.data_path.
data = pd.read_csv(c.data_path)

In [10]:
# Drop rows whose text column is missing. Reassigning instead of mutating with
# inplace=True keeps the step explicit; the resulting frame is the same.
data = data.dropna(subset=[c.text_col_name])

In [11]:
# In the label column only, substitute values according to c.product_map
# (presumably a dict of raw label -> merged category; verify in config).
# Reassigning instead of using inplace=True avoids mutating the frame in place.
data = data.replace({c.label_col: c.product_map})

In [12]:
# Fit the encoder on the label column and obtain the integer-encoded labels
# in a single step.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data[c.label_col])

In [13]:
# Distinct label values learned by the encoder.
label_encoder.classes_

array(['card', 'credit_report', 'debt_collection', 'loan',
       'money_transfer', 'mortgage', 'others', 'savings_account',
       'vehicle_loan'], dtype=object)

In [14]:
# Inspect the label column after the replacement step.
data[c.label_col]

1             vehicle_loan
7            credit_report
8            credit_report
10           credit_report
13           credit_report
                ...       
2326240               card
2326241    debt_collection
2326242           mortgage
2326243      credit_report
2326244      credit_report
Name: Product, Length: 809343, dtype: object

In [16]:
# Persist the encoded labels and the fitted encoder through the project's
# save_file helper.
save_file(c.label_path, labels)
save_file(c.label_encoder_path, label_encoder)

In [17]:
# Select the text column; the cells below clean it step by step, rebinding
# `input_text` each time.
input_text = data[c.text_col_name]

In [19]:
# Lower-case every text entry (progress shown via tqdm); the result is a plain list.
input_text = [text.lower() for text in tqdm(input_text)]

100%|██████████| 809343/809343 [00:02<00:00, 383521.64it/s]


In [22]:
# Replace each run of characters that are not word characters, apostrophes or
# whitespace with a single space.
input_text = [re.sub(r"[^\w\d'\s]+", " ", text) for text in input_text]


  0%|          | 0/809343 [00:00<?, ?it/s][A
  1%|          | 4196/809343 [00:00<00:19, 41952.44it/s][A
  1%|          | 9550/809343 [00:00<00:16, 48762.43it/s][A
  2%|▏         | 15008/809343 [00:00<00:15, 51415.75it/s][A
  3%|▎         | 20692/809343 [00:00<00:14, 53555.53it/s][A
  3%|▎         | 26570/809343 [00:00<00:14, 55426.38it/s][A
  4%|▍         | 32364/809343 [00:00<00:13, 56274.69it/s][A
  5%|▍         | 38276/809343 [00:00<00:13, 57200.39it/s][A
  5%|▌         | 44086/809343 [00:00<00:13, 57481.60it/s][A
  6%|▌         | 50141/809343 [00:00<00:12, 58438.04it/s][A
  7%|▋         | 56193/809343 [00:01<00:12, 59078.87it/s][A
  8%|▊         | 62201/809343 [00:01<00:12, 59381.04it/s][A
  8%|▊         | 68140/809343 [00:01<00:12, 59209.37it/s][A
  9%|▉         | 74198/809343 [00:01<00:12, 59620.65it/s][A
 10%|▉         | 80326/809343 [00:01<00:12, 60120.24it/s][A
 11%|█         | 86843/809343 [00:01<00:11, 61637.36it/s][A
 11%|█▏        | 93054/809343 [00:01<00:

In [23]:
# Remove every run of digits.
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
input_text = [re.sub(r"\d+", "", text) for text in input_text]

In [24]:
# Delete every run of two or more consecutive "x" characters.
# NOTE(review): presumably targets masked/redacted spans such as "xxxx"; it
# also removes "xx" occurring inside ordinary words — confirm that is acceptable.
input_text = [re.sub("[x]{2,}", "", text) for text in input_text]

In [25]:
# Collapse each run of space characters into a single space.
input_text = [re.sub(' +', ' ', text) for text in input_text]

In [31]:
# Tokenize every cleaned text with NLTK's word_tokenize (progress shown via tqdm).
tokens = [word_tokenize(text) for text in tqdm(input_text)]

In [None]:
# Force every token list to exactly SEQ_LEN entries: longer lists keep only
# their first SEQ_LEN tokens, shorter lists are left-padded with PAD_TOKEN.
# The length and pad token are named once instead of repeating the literals
# 20 / 19 / "<pad>" inside the expression.
SEQ_LEN = 20
PAD_TOKEN = "<pad>"
tokens = [
    [PAD_TOKEN] * max(SEQ_LEN - len(seq), 0) + seq[:SEQ_LEN]
    for seq in tokens
]

In [None]:
# Convert the token lists using the project's token_index helper and the
# vocabulary built above (presumably token -> vocabulary index; the helper is
# defined in processing.py and not visible here — verify).
# NOTE: this rebinds `tokens`, so re-running the cell feeds the converted
# output back into token_index.
tokens = token_index(tokens, vocab)

In [None]:
# Persist the processed tokens through the project's save_file helper.
save_file(c.token_path, tokens)