In [1]:
from modules.text_processing import *
import pandas as pd

## Load Data

In [2]:
# Inputs: the folder of raw report texts and the spreadsheet of labels.
dir_path = "./../data/text/raw/"
text_labels_file = "./../data/text_labels.xlsx"

# get_text_dataframe comes from modules.text_processing; the preview below
# shows one row per file with its cogn_func, label and text.
df = get_text_dataframe(dir_path, text_labels_file)
df.head(5)

Unnamed: 0,file,cogn_func,label,text
0,00_Caida piso mojado.txt,ATTENTION,0,Lecciones Aprendidas\n\nAccidente de trabajo\n...
1,01_Caída de altura.txt,PERCEPTION,2,LECCIONES APRENDIDAS\n\nTipo de Accidente: Caí...
2,02_ auxiliar_trafico_aprisionado_vehiculo.txt,WORKING MEMORY,1,Auxiliar de tráfico (Paletero - Señalelo) apri...
3,03_Caida de alturas_Lesiones Múltiples.txt,PERCEPTION,2,Lecciones aprendidas\n\nCaida de alturas \nLes...
4,04_Caida_alturas_montaje_estructura.txt,COGNITIVE FLEXIBILITY,3,LECCIONES APRENDIDAS\n\nCaída de alturas en mo...


## Clean and lemmatize text

In [3]:
# Store a cleaned copy of every document next to the raw text.
df["clean_text"] = clean_text(df["text"])
df.head(5)

Unnamed: 0,file,cogn_func,label,text,clean_text
0,00_Caida piso mojado.txt,ATTENTION,0,Lecciones Aprendidas\n\nAccidente de trabajo\n...,leccion aprendido accidente trabajo caida piso...
1,01_Caída de altura.txt,PERCEPTION,2,LECCIONES APRENDIDAS\n\nTipo de Accidente: Caí...,leccion aprendido tipo accidente caida altura ...
2,02_ auxiliar_trafico_aprisionado_vehiculo.txt,WORKING MEMORY,1,Auxiliar de tráfico (Paletero - Señalelo) apri...,auxiliar trafico paletero senalelo aprisionado...
3,03_Caida de alturas_Lesiones Múltiples.txt,PERCEPTION,2,Lecciones aprendidas\n\nCaida de alturas \nLes...,leccion aprendido caida altura lesion multiple...
4,04_Caida_alturas_montaje_estructura.txt,COGNITIVE FLEXIBILITY,3,LECCIONES APRENDIDAS\n\nCaída de alturas en mo...,leccion aprendido caida altura montaje estruct...


In [4]:
# Train test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents, stratified by label, with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

In [5]:
# Class distribution of the training split.
df_train["label"].value_counts()

0    29
2    16
1    10
3     5
4     2
Name: label, dtype: int64

In [6]:
# Class distribution of the held-out split.
df_test["label"].value_counts()

0    8
2    4
1    3
3    1
Name: label, dtype: int64

## TF-IDF Representation

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training documents only, so nothing from
# the test split influences the learned representation.
vectorizer = TfidfVectorizer()

vectorizer.fit(df_train["clean_text"])

In [8]:
# Dense TF-IDF matrix of the training documents, wrapped in a DataFrame.
X_train = pd.DataFrame(vectorizer.transform(df_train["clean_text"]).toarray())
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2583,2584,2585,2586,2587,2588,2589,2590,2591,2592
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.128417,0.064209,0.064209,0.064209,0.0,0.0,0.0,0.0,0.058358,...,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.053840,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.04859,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.023525,0.010614,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.069118,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.057536,0.000000
58,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.000000,0.0,0.046055,0.000000,0.000000,0.000000,0.000000
59,0.014925,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.022392,0.0,0.000000,0.000000,0.000000,0.023933,0.026524
60,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000


In [9]:
# Dense TF-IDF matrix of the test documents, using the vectorizer fitted on train.
X_test = pd.DataFrame(vectorizer.transform(df_test["clean_text"]).toarray())
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2583,2584,2585,2586,2587,2588,2589,2590,2591,2592
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.040557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06389,0.0
3,0.054676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033806,0.0,0.074389,0.016781,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.105083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022836,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029713,0.0


## Dimensionality Reduction (PCA)

In [10]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as needed to
# reach that share of explained variance (used together with svd_solver="full").
pca_model = PCA(svd_solver="full", n_components=0.999).fit(X_train)

# Number of retained components, then per-component and cumulative variance.
print(len(pca_model.components_))
print(pca_model.explained_variance_ratio_)
print(pca_model.explained_variance_ratio_.cumsum())


61
[0.07369968 0.05148544 0.03820371 0.03516855 0.03204247 0.0284426
 0.02666107 0.0259147  0.02448007 0.0233258  0.02280455 0.02156307
 0.02146262 0.01996793 0.01946453 0.01889011 0.01850171 0.01810765
 0.01758698 0.01735163 0.01658788 0.01650009 0.01604912 0.01574408
 0.01552217 0.0153433  0.01485581 0.01428323 0.01419247 0.01375584
 0.01363289 0.01328941 0.01306814 0.01285905 0.01263319 0.01192212
 0.01182873 0.01169437 0.01146624 0.01111616 0.01090693 0.01074967
 0.01042711 0.01024965 0.00996767 0.00964968 0.00930944 0.00909322
 0.00900314 0.00866467 0.00831694 0.00791452 0.00738519 0.00698587
 0.00684436 0.00674947 0.00655035 0.00622963 0.00517572 0.00464084
 0.0037168 ]
[0.07369968 0.12518512 0.16338883 0.19855739 0.23059986 0.25904246
 0.28570353 0.31161823 0.33609829 0.35942409 0.38222864 0.40379171
 0.42525433 0.44522226 0.46468678 0.4835769  0.50207861 0.52018626
 0.53777324 0.55512487 0.57171275 0.58821284 0.60426196 0.62000604
 0.63552821 0.65087151 0.66572732 0.68001055 0.

In [11]:
# Project the training TF-IDF matrix onto the retained principal components.
X_train_reduced = pd.DataFrame(data=pca_model.transform(X_train))
X_train_reduced.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.290225,0.014143,-0.148433,-0.014729,-0.38642,-0.002343,0.228131,-0.003896,-0.302321,0.025927,...,0.023061,-0.001689,-0.001274,0.009761,0.029721,0.008444,-0.027011,-0.000868,-0.000223,0.007847
1,0.25879,0.000947,-0.1662,-0.002555,-0.214385,-0.133618,0.328689,-0.033401,-0.269508,0.062059,...,0.00406,-0.012163,0.009878,0.007723,-0.015709,-0.001159,-0.005058,-0.006837,-0.008596,0.00394
2,-0.243648,-0.242802,0.077278,-0.09195,0.034979,0.190658,0.019291,-0.096767,-0.016909,-0.069508,...,-0.114372,-0.063961,-0.006854,-0.014969,0.012111,0.057421,0.028676,-0.034707,0.040905,-0.006054
3,-0.389387,0.484928,-0.041031,-0.041372,-0.014162,0.050125,-0.051003,0.02504,-0.100511,0.032859,...,0.09445,0.028946,0.164747,-0.11268,-0.155308,-0.040204,-0.080004,0.303992,-0.036762,-0.004217
4,0.239713,0.056559,-0.019474,-0.029248,0.04149,-0.179993,-0.288456,-0.284264,0.102663,0.191094,...,-0.024814,-0.003841,0.023083,-0.012181,0.006351,-0.012221,-0.003424,-0.007962,-0.005631,0.000161


In [12]:
# Shapes before and after the PCA projection.
(X_train.shape, X_train_reduced.shape)

((62, 2593), (62, 61))

## Combine all steps into a single pipeline

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


# Preprocessing chain applied to raw text:
# clean -> TF-IDF -> dense array -> PCA.
model = Pipeline(
    steps=[
        ("cleaner", FunctionTransformer(func=clean_text)),
        ("tfidf", TfidfVectorizer()),
        ("toarray", FunctionTransformer(func=to_array)),
        ("pca", PCA(svd_solver="full", n_components=0.999)),
    ]
)

model.fit(df_train["text"])

In [14]:
# Preview of the pipeline output on the training texts.
pd.DataFrame(data=model.transform(df_train["text"])).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.290225,0.014143,-0.148433,-0.014729,-0.38642,-0.002343,0.228131,-0.003896,-0.302321,0.025927,...,0.023061,-0.001689,-0.001274,0.009761,0.029721,0.008444,-0.027011,-0.000868,-0.000223,0.007847
1,0.25879,0.000947,-0.1662,-0.002555,-0.214385,-0.133618,0.328689,-0.033401,-0.269508,0.062059,...,0.00406,-0.012163,0.009878,0.007723,-0.015709,-0.001159,-0.005058,-0.006837,-0.008596,0.00394
2,-0.243648,-0.242802,0.077278,-0.09195,0.034979,0.190658,0.019291,-0.096767,-0.016909,-0.069508,...,-0.114372,-0.063961,-0.006854,-0.014969,0.012111,0.057421,0.028676,-0.034707,0.040905,-0.006054
3,-0.389387,0.484928,-0.041031,-0.041372,-0.014162,0.050125,-0.051003,0.02504,-0.100511,0.032859,...,0.09445,0.028946,0.164747,-0.11268,-0.155308,-0.040204,-0.080004,0.303992,-0.036762,-0.004217
4,0.239713,0.056559,-0.019474,-0.029248,0.04149,-0.179993,-0.288456,-0.284264,0.102663,0.191094,...,-0.024814,-0.003841,0.023083,-0.012181,0.006351,-0.012221,-0.003424,-0.007962,-0.005631,0.000161


## Save preprocessing model

In [15]:
import os
# Directory that receives the serialized preprocessing pipeline.
path = "./models/"
# makedirs(exist_ok=True) is a no-op when the directory already exists, creates
# any missing parent directories, and still raises if `path` exists but is a
# regular file (the previous try/except silently ignored that case).
os.makedirs(path, exist_ok=True)

In [16]:
import joblib

# Persist the fitted preprocessing pipeline.
joblib.dump(model, f"{path}preprocessor.pk")

['./models/preprocessor.pk']

In [17]:
# Reload the saved preprocessor and transform the training texts again as a
# round-trip check.
loaded_model = joblib.load(f"{path}preprocessor.pk")

pd.DataFrame(data=loaded_model.transform(df_train["text"])).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.290225,0.014143,-0.148433,-0.014729,-0.38642,-0.002343,0.228131,-0.003896,-0.302321,0.025927,...,0.023061,-0.001689,-0.001274,0.009761,0.029721,0.008444,-0.027011,-0.000868,-0.000223,0.007847
1,0.25879,0.000947,-0.1662,-0.002555,-0.214385,-0.133618,0.328689,-0.033401,-0.269508,0.062059,...,0.00406,-0.012163,0.009878,0.007723,-0.015709,-0.001159,-0.005058,-0.006837,-0.008596,0.00394
2,-0.243648,-0.242802,0.077278,-0.09195,0.034979,0.190658,0.019291,-0.096767,-0.016909,-0.069508,...,-0.114372,-0.063961,-0.006854,-0.014969,0.012111,0.057421,0.028676,-0.034707,0.040905,-0.006054
3,-0.389387,0.484928,-0.041031,-0.041372,-0.014162,0.050125,-0.051003,0.02504,-0.100511,0.032859,...,0.09445,0.028946,0.164747,-0.11268,-0.155308,-0.040204,-0.080004,0.303992,-0.036762,-0.004217
4,0.239713,0.056559,-0.019474,-0.029248,0.04149,-0.179993,-0.288456,-0.284264,0.102663,0.191094,...,-0.024814,-0.003841,0.023083,-0.012181,0.006351,-0.012221,-0.003424,-0.007962,-0.005631,0.000161


## Save feature space for model training

In [18]:
# Label vectors re-indexed from 0 so they line up positionally with the
# feature frames built from the same splits.
y_train = df_train["label"].reset_index(drop=True)
y_test = df_test["label"].reset_index(drop=True)

### Non-PCA feature space

In [19]:
# Slicing off the first three steps (cleaner, tfidf, toarray) skips the PCA
# stage and yields the TF-IDF feature space.
X_train = pd.DataFrame(data=loaded_model[:3].transform(df_train["text"]))
X_test = pd.DataFrame(data=loaded_model[:3].transform(df_test["text"]))

# Convert the integer column labels to strings on both frames.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

### PCA feature space

In [20]:
# The full pipeline (including PCA) yields the reduced feature space.
X_train_pca = pd.DataFrame(data=loaded_model.transform(df_train["text"]))
X_test_pca = pd.DataFrame(data=loaded_model.transform(df_test["text"]))

# Convert the integer column labels to strings on both frames.
X_train_pca.columns = X_train_pca.columns.astype(str)
X_test_pca.columns = X_test_pca.columns.astype(str)

### Balancing data

In [21]:
# Class counts of the training labels before any resampling.
y_train.value_counts()

0    29
2    16
1    10
3     5
4     2
Name: label, dtype: int64

In [22]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline

# Two-stage balancing, applied to the training data only:
#   1. RandomOverSampler duplicates rows of classes 3 and 4 up to 6 rows each,
#      so SMOTE (default k_neighbors=5 — see imbalanced-learn docs) has enough
#      rows per class to find neighbours;
#   2. SMOTE then synthesises rows until all classes have the same count.
# Both samplers are seeded. Without random_state on the random over-sampler the
# duplicated rows changed on every run, so the exported training sets were not
# reproducible.
random = RandomOverSampler(sampling_strategy={3: 6, 4: 6}, random_state=42)
smote = SMOTE(random_state=42)

resampler = Pipeline([("random", random), ("smote", smote)])

# Resample the TF-IDF and the PCA feature spaces with the same seeded pipeline.
X, y = resampler.fit_resample(X_train, y_train)
X_pca, y_pca = resampler.fit_resample(X_train_pca, y_train)


In [23]:
# Class counts of the PCA-space labels after resampling.
y_pca.value_counts()

0    29
2    29
1    29
3    29
4    29
Name: label, dtype: int64

In [24]:
import os

# Directory that receives every exported feature / label table.
features_dir = "./../data/features/"
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before writing (no-op when it is already there).
os.makedirs(features_dir, exist_ok=True)

# File name -> table. Training tables are the resampled ones; test tables are
# left untouched. The test labels are shared by both feature spaces.
exports = {
    "in_features_train.csv": X,
    "pca_in_features_train.csv": X_pca,
    "out_features_train.csv": y,
    "pca_out_features_train.csv": y_pca,
    "in_features_test.csv": X_test,
    "pca_in_features_test.csv": X_test_pca,
    "out_features_test.csv": y_test,
}

for file_name, table in exports.items():
    table.to_csv(features_dir + file_name, index=False)


## Save clean texts

In [25]:
# Clean the text of the full data set (train and test together) and store it
# on the frame that is exported below.
clean_texts = clean_text(df["text"])
df["clean_text"] = clean_texts

In [26]:
# Export the labelled frame, including the clean_text column, without the index.
df.to_excel(excel_writer="./../data/clean_text_labels.xlsx", index=False)

In [27]:
# Directory for the cleaned text files. NOTE: this rebinds `path`, which
# earlier in the notebook pointed at "./models/".
path = "./../data/text/clean/"
# makedirs(exist_ok=True) is a no-op when the directory exists, creates missing
# parent directories (os.mkdir raised FileNotFoundError for those), and still
# raises if `path` exists but is a regular file.
os.makedirs(path, exist_ok=True)


In [28]:
# Write each cleaned document to its own file, named after its source file.
# Pairing the two columns with zip avoids label-based lookups (df.file[i]),
# which only work while the frame has a default 0..n-1 index. The explicit
# encoding keeps the output independent of the operating system's locale.
for file_name, cleaned in zip(df["file"], df["clean_text"]):
    with open(path + file_name, "w", encoding="utf-8") as f:
        f.write(cleaned)