In [1]:
from modules.text_processing import *
import pandas as pd

## Load Data

In [2]:
# Input locations: the folder of raw text files and the labels workbook.
dir_path = "./../data/text/raw/"
text_labels_file = "./../data/text_labels.xlsx"

# Build the labelled DataFrame with the project helper and preview it.
df = get_text_dataframe(dir_path, text_labels_file)
df.head()

Unnamed: 0,file,cogn_func,text
0,0000_000_999-accidentes-trabajo.txt,3,lecciones aprendidas\n\naccidente de trabajo\n...
1,31-5_Caida_desde_escalera_de_silo.txt,3,lecciones aprendidas\n\ntipo de accidente: caí...
2,012015-Lecciones-aprendidas.txt,4,descripción de caso\n\nel 08 de octubre de 201...
3,auxiliar_trafico_aprisionado_vehiculo.txt,5,auxiliar de tráfico (paletero - señalelo) apri...
4,caida_alturas.txt,6,lecciones aprendidas\n\ncaida de alturas \nles...


In [3]:
# Train test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

## Clean and lemmatize text

In [4]:
# Run the project's cleaning helper on each split: train first, then test.
train_clean_texts, test_clean_texts = (
    clean_text(split["text"]) for split in (df_train, df_test)
)

In [5]:
# Attach the cleaned texts as a new column. ``assign`` returns a fresh frame,
# so nothing is written into the objects returned by ``train_test_split``
# (direct column assignment on those can raise pandas' SettingWithCopyWarning).
df_train = df_train.assign(clean_text=train_clean_texts)
df_test = df_test.assign(clean_text=test_clean_texts)
df_train.head()

Unnamed: 0,file,cogn_func,text,clean_text
85,leccion_aprendida_1-2.txt,2,riesgo: condiciones de seguridad – peligro loc...,riesgo condicion seguridad peligro locativo ti...
35,leccion_aprendida_explo_sub_carbon_socha_boyac...,0,accidente minero por\n\nfecha de ocurrencia:\n...,accidente minero fecha ocurrencia diciembre 6 ...
26,leccion_aprendida_explo_sub_carbon_jerico_boya...,1,accidente minero por\n\nfecha de ocurrencia:\n...,accidente minero fecha ocurrencia octubre 12 2...
47,leccion_aprendida_explo_sub_oro_el_tambo_cauca...,6,accidente minero por\n\nfecha de ocurrencia:\n...,accidente minero fecha ocurrencia marzo 21 201...
31,leccion_aprendida_explo_sub_carbon_sardinata_n...,2,lección aprendida vs\n\n¿qué pasó?\n\nse prese...,leccion aprendido vs pasar presentar accidente...


## TF-IDF Representation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the cleaned training texts only.
vectorizer = TfidfVectorizer()
vectorizer.fit(df_train["clean_text"])

In [7]:
# Dense TF-IDF matrix of the training texts, wrapped in a DataFrame for display.
X_train = pd.DataFrame(vectorizer.transform(df_train["clean_text"]).toarray())
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6214,6215,6216,6217,6218,6219,6220,6221,6222,6223
0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.068527,0.0,0.0
1,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.047384,0.0,0.0
3,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.055361,0.0,0.0
67,0.022158,0.04173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.025618,0.0,0.0
68,0.016927,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.029355,0.0,0.0
69,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [8]:
# Dense TF-IDF matrix of the test texts, built with the vectorizer fitted on train.
X_test = pd.DataFrame(vectorizer.transform(df_test["clean_text"]).toarray())
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6214,6215,6216,6217,6218,6219,6220,6221,6222,6223
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032182,0.0,0.0
1,0.013501,0.030511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08959,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.016018,0.0543,0.0,0.00472,0.005531,0.0,0.0,0.0,0.015223,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016667,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039447,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026569,0.0,0.0


## Dimension Reduction (PCA)

In [9]:
from sklearn.decomposition import PCA

# n_components=0.999 asks PCA to keep the components that together explain
# 99.9% of the variance of the training TF-IDF matrix.
pca_model = PCA(n_components=0.999, svd_solver="full").fit(X_train)

# Number of retained components, then per-component and cumulative variance ratios.
print(pca_model.components_.shape[0])
print(pca_model.explained_variance_ratio_)
print(pca_model.explained_variance_ratio_.cumsum())


70
[0.12618992 0.05809525 0.05086338 0.03344183 0.03024129 0.02916291
 0.02444907 0.02112257 0.020392   0.01953715 0.01914346 0.01848101
 0.01786663 0.01719424 0.01679841 0.01634069 0.01608384 0.01566361
 0.01526582 0.01476303 0.01446226 0.01434444 0.01408791 0.01403775
 0.01377247 0.01370593 0.01336995 0.01314067 0.01295081 0.01248069
 0.01205932 0.01173419 0.01109864 0.01081259 0.01066782 0.01014965
 0.00999819 0.00996776 0.00966852 0.00924783 0.00920898 0.00888556
 0.00850556 0.00821579 0.00793491 0.00792057 0.00753415 0.00745058
 0.0072429  0.00701403 0.00665902 0.00636334 0.0061766  0.0061326
 0.00573475 0.0056264  0.00545163 0.00529015 0.00522415 0.00500214
 0.0048982  0.00474019 0.00458449 0.00441603 0.00393767 0.00390951
 0.00354525 0.00344389 0.00316654 0.00293093]
[0.12618992 0.18428518 0.23514856 0.26859039 0.29883168 0.32799459
 0.35244366 0.37356623 0.39395823 0.41349537 0.43263884 0.45111985
 0.46898647 0.48618072 0.50297913 0.51931982 0.53540367 0.55106727
 0.56633309 0.

In [10]:
# Project the training TF-IDF matrix onto the fitted principal components.
X_train_reduced = pd.DataFrame(data=pca_model.transform(X_train))
X_train_reduced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.036668,0.267426,-0.051297,-0.442746,0.060701,-0.019967,-0.312617,0.087557,-0.01084,0.451194,...,0.001856,0.009998,-0.009992,0.001146,-0.006991,-0.003178,-0.001889,-0.005881,-0.009393,0.00161
1,-0.315668,-0.238281,-0.289792,0.079855,-0.071224,0.025404,-0.078846,-0.096865,-0.126999,-0.143534,...,0.042603,-0.034969,0.039677,-0.046703,0.014894,0.007692,-0.001883,0.003188,0.008665,-0.01584
2,-0.269933,-0.170565,-0.146384,0.0091,-0.258595,0.108009,-0.056108,0.143854,-0.110675,-0.134904,...,-0.022194,-0.004605,-0.003859,0.037744,-0.016773,0.010361,-0.003088,-0.010093,0.00645,0.000647
3,-0.239696,-0.134002,-0.048839,0.071776,0.528795,-0.293545,0.116879,0.061828,0.153445,0.031463,...,0.008758,0.008011,-7.4e-05,0.006118,0.006675,-0.008038,0.008269,0.009468,-0.004031,-0.006512
4,-0.281726,-0.217913,-0.30722,0.062001,0.035254,-0.097335,-0.111329,-0.090788,-0.172491,0.118281,...,-0.035867,-0.07767,0.014328,0.263785,-0.035473,0.013528,0.001369,0.003064,-0.006261,0.013245


In [11]:
# Shapes before and after the PCA projection, shown side by side.
(X_train.shape, X_train_reduced.shape)

((71, 6224), (71, 70))

## Set everything in a pipeline

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


def to_array(X):
    """Return the dense form of ``X`` by calling its ``toarray()`` method.

    Used as a pipeline step between the TF-IDF vectorizer and PCA.

    NOTE(review): this function is defined in the notebook itself, so a
    pickled pipeline will presumably reference it as ``__main__.to_array``;
    confirm that whatever loads the saved model defines it as well.

    Parameters
    ----------
    X : object exposing ``toarray()``
        In this notebook, the output of the TF-IDF step.

    Returns
    -------
    Whatever ``X.toarray()`` returns.
    """
    dense = X.toarray()
    return dense

# Full preprocessing chain: clean text -> TF-IDF -> dense array -> PCA.
model = Pipeline(
    steps=[
        ("cleaner", FunctionTransformer(clean_text)),
        ("tfidf", TfidfVectorizer()),
        # The TF-IDF output is densified before it is handed to PCA.
        ("toarray", FunctionTransformer(to_array)),
        ("pca", PCA(n_components=0.999, svd_solver="full")),
    ]
)

# Fit on the raw training texts; cleaning happens inside the pipeline.
model.fit(df_train["text"])

In [13]:
# Preview the pipeline output for the raw training texts.
pd.DataFrame(data=model.transform(df_train["text"])).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.036668,0.267426,-0.051297,-0.442746,0.060701,-0.019967,-0.312617,0.087557,-0.01084,0.451194,...,0.001856,0.009998,-0.009992,0.001146,-0.006991,-0.003178,-0.001889,-0.005881,-0.009393,0.00161
1,-0.315668,-0.238281,-0.289792,0.079855,-0.071224,0.025404,-0.078846,-0.096865,-0.126999,-0.143534,...,0.042603,-0.034969,0.039677,-0.046703,0.014894,0.007692,-0.001883,0.003188,0.008665,-0.01584
2,-0.269933,-0.170565,-0.146384,0.0091,-0.258595,0.108009,-0.056108,0.143854,-0.110675,-0.134904,...,-0.022194,-0.004605,-0.003859,0.037744,-0.016773,0.010361,-0.003088,-0.010093,0.00645,0.000647
3,-0.239696,-0.134002,-0.048839,0.071776,0.528795,-0.293545,0.116879,0.061828,0.153445,0.031463,...,0.008758,0.008011,-7.4e-05,0.006118,0.006675,-0.008038,0.008269,0.009468,-0.004031,-0.006512
4,-0.281726,-0.217913,-0.30722,0.062001,0.035254,-0.097335,-0.111329,-0.090788,-0.172491,0.118281,...,-0.035867,-0.07767,0.014328,0.263785,-0.035473,0.013528,0.001369,0.003064,-0.006261,0.013245


## Save preprocessing model

In [14]:
import os
# Directory where the fitted preprocessing pipeline is stored.
path = "./models/"
# exist_ok=True keeps the cell safe to re-run, while still raising if the path
# exists but is not a directory (the previous try/except silently hid that case).
os.makedirs(path, exist_ok=True)

In [15]:
import joblib

# Persist the fitted pipeline; joblib returns the list of files it wrote.
joblib.dump(value=model, filename=path + "preprocessor.pk")

['./models/preprocessor.pk']

In [16]:
# Read the pipeline back from disk and check it still transforms the training texts.
loaded_model = joblib.load(filename=path + "preprocessor.pk")

pd.DataFrame(data=loaded_model.transform(df_train["text"])).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.036668,0.267426,-0.051297,-0.442746,0.060701,-0.019967,-0.312617,0.087557,-0.01084,0.451194,...,0.001856,0.009998,-0.009992,0.001146,-0.006991,-0.003178,-0.001889,-0.005881,-0.009393,0.00161
1,-0.315668,-0.238281,-0.289792,0.079855,-0.071224,0.025404,-0.078846,-0.096865,-0.126999,-0.143534,...,0.042603,-0.034969,0.039677,-0.046703,0.014894,0.007692,-0.001883,0.003188,0.008665,-0.01584
2,-0.269933,-0.170565,-0.146384,0.0091,-0.258595,0.108009,-0.056108,0.143854,-0.110675,-0.134904,...,-0.022194,-0.004605,-0.003859,0.037744,-0.016773,0.010361,-0.003088,-0.010093,0.00645,0.000647
3,-0.239696,-0.134002,-0.048839,0.071776,0.528795,-0.293545,0.116879,0.061828,0.153445,0.031463,...,0.008758,0.008011,-7.4e-05,0.006118,0.006675,-0.008038,0.008269,0.009468,-0.004031,-0.006512
4,-0.281726,-0.217913,-0.30722,0.062001,0.035254,-0.097335,-0.111329,-0.090788,-0.172491,0.118281,...,-0.035867,-0.07767,0.014328,0.263785,-0.035473,0.013528,0.001369,0.003064,-0.006261,0.013245


## Save feature space for model training

In [17]:
# Inputs: raw texts of each split pushed through the reloaded pipeline.
X_train, X_test = (
    pd.DataFrame(loaded_model.transform(split["text"]))
    for split in (df_train, df_test)
)

# Targets: the cogn_func column of each split.
y_train, y_test = df_train["cogn_func"], df_test["cogn_func"]


In [18]:
# Make sure the output directory exists before writing: nothing else in the
# notebook creates it, and to_csv fails when the target directory is missing.
features_dir = "./../data/features/"
os.makedirs(features_dir, exist_ok=True)

# Row labels are dropped (index=False), so inputs and targets pair up by row position.
X_train.to_csv(features_dir + "in_features_train.csv", index=False)
y_train.to_csv(features_dir + "out_features_train.csv", index=False)

X_test.to_csv(features_dir + "in_features_test.csv", index=False)
y_test.to_csv(features_dir + "out_features_test.csv", index=False)


## Save clean texts

In [22]:
# Clean every document (both splits) and keep the result next to the raw text.
clean_texts = clean_text(df["text"])
df["clean_text"] = clean_texts

In [23]:
# Export the full table, including the cleaned text column, without the row index.
df.to_excel(excel_writer="./../data/clean_text_labels.xlsx", index=False)

In [26]:
# Directory that receives one cleaned text file per document.
path = "./../data/text/clean/"
# makedirs also creates missing parent folders; exist_ok=True keeps the cell
# safe to re-run, while still raising if the path exists but is not a directory.
os.makedirs(path, exist_ok=True)


In [27]:
# Write each cleaned document to its own file, named after the source file.
# Iterating the two columns together pairs values by position, so the loop does
# not depend on the DataFrame having a default 0..n-1 index.
for file_name, cleaned in zip(df["file"], df["clean_text"]):
    # An explicit encoding keeps the output identical across platforms instead
    # of depending on the locale's default encoding.
    with open(path + file_name, "w", encoding="utf-8") as f:
        f.write(cleaned)