In [25]:
from modules.text_processing import *
import pandas as pd

## Load Data

In [26]:
# Inputs: the folder holding the raw .txt reports and the spreadsheet with their labels.
dir_path = "./../data/text/raw/"
text_labels_file = "./../data/text_labels.xlsx"

# Build one frame from both sources (the preview shows file, cogn_func, label, text).
df = get_text_dataframe(dir_path, text_labels_file)
df.head()

Unnamed: 0,file,cogn_func,label,text
0,00_Caida piso mojado.txt,ATTENTION,0,Lecciones Aprendidas\n\nAccidente de trabajo\n...
1,01_Caída de altura.txt,PERCEPTION,2,LECCIONES APRENDIDAS\n\nTipo de Accidente: Caí...
2,02_ auxiliar_trafico_aprisionado_vehiculo.txt,WORKING MEMORY,1,Auxiliar de tráfico (Paletero - Señalelo) apri...
3,03_Caida de alturas_Lesiones Múltiples.txt,PERCEPTION,2,Lecciones aprendidas\n\nCaida de alturas \nLes...
4,04_Caida_alturas_montaje_estructura.txt,COGNITIVE FLEXIBILITY,3,LECCIONES APRENDIDAS\n\nCaída de alturas en mo...


In [27]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

## Clean and lemmatize text

In [28]:
# Run the shared cleaning helper over the raw text column of each split separately.
train_clean_texts = clean_text(df_train["text"])
test_clean_texts = clean_text(df_test["text"])

In [29]:
# Attach the cleaned text as a new `clean_text` column on each split.
# `assign` returns a fresh frame instead of writing into the objects handed
# back by `train_test_split`; plain column assignment on those can raise
# pandas' SettingWithCopyWarning. It also keeps the cell safe to re-run.
df_train = df_train.assign(clean_text=train_clean_texts)
df_test = df_test.assign(clean_text=test_clean_texts)
df_train.head()

Unnamed: 0,file,cogn_func,label,text,clean_text
31,31_Accidente minero por derrumbe_Vetas_Santand...,PERCEPTION,2,LECCIÓN APRENDIDA \n\n¿QUÉ PASÓ?\n\nEl sábado ...,leccion aprendido pasar sabado 29 abril 2017 1...
58,58_Accidente minero_explosión_sub_carbon_Topag...,ATTENTION,0,LECCIÓN APRENDIDA\n\n¿QUÉ PASÓ?\n\nSe presentó...,leccion aprendido pasar presentar desprendimie...
9,09_Cogido entre la cuchilla y troquel molino m...,WORKING MEMORY,1,LECCIONES APRENDIDAS \n \nTipo de Accidente: C...,leccion aprendido tipo accidente cogido cuchil...
5,05_Caida_trabajador_foso_ascensor.txt,WORKING MEMORY,1,Leccioens Aprendidas\n\nCaída de trabajador en...,leccioen aprendido caida trabajador foso ascen...
35,35_Accidente minero_explocion_metano_polvo_car...,ATTENTION,0,LECCIÓN APRENDIDA\n\n¿QUÉ PASÓ?\nSe presento a...,leccion aprendido pasar presento 9:10 a.m expl...


## TF-IDF Representation

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training split only, so the test split never
# contributes to what it learns.
vectorizer = TfidfVectorizer().fit(df_train["clean_text"])

vectorizer

In [31]:
# TF-IDF features for the training split, densified and wrapped in a DataFrame.
X_train = pd.DataFrame(
    vectorizer.transform(df_train["clean_text"]).toarray()
)
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2542,2543,2544,2545,2546,2547,2548,2549,2550,2551
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.114622,0.000000
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.041154,0.000000
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.022127,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.043680,0.000000
58,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
59,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
60,0.015319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.024032,0.0,0.000000,0.000000,0.023858,0.026441


In [32]:
# TF-IDF features for the test split, produced by the vectorizer fitted on the training split.
X_test = pd.DataFrame(
    vectorizer.transform(df_test["clean_text"]).toarray()
)
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2542,2543,2544,2545,2546,2547,2548,2549,2550,2551
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171583,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.185698,0.0,0.0,0.0,0.044821,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023599,0.0
5,0.040254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062691,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069096,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062433,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.027686,0.0,0.0,0.0,0.042124,0.0,0.0,0.0,0.0,0.0


## Dimension Reduction (PCA)

In [33]:
from sklearn.decomposition import PCA

# n_components=0.999 is a fraction, i.e. a target share of explained variance
# rather than a fixed component count (see scikit-learn's PCA documentation).
pca_model = PCA(n_components=0.999, svd_solver="full").fit(X_train)

# Report, one per line: number of retained components, the variance ratio of
# each component, and the running total of those ratios.
print(
    len(pca_model.components_),
    pca_model.explained_variance_ratio_,
    pca_model.explained_variance_ratio_.cumsum(),
    sep="\n",
)


61
[0.07809056 0.05592071 0.04619438 0.03522957 0.03234697 0.02790328
 0.02597056 0.02506347 0.02439934 0.02337592 0.02274576 0.02214936
 0.02130004 0.01996572 0.01981541 0.01897157 0.01841944 0.01766295
 0.01753123 0.01727421 0.01673788 0.01626047 0.01602097 0.0157389
 0.01522129 0.01482643 0.01448735 0.01400931 0.01362713 0.01341067
 0.01327719 0.01305393 0.01263765 0.01248629 0.01228131 0.01187716
 0.01175897 0.0114695  0.01119296 0.01074925 0.01053329 0.01029421
 0.01009885 0.00976085 0.00947447 0.00922507 0.00887764 0.00866094
 0.00811045 0.00795733 0.00768949 0.00744322 0.00728078 0.00684922
 0.00646963 0.0061351  0.00560754 0.00516774 0.0046453  0.00441805
 0.00384577]
[0.07809056 0.13401128 0.18020566 0.21543522 0.24778219 0.27568547
 0.30165603 0.3267195  0.35111884 0.37449476 0.39724052 0.41938987
 0.44068992 0.46065563 0.48047105 0.49944262 0.51786206 0.535525
 0.55305623 0.57033044 0.58706831 0.60332878 0.61934975 0.63508865
 0.65030994 0.66513637 0.67962373 0.69363304 0.70

In [34]:
# Project the training TF-IDF features onto the fitted principal components.
X_train_reduced = pd.DataFrame(
    pca_model.transform(X_train)
)
X_train_reduced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,-0.121607,-0.129949,-0.004596,-0.035284,0.01973,-0.000249,0.002859,0.175009,0.092392,-0.057765,...,0.009239,0.084233,-0.009223,0.020732,-0.002023,0.010094,-0.002836,0.004616,0.007062,0.004111
1,-0.248473,-0.209928,0.069717,-0.12355,-0.031036,-0.111278,0.012286,0.057159,0.206929,-0.030913,...,0.018945,-0.136356,0.16671,-0.145263,-0.049031,-0.039348,-0.021999,-0.023084,-0.000772,0.003372
2,0.473518,0.169634,0.621895,-0.031874,0.010119,-0.010231,-0.100916,-0.035923,0.063699,-0.033757,...,0.069483,-0.034021,-0.035737,-0.000932,0.018974,-0.047976,-0.022381,-0.005721,-0.301342,-0.003792
3,0.236973,0.056375,0.030424,0.011889,-0.002321,0.008335,0.458011,0.188188,-0.262491,-0.016177,...,-0.001057,0.022258,-0.001741,-0.007414,0.001628,0.003646,0.004839,0.008855,8.2e-05,0.000633
4,-0.301833,0.488078,-0.089603,-0.211645,-0.018372,-0.062768,0.055141,-0.006294,0.098792,-0.037957,...,0.009975,-0.10556,-0.071598,0.150588,0.380965,0.050967,-0.087849,0.019152,0.003092,-0.007331


In [35]:
# Dimensionality before vs. after PCA, as (rows, columns) of each frame.
(X_train.shape, X_train_reduced.shape)

((62, 2552), (62, 61))

## Set everything in a pipeline

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# One estimator for the whole preprocessing chain:
# raw text -> cleaned text -> TF-IDF matrix -> array conversion -> PCA projection.
model = Pipeline(
    steps=[
        ("cleaner", FunctionTransformer(clean_text)),
        ("tfidf", TfidfVectorizer()),
        ("toarray", FunctionTransformer(to_array)),
        ("pca", PCA(n_components=0.999, svd_solver="full")),
    ]
)

# Fit on the raw training text; the pipeline performs the cleaning itself.
model.fit(df_train["text"])

In [37]:
# Sanity check: the pipeline output should match the step-by-step PCA frame shown earlier.
pd.DataFrame(model.transform(df_train["text"])).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,-0.121607,-0.129949,-0.004596,-0.035284,0.01973,-0.000249,0.002859,0.175009,0.092392,-0.057765,...,0.009239,0.084233,-0.009223,0.020732,-0.002023,0.010094,-0.002836,0.004616,0.007062,0.004111
1,-0.248473,-0.209928,0.069717,-0.12355,-0.031036,-0.111278,0.012286,0.057159,0.206929,-0.030913,...,0.018945,-0.136356,0.16671,-0.145263,-0.049031,-0.039348,-0.021999,-0.023084,-0.000772,0.003372
2,0.473518,0.169634,0.621895,-0.031874,0.010119,-0.010231,-0.100916,-0.035923,0.063699,-0.033757,...,0.069483,-0.034021,-0.035737,-0.000932,0.018974,-0.047976,-0.022381,-0.005721,-0.301342,-0.003792
3,0.236973,0.056375,0.030424,0.011889,-0.002321,0.008335,0.458011,0.188188,-0.262491,-0.016177,...,-0.001057,0.022258,-0.001741,-0.007414,0.001628,0.003646,0.004839,0.008855,8.2e-05,0.000633
4,-0.301833,0.488078,-0.089603,-0.211645,-0.018372,-0.062768,0.055141,-0.006294,0.098792,-0.037957,...,0.009975,-0.10556,-0.071598,0.150588,0.380965,0.050967,-0.087849,0.019152,0.003092,-0.007331


## Save preprocessing model

In [38]:
import os

# Folder where the fitted preprocessing pipeline is stored.
path = "./models/"

# exist_ok=True makes this a no-op when the folder is already there, replacing
# the try/except around os.mkdir. Unlike that version, it still raises if
# `path` exists but is a regular file rather than a directory.
os.makedirs(path, exist_ok=True)

In [39]:
import joblib

# Persist the fitted pipeline; joblib.dump returns the list of files it wrote.
joblib.dump(model, f"{path}preprocessor.pk")

['./models/preprocessor.pk']

In [40]:
# Reload the pipeline from disk and preview its output on the training text to
# confirm it survives the save/load round trip.
# NOTE: joblib files are pickle-based; only load files you produced yourself.
loaded_model = joblib.load(f"{path}preprocessor.pk")

pd.DataFrame(loaded_model.transform(df_train["text"])).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,-0.121607,-0.129949,-0.004596,-0.035284,0.01973,-0.000249,0.002859,0.175009,0.092392,-0.057765,...,0.009239,0.084233,-0.009223,0.020732,-0.002023,0.010094,-0.002836,0.004616,0.007062,0.004111
1,-0.248473,-0.209928,0.069717,-0.12355,-0.031036,-0.111278,0.012286,0.057159,0.206929,-0.030913,...,0.018945,-0.136356,0.16671,-0.145263,-0.049031,-0.039348,-0.021999,-0.023084,-0.000772,0.003372
2,0.473518,0.169634,0.621895,-0.031874,0.010119,-0.010231,-0.100916,-0.035923,0.063699,-0.033757,...,0.069483,-0.034021,-0.035737,-0.000932,0.018974,-0.047976,-0.022381,-0.005721,-0.301342,-0.003792
3,0.236973,0.056375,0.030424,0.011889,-0.002321,0.008335,0.458011,0.188188,-0.262491,-0.016177,...,-0.001057,0.022258,-0.001741,-0.007414,0.001628,0.003646,0.004839,0.008855,8.2e-05,0.000633
4,-0.301833,0.488078,-0.089603,-0.211645,-0.018372,-0.062768,0.055141,-0.006294,0.098792,-0.037957,...,0.009975,-0.10556,-0.071598,0.150588,0.380965,0.050967,-0.087849,0.019152,0.003092,-0.007331


## Save feature space for model training

In [41]:
# Targets: the `label` column of each split.
y_train, y_test = df_train["label"], df_test["label"]

### Non-PCA feature space

In [42]:
# Non-PCA features: run only the first three pipeline steps (everything before
# "pca") on the raw text of each split, train first and then test.
X_train, X_test = (
    pd.DataFrame(loaded_model[:3].transform(split.text))
    for split in (df_train, df_test)
)

### PCA feature space

In [43]:
# PCA features: run the complete loaded pipeline on the raw text of each split.
X_train_pca, X_test_pca = [
    pd.DataFrame(loaded_model.transform(split.text))
    for split in (df_train, df_test)
]

In [44]:
# Destination folder for every exported feature/label table.
features_dir = "./../data/features/"

# Create the folder up front: `to_csv` does not create missing directories, and
# the other output folders in this notebook are created explicitly as well.
os.makedirs(features_dir, exist_ok=True)

# Training split: raw TF-IDF features, PCA-reduced features, and labels.
X_train.to_csv(features_dir + "in_features_train.csv", index=False)
X_train_pca.to_csv(features_dir + "pca_in_features_train.csv", index=False)
y_train.to_csv(features_dir + "out_features_train.csv", index=False)

# Test split: same three tables.
X_test.to_csv(features_dir + "in_features_test.csv", index=False)
X_test_pca.to_csv(features_dir + "pca_in_features_test.csv", index=False)
y_test.to_csv(features_dir + "out_features_test.csv", index=False)


## Save clean texts

In [45]:
# Clean every document in the full (unsplit) frame and store the result next to the raw text.
clean_texts = clean_text(df["text"])
df["clean_text"] = clean_texts

In [46]:
# Save the full frame, now including the clean_text column, without the index column.
df.to_excel(
    excel_writer="./../data/clean_text_labels.xlsx",
    index=False,
)

In [47]:
# Folder that receives one cleaned .txt file per document.
# NOTE: this reassigns `path`, which earlier pointed at the models folder.
path = "./../data/text/clean/"

# exist_ok=True makes this a no-op when the folder is already there, replacing
# the try/except around os.mkdir. Unlike that version, it still raises if
# `path` exists but is a regular file rather than a directory.
os.makedirs(path, exist_ok=True)


In [48]:
# Write each cleaned document to its own file, reusing the original file name.
# Pairing the two columns with zip avoids label-based lookups such as
# df.file[i], which only work while the index is exactly 0..n-1.
for file_name, cleaned in zip(df["file"], df["clean_text"]):
    # Explicit UTF-8 so the written bytes do not depend on the platform's
    # default text encoding.
    with open(path + file_name, "w", encoding="utf-8") as f:
        f.write(cleaned)