In [1]:
from modules.text_processing import *
import pandas as pd

## Load Data

In [2]:
# Inputs: the folder holding the raw text files and the labels spreadsheet.
dir_path = "./../data/text/raw/"
text_labels_file = "./../data/text_labels.xlsx"

# Build the working frame with the project helper, then preview the first rows.
df = get_text_dataframe(dir_path, text_labels_file)
df.head(n=5)

Unnamed: 0,file,cogn_func,text
0,0000_000_999-accidentes-trabajo.txt,3,lecciones aprendidas\n\naccidente de trabajo\n...
1,31-5_Caida_desde_escalera_de_silo.txt,3,lecciones aprendidas\n\ntipo de accidente: caí...
2,012015-Lecciones-aprendidas.txt,4,descripción de caso\n\nel 08 de octubre de 201...
3,auxiliar_trafico_aprisionado_vehiculo.txt,5,auxiliar de tráfico (paletero - señalelo) apri...
4,caida_alturas.txt,6,lecciones aprendidas\n\ncaida de alturas \nles...


## Clean and lemmatize text

In [3]:
# Run the project's cleaning helper over the raw text column; the result is
# reused below both as a new column and as the TF-IDF input.
clean_docs = clean_text(df["text"])

In [4]:
# Spot-check: display the first cleaned document.
clean_docs[0]

'leccion aprendido accidente trabajo caida piso mojado pasar funcionario ugc companero trabajo encontrar realizar limpieza piso madera instalacion universidad bajar piso trapero limpio companero trabajo verter removedor guarda- escoba ocasionar desplazamiento liquido generara peligro caida momento regresara continuar actividad limpieza consecuencia golpe espalda codo antebrazo cabeza causo falta comunicacion companero trabajo comunicar advertencia peligro evidenciado ausencia senalizacion desarrollo labor elemento trabajo encontrar alcance verificar condicion piso evitar ocurrir nuevamente 1 actividad realizar companero trabajo deber mantener comunicacion asertivo fluido contribuir cumplimiento objetivo meta organizacional evitar accidente trabajo 2 camin mirar direccion paso reportar condicion inseguro vea evitar ocurrencia accidente 3 prestar atencion senalizacion ubicado piso mojado limpiar evitar caida 4 utilizar elemento proteccion personal adecuado realizacion labor 5 utilizar ca

In [5]:
# Keep the cleaned text next to the raw text and labels in the same frame.
df["clean_text"] = clean_docs

# Preview the frame with the new column.
df.head(n=5)

Unnamed: 0,file,cogn_func,text,clean_text
0,0000_000_999-accidentes-trabajo.txt,3,lecciones aprendidas\n\naccidente de trabajo\n...,leccion aprendido accidente trabajo caida piso...
1,31-5_Caida_desde_escalera_de_silo.txt,3,lecciones aprendidas\n\ntipo de accidente: caí...,leccion aprendido tipo accidente caida altura ...
2,012015-Lecciones-aprendidas.txt,4,descripción de caso\n\nel 08 de octubre de 201...,descripcion caso 08 octubre 2014 4:55 p.m veni...
3,auxiliar_trafico_aprisionado_vehiculo.txt,5,auxiliar de tráfico (paletero - señalelo) apri...,auxiliar trafico paletero senalelo aprisionado...
4,caida_alturas.txt,6,lecciones aprendidas\n\ncaida de alturas \nles...,leccion aprendido caida altura lesion multiple...


## TF-IDF Representation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Default-configured vectorizer, kept in its own variable so the fitted
# object stays available after this cell.
vectorizer = TfidfVectorizer()

# Fit on the cleaned documents and get their TF-IDF matrix in a single pass.
tfidf = vectorizer.fit_transform(clean_docs)

In [7]:
# Densify the TF-IDF matrix and wrap it in a DataFrame; columns get the
# default integer labels (no column names are passed).
X = pd.DataFrame(tfidf.toarray())

# Display the feature matrix.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6587,6588,6589,6590,6591,6592,6593,6594,6595,6596
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025802,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
85,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066618,0.0,0.0
86,0.072515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061855,0.0,0.0
87,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


## Dimension Reduction (PCA)

In [8]:
from sklearn.decomposition import PCA

# A fractional n_components together with the "full" solver asks PCA to keep
# as many components as needed to reach that share of explained variance.
model = PCA(n_components=0.999, svd_solver="full").fit(X)

# Report how many components were kept, then the per-component and the
# cumulative explained-variance ratios.
print(model.n_components_)
variance_ratio = model.explained_variance_ratio_
print(variance_ratio)
print(variance_ratio.cumsum())


88
[0.11469942 0.06510193 0.04308376 0.03225484 0.02925536 0.02406869
 0.02059237 0.02016752 0.01852251 0.01727333 0.01597983 0.01554869
 0.0151415  0.01489738 0.01454894 0.01430357 0.01378172 0.01350554
 0.01338368 0.01301559 0.01259749 0.01248622 0.01239148 0.01216536
 0.01193573 0.01158898 0.01143093 0.01121923 0.01106828 0.01102934
 0.01085465 0.01063824 0.01038554 0.0100871  0.01001956 0.00963367
 0.00958168 0.00921634 0.008991   0.00894604 0.0087927  0.00846764
 0.00835669 0.00821176 0.00806826 0.00781015 0.00773581 0.007492
 0.00743891 0.00728293 0.00726184 0.0069744  0.00688185 0.00678154
 0.00667052 0.00657879 0.00619333 0.00608429 0.00596015 0.00580386
 0.00555526 0.00537022 0.00527249 0.00515179 0.00501212 0.00491591
 0.00483732 0.00459845 0.00449115 0.00428758 0.00427486 0.00423523
 0.00410642 0.00401412 0.00398921 0.00391518 0.00370647 0.00357164
 0.00355351 0.00348082 0.00332252 0.00310092 0.00298425 0.00280462
 0.00264507 0.00250549 0.00230282 0.00175409]
[0.11469942 0.1

In [9]:
# Project the TF-IDF features onto the fitted principal components.
X_reduced = pd.DataFrame(data=model.transform(X))

# Preview the reduced representation.
X_reduced.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,78,79,80,81,82,83,84,85,86,87
0,0.027245,0.162571,-0.01095,0.252698,0.054891,0.001483,-0.047786,-0.024961,-0.066245,0.004816,...,-0.015106,-0.00312,-0.009043,-0.006823,-0.008765,0.00559,0.008443,0.004874,0.001226,0.004628
1,-0.012194,0.370349,-0.054225,0.07347,0.045042,0.037502,-0.143492,-0.066471,-0.191638,0.068985,...,-0.012016,-0.006999,0.018802,-0.003092,-0.014899,-0.002096,0.010789,-0.006885,0.001836,0.004576
2,0.029696,0.128414,0.011264,0.181847,0.020077,-0.031522,-0.278802,-0.008568,-0.119754,-0.159438,...,0.00967,0.001104,0.002638,-0.000128,0.004262,0.007745,-0.007039,-0.003902,-0.000409,0.000222
3,-0.062627,0.189679,0.001924,0.274207,0.060717,-0.009176,-0.152615,0.012097,-0.040916,0.073484,...,-0.005691,0.017604,0.001174,-0.000217,-0.003895,0.002812,0.004648,0.004048,0.004887,0.000969
4,-0.04475,0.192658,-0.015283,0.270883,0.067603,0.008176,-0.327541,-0.082356,-0.223674,0.032808,...,0.008036,-0.004089,-0.002787,-0.001624,0.000228,-0.003946,0.004227,0.005213,-0.001269,0.001132


In [10]:
# Compare the shapes of the full and the PCA-reduced feature matrices.
X.shape, X_reduced.shape

((89, 6597), (89, 88))

In [11]:
# Target variable: the cogn_func label column of the frame.
y = df["cogn_func"]

In [12]:
# Display the shape of the target vector.
y.shape

(89,)

## Save feature space for model training

In [13]:
# Write the target, the full TF-IDF features and the PCA-reduced features to
# CSV without the index column, in that order.
for frame, target in (
    (y, "./../data/features/out_features.csv"),
    (X, "./../data/features/in_features.csv"),
    (X_reduced, "./../data/features/in_features_reduced.csv"),
):
    frame.to_csv(target, index=False)

In [14]:
# Persist the labelled frame (including the clean_text column) as CSV.
# The target previously ended in ".xlsx" even though to_csv writes plain CSV
# text, producing a file that spreadsheet tools and pd.read_excel cannot open.
# The extension now matches the content.
# NOTE(review): anything that reads the old "clean_text_labels.xlsx" name must
# be pointed at this file instead.
df.to_csv("./../data/clean_text_labels.csv", index=False)

In [15]:
import os

# Output folder for the cleaned text files.
path = "./../data/text/clean/"

# Create the folder, including any missing parent folders; an existing folder
# is not an error. Unlike the earlier try/os.mkdir/except, this no longer
# fails when a parent is missing, and it still raises if `path` exists but is
# a regular file rather than deferring that failure to the write step.
os.makedirs(path, exist_ok=True)


In [16]:
# Write each cleaned document to its own file under `path`, reusing the
# original file name.
# - Iterate the two columns together instead of `df.file[i]`, which is a
#   label lookup and only works while the index is exactly 0..n-1.
# - Pin the encoding: the platform default is not UTF-8 everywhere, and the
#   corpus is Spanish text that may contain non-ASCII characters.
for file_name, cleaned in zip(df["file"], df["clean_text"]):
    with open(os.path.join(path, file_name), "w", encoding="utf-8") as f:
        f.write(cleaned)