In [1]:
import pandas as pd
from modules.text_processing import *

## Load Data

In [2]:
# Input locations, relative to the notebook's working directory.
dir_path = "./../data/text/raw/"
text_labels_file = "./../data/text_labels.xlsx"

# Build the working frame via the project helper. The preview shows the
# columns `file`, `cogn_func` and `text`.
# NOTE(review): presumably one row per raw .txt file joined with its label
# from the workbook -- confirm in modules.text_processing.
df = get_text_dataframe(dir_path, text_labels_file)
df.head()

Unnamed: 0,file,cogn_func,text
0,0000_000_999-accidentes-trabajo.txt,3,lecciones aprendidas\n\naccidente de trabajo\n...
1,31-5_Caida_desde_escalera_de_silo.txt,3,lecciones aprendidas\n\ntipo de accidente: caí...
2,012015-Lecciones-aprendidas.txt,4,descripción de caso\n\nel 08 de octubre de 201...
3,auxiliar_trafico_aprisionado_vehiculo.txt,5,auxiliar de tráfico (paletero - señalelo) apri...
4,caida_alturas.txt,6,lecciones aprendidas\n\ncaida de alturas \nles...


## Clean and lemmatize text

In [3]:
# Run the project's clean_text helper over the raw `text` column.
# NOTE(review): per the section heading this cleans and lemmatizes; the exact
# steps live in modules.text_processing.
clean_docs = clean_text(df["text"])

In [4]:
# Eyeball the first cleaned document as a quick sanity check of clean_text.
clean_docs[0]

'lección aprendido accidente trabajo caída piso mojado pasar funcionario ugc compañero trabajo encontrar realizar limpieza piso madera instalación universidad bajar piso trapero limpio compañero trabajo verter removedor guarda- escoba ocasionar desplazamiento líquido generara peligro caída momento regresara continuar actividad limpieza consecuencia golpe espalda codo antebrazo cabeza causo falta comunicación compañero trabajo comunicar advertencia peligro evidenciado ausencia señalización desarrollo labor elemento trabajo encontrar alcance verificar condición piso evitar ocurrir nuevamente 1 actividad realizar compañero trabajo deber mantener comunicación asertivo fluido contribuir cumplimiento objetivo meta organizacional evitar accidente trabajo 2 camín mirar dirección paso reportar condición inseguro vea evitar ocurrencia accidente 3 prestar atención señalización ubicado piso mojado limpiar evitar caída 4 utilizar elemento protección personal adecuado realización labor 5 utilizar ca

In [5]:
# Keep the cleaned text next to the raw text so both can be compared row by row.
# This adds (or overwrites) the `clean_text` column on `df` in place.
df["clean_text"] = clean_docs
df.head()

Unnamed: 0,file,cogn_func,text,clean_text
0,0000_000_999-accidentes-trabajo.txt,3,lecciones aprendidas\n\naccidente de trabajo\n...,lección aprendido accidente trabajo caída piso...
1,31-5_Caida_desde_escalera_de_silo.txt,3,lecciones aprendidas\n\ntipo de accidente: caí...,lección aprendido tipo accidente caída altura ...
2,012015-Lecciones-aprendidas.txt,4,descripción de caso\n\nel 08 de octubre de 201...,descripción caso 08 octubre 2014 4:55 p.m veni...
3,auxiliar_trafico_aprisionado_vehiculo.txt,5,auxiliar de tráfico (paletero - señalelo) apri...,auxiliar tráfico paletero señalelo aprisionado...
4,caida_alturas.txt,6,lecciones aprendidas\n\ncaida de alturas \nles...,lección aprendido caer alturos lesión múltiple...


## TF-IDF Representation

In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# first cell with the other imports.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with scikit-learn's default settings (no arguments overridden).
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the cleaned documents and compute their TF-IDF
# weights in one pass; `tfidf` is converted to a dense array in a later cell.
tfidf = vectorizer.fit_transform(clean_docs)

In [7]:
# Densify the TF-IDF matrix and wrap it in a DataFrame in one step.
# Rows follow the order of the input documents; columns are labelled by
# their integer position in the matrix, not by the vocabulary term.
X = pd.DataFrame(tfidf.toarray())
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6732,6733,6734,6735,6736,6737,6738,6739,6740,6741
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083033,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
85,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
86,0.072307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
87,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [8]:
# (rows, columns) of the dense TF-IDF feature frame.
X.shape

(89, 6742)

In [9]:
# Target vector: the `cogn_func` label column of the loaded frame.
y = df["cogn_func"]

In [10]:
# Length of the label vector; it should equal the number of rows in X.
y.shape

(89,)

## Save feature space for model training

In [11]:
from pathlib import Path

# Persist the feature space for the model-training step.
# Both files are written without the pandas index, so inputs and labels are
# paired by row order only -- refuse to write if their lengths differ.
if len(X) != len(y):
    raise ValueError(
        f"feature rows ({len(X)}) and labels ({len(y)}) are out of sync"
    )

# to_csv does not create missing folders; make sure the target directory
# exists so the cell also works on a fresh checkout.
features_dir = Path("./../data/features")
features_dir.mkdir(parents=True, exist_ok=True)

y.to_csv(features_dir / "out_features.csv", index=False)
X.to_csv(features_dir / "in_features.csv", index=False)