# Pipeline

In [2]:
#Instalaciones

#Libreria para remplazar numeros por palabras
!pip install num2words
#Instalar  pandas profiler
#!pip install pandas-profiling==2.7.1
#deteccion de lenguaje para eliminar entradas que no esten en español
!pip install langdetect
#Procesamientno de lenguaje natural en español
!pip install stanza



In [3]:
#imports para procesamiento de texto

#tokenizacion y lematizacion
import stanza
#Para integrar pasos de la limpieza adicionales
from stanza.pipeline.processor import Processor, register_processor
#paquete español
stanza.download('es')

#Para manejo de numeros, singulares y plurales en lenguaje
from num2words import num2words
#Deteccion de lenguaje
from langdetect import detect
# librería Natural Language Toolkit, usada para trabajar con textos
import nltk
# Descarga la lista de stopwords (palabras vacías) de NLTK.
nltk.download('stopwords')
from nltk.corpus import stopwords

#Operaciones con expresiones regulares y unicode
import re, string, unicodedata

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-10-26 09:43:11 INFO: Downloading default packages for language: es (Spanish) ...
2023-10-26 09:43:13 INFO: File exists: C:\Users\andre\stanza_resources\es\default.zip
2023-10-26 09:43:16 INFO: Finished downloading models and saved to C:\Users\andre\stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Imports generales para analisis de datos y ML
import statistics
import sys

import numpy as np
import pandas as pd
import sklearn
from pandas_profiling import ProfileReport

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

  from pandas_profiling import ProfileReport


## 1. Lectura de los datos

In [None]:
# Load the dataset from the Excel workbook into a DataFrame.
df_ODS = pd.read_excel(io="cat_6716.xlsx")

# Preview the first rows to confirm the file was parsed.
df_ODS.head(n=5)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that turns the ``Textos_espanol`` column into cleaned lemma lists.

    ``transform`` returns a copy of the input DataFrame with an extra ``words``
    column. Each cell of that column is a list with the lower-cased lemmas of the
    row's text after dropping punctuation/symbol tokens, stripping non-Latin
    characters, spelling out integer tokens in Spanish and removing Spanish
    stop words.
    """

    # Spanish stop-word set; filled on first use so the corpus is read only once.
    _stopwords_es = None

    def __init__(self):
        # The stanza pipeline is created on the first call to tokenLemma and reused.
        self._nlp = None

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the cleaned ``words`` column added."""
        return self.customPreprocessing(X)

    @staticmethod
    def replace_numbers(words):
        """Replace every integer token in ``words`` with its Spanish spelling.

        Tokens made only of decimal digits are converted with ``num2words``;
        all other tokens are returned untouched.
        """
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
        # so tokens such as a lone superscript digit are left as they are.
        return [
            num2words(int(word), lang='es') if word.isdecimal() else word
            for word in words
        ]

    @staticmethod
    def remove_nonlatin(words):
        """Strip from each token every character that is not Latin, a digit or a space.

        The returned list has the same length as ``words``; a token made only of
        removed characters becomes an empty string.
        """
        kept_prefixes = ('LATIN', 'DIGIT', 'SPACE')
        return [
            # The '' default avoids the ValueError that unicodedata.name raises
            # for unnamed characters (e.g. control characters).
            ''.join(ch for ch in word if unicodedata.name(ch, '').startswith(kept_prefixes))
            for word in words
        ]

    @staticmethod
    def remove_stopwords(words):
        """Remove Spanish stop words from a list of tokens."""
        if CustomPreprocessor._stopwords_es is None:
            CustomPreprocessor._stopwords_es = frozenset(stopwords.words('spanish'))
        stop_set = CustomPreprocessor._stopwords_es
        return [word for word in words if word not in stop_set]

    @staticmethod
    def remove_punctuation(words):
        """Replace every character that is neither a word character nor whitespace
        with a space.

        ``words`` may be a single string (the way the pipeline uses it) or an
        iterable of strings; in both cases the pieces are concatenated into one
        string.
        """
        return ''.join(re.sub(r'[^\w\s]', ' ', word) for word in words)

    def tokenLemma(self, data):
        """Run stanza tokenization and lemmatization over ``data['Textos_espanol']``.

        As a side effect, ``data['words']`` is set to the punctuation-free text of
        each row. Returns the list of processed stanza documents, one per row.
        """
        data['words'] = data['Textos_espanol'].apply(self.remove_punctuation)
        # Build the tokenization/lemmatization pipeline once and reuse it afterwards.
        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            nlp = stanza.Pipeline('es', processors='tokenize,mwt,pos,lemma', use_gpu=True)
            self._nlp = nlp
        in_docs = [stanza.Document([], text=text) for text in data['words']]
        return nlp(in_docs)

    @staticmethod
    def procesamientoPalabras(words):
        """Clean one list of tokens: non-Latin characters, numbers, stop words."""
        words = CustomPreprocessor.remove_nonlatin(words)
        words = CustomPreprocessor.replace_numbers(words)
        words = CustomPreprocessor.remove_stopwords(words)
        # Tokens emptied by remove_nonlatin carry no information; drop them.
        return [word for word in words if word]

    def customPreprocessing(self, data):
        """Main preprocessing routine.

        Works on a copy so the caller's DataFrame is not modified, lemmatizes each
        text, discards PUNCT and SYM tokens and stores the cleaned lemma list of
        each row in the ``words`` column of the returned frame.
        """
        data = data.copy()
        out_docs = self.tokenLemma(data)

        palabras = []
        for doc in out_docs:
            reviewAct = [
                # NOTE(review): falls back to the surface form in case stanza
                # leaves the lemma unset for a token - confirm whether that can occur.
                (word.lemma or word.text).lower()
                for sentence in doc.sentences
                for word in sentence.words
                if word.pos not in ('PUNCT', 'SYM')
            ]
            palabras.append(self.procesamientoPalabras(reviewAct))

        data['words'] = palabras
        return data


In [None]:
class CustomRegression(BaseEstimator, TransformerMixin):
    """Pipeline step that fits TF-IDF + logistic regression on the ``words`` column.

    ``fit`` reads the token lists from ``X['words']`` and the labels from
    ``X['sdg']``, holds out 20% of the rows, and trains on the rest.

    Attributes set by ``fit``:
        model: the fitted ``LogisticRegression``.
        vectorizer: the ``TfidfVectorizer`` fitted on the training texts.
        test_vectors: TF-IDF matrix of the held-out texts.
        y_test: labels of the held-out rows.
    """

    def __init__(self):
        self.model = None

    @staticmethod
    def _join_tokens(tokens):
        """Join a token list into one space-separated string (strings pass through)."""
        if isinstance(tokens, str):
            return tokens
        return ' '.join(map(str, tokens))

    def fit(self, X, y=None):
        """Train the classifier on the 80% split of ``X``; ``y`` is ignored.

        ``X`` is not modified, so calling ``fit`` again on the same frame gives
        the same result.
        """
        corpus = X['words'].apply(self._join_tokens)
        labels = X['sdg']

        # Split the data into train and test sets (fixed seed for reproducibility).
        X_train, X_test, y_train, y_test = train_test_split(
            corpus, labels, test_size=0.2, random_state=0
        )

        # Vectorize the texts with TF-IDF; the vocabulary is learned on train only.
        self.vectorizer = TfidfVectorizer()
        train_vectors = self.vectorizer.fit_transform(X_train)

        # Keep the hold-out split so it can be evaluated after fitting.
        self.test_vectors = self.vectorizer.transform(X_test)
        self.y_test = y_test

        self.model = LogisticRegression().fit(train_vectors, y_train)

        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    def predict(self, X):
        """Predict labels for a frame whose ``words`` column holds token lists.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if getattr(self, 'model', None) is None:
            raise RuntimeError("CustomRegression must be fitted before calling predict")
        corpus = X['words'].apply(self._join_tokens)
        return self.model.predict(self.vectorizer.transform(corpus))

In [None]:
# Chain the two custom steps: text preprocessing first, then the model step.
custom_preprocessor = CustomPreprocessor()
custom_regression = CustomRegression()
pipeline = Pipeline(
    steps=[("processing", custom_preprocessor), ("model", custom_regression)]
)

In [None]:
# Fit every pipeline step on the raw DataFrame (no separate target is passed).
pipeline.fit(X=df_ODS)

In [None]:
# Pull the fitted classifier out of the pipeline's "model" step.
reg = pipeline.named_steps["model"].model


In [None]:
# The hold-out matrix is built inside CustomRegression.fit, so no notebook-level
# `test_vectors` variable exists; read it from the fitted step instead.
test_vectors = getattr(pipeline["model"], "test_vectors", None)
if test_vectors is None:
    raise RuntimeError(
        "The fitted 'model' step does not expose `test_vectors`; "
        "the hold-out matrix must be stored on the estimator during fit()."
    )
y_pred = reg.predict(test_vectors)