In [95]:
import stanza

from sklearn.base import BaseEstimator, TransformerMixin


class Preprocesser(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X):
        alfa = 0.05
        beta = 0.05
        
        # eliminación de nulos
        df_work = X.dropna()
        # eliminación de duplicados
        df_work = df_work.drop_duplicates()
        
        # eliminación de longitudes atípicas
        df_work['Conteo'] = [len(x.split()) for x in df_work['Review']]
        upper = df_work['Conteo'].quantile(1-beta)
        lower = df_work['Conteo'].quantile(alfa)
        # menores que percentil mayor
        df_work = df_work[df_work['Conteo'] <= upper]
        # mayores que percentil menor
        df_work = df_work[df_work['Conteo'] >= lower]

        df_work = df_work.drop(['Conteo'], axis=1)
        
        return df_work


In [96]:
class Lemmatizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.stanza_pipeline = stanza.Pipeline(lang="es", processors="tokenize,mwt,pos,lemma")
    
    
    def fit(self, X, y=None):
        return self
    
    
    def stanza_preprocessing(self, words):
        doc = self.stanza_pipeline(words)
        lemmas = [w.lemma for w in doc.sentences[0].words]
        return lemmas
    
    def transform(self, X):
        print("Lemmatize transformation...")
        df_work = X
        df_work['Review'] = df_work['Review'].apply(self.stanza_preprocessing)
        return df_work
    
    def __getstate__(self):
        # Exclude the stanza_pipeline from serialization
        state = self.__dict__.copy()
        del state['stanza_pipeline']
        return state

    def __setstate__(self, state):
        # Restore the stanza_pipeline during deserialization
        self.__dict__.update(state)
        self.stanza_pipeline = stanza.Pipeline(lang="es", processors="tokenize,mwt,pos,lemma")
    

In [97]:
import nltk
import re
import unicodedata

def clean_text(words):
    """
    Cleans the list of words by performing various operations.

    :param words: (list) List of words to be cleaned.
    :returns: (list) Cleaned list of words.
    """
    words = [w.lower() for w in words]  # Lowercase
    words = [re.sub(r'[^\w\s]', '', word) for word in words if word is not None]  # Remove punctuation
    words = [unicodedata.normalize('NFKD', word).encode('utf-8', 'ignore').decode('utf-8', 'ignore') for word in words]  # Remove non-encoded characters
    languages = ['spanish']
    stopword = nltk.corpus.stopwords.words(languages)
    return [w for w in words if w not in stopword]  # Remove stopwords


class PostCleaner(BaseEstimator, TransformerMixin):
    
    def __init__(self, cleaning_func):
        self.cleaning_func = cleaning_func
    
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X):
        print("Post lemmatization cleaning transformation...")
        df_work = X.copy()
        df_work['Review'] = df_work['Review'].apply(self.cleaning_func)
        return df_work
    

In [98]:
from copy import deepcopy
from sklearn.feature_extraction.text import TfidfVectorizer


class Vectorizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.vectorizer_algorithm = TfidfVectorizer(
            decode_error='ignore',
            strip_accents='ascii',
            analyzer='word',
            max_features=10000,
        )
        self.features = None
        
        # remiendo. En la primera pasada, se genera el vocabulario
        self.already_fit = False
        
    
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X):
        data_stringfied = deepcopy(X)
        data_stringfied['Review'] = data_stringfied['Review'].apply(lambda x: ' '.join(map(str, x)))
        
        x_data = data_stringfied['Review']
        
        if self.already_fit:
            print("Vocabulario ya definido...")        
            x_data_vectorized_matrix = self.vectorizer_algorithm.transform(x_data)
        else:
            print("Creando vocabulario...")
            x_data_vectorized_matrix = self.vectorizer_algorithm.fit_transform(x_data)
            
        x_data_vectorized_df = pd.DataFrame(x_data_vectorized_matrix.toarray())  # ... for additional features from csr_matrix
        
        # obtiene el arreglo de palabras con columnas
        self.features = self.vectorizer_algorithm.get_feature_names_out()
        self.already_fit = True
        
        res = pd.concat([x_data_vectorized_df], axis=1)
        
        return res
    

In [99]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    steps=[
        ('lemmatizer', Lemmatizer()),
        ('post-cleaner', PostCleaner(cleaning_func=clean_text)),
        ('vectorizer', Vectorizer()),
        ('classifier', SVC(kernel='rbf', C=1, probability=True))
    ]
)

2024-04-20 13:17:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-20 13:17:09 INFO: Downloaded file to C:\Users\alvar\stanza_resources\resources.json
2024-04-20 13:17:10 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| mwt       | ancora          |
| pos       | ancora_charlm   |
| lemma     | ancora_nocharlm |

2024-04-20 13:17:10 INFO: Using device: cpu
2024-04-20 13:17:10 INFO: Loading: tokenize
2024-04-20 13:17:10 INFO: Loading: mwt
2024-04-20 13:17:10 INFO: Loading: pos
2024-04-20 13:17:10 INFO: Loading: lemma
2024-04-20 13:17:10 INFO: Done loading processors!


In [100]:
from sklearn.model_selection import train_test_split
import pandas as pd

df_big = pd.read_csv('../data/raw/tipo2_entrenamiento_estudiantes.csv', sep=',', encoding = 'utf-8')
df = df_big.sample(frac=0.03)

df = Preprocesser().fit_transform(df)

xtr, xts, ytr, yts = train_test_split(df['Review'], df['Class'], test_size=0.2, random_state=42069)
xtr = pd.DataFrame(xtr)
xts = pd.DataFrame(xts)
print(xtr.shape, type(xtr))
print(ytr.shape, type(ytr))
print(xts.shape, type(xts))
print(yts.shape, type(yts))

(170, 1) <class 'pandas.core.frame.DataFrame'>
(170,) <class 'pandas.core.series.Series'>
(43, 1) <class 'pandas.core.frame.DataFrame'>
(43,) <class 'pandas.core.series.Series'>


In [101]:
pipeline.fit(xtr, ytr)

Lemmatize transformation...
Post lemmatization cleaning transformation...
Creando vocabulario...


In [102]:
train_features = pipeline[2].features
train_features

array(['10', '1010', '1120', '1230am', '130am', '15', '180', '18092015',
       '19', '20', '200ds', '2013', '25', '30', '300', '34', '45', '715',
       '730', '8pm', '9pm', 'abajo', 'ac', 'acabado', 'acabar', 'acceder',
       'accesible', 'acceso', 'aceptable', 'acer', 'acero',
       'acondicionado', 'acorde', 'actividad', 'adaptar', 'ademas',
       'ademo', 'admirar', 'agosto', 'agradable', 'agrado', 'agua', 'ahi',
       'ahora', 'ahorratar', 'aire', 'alberca', 'alcoholico', 'alfombra',
       'alguno', 'alimento', 'alli', 'alma', 'almohada', 'alojamiento',
       'alojar', 'alrededor', 'alto', 'ama', 'amable', 'amargura',
       'ambiental', 'ambiente', 'ambulante', 'americana', 'americas',
       'amigo', 'amistad', 'amor', 'amurallado', 'angelopoli', 'animal',
       'ano', 'antepasado', 'antiguedad', 'antiguo', 'aparte', 'apestar',
       'apestoso', 'apreciar', 'aprender', 'aquel', 'aqui', 'argumentar',
       'arquitectonicamente', 'arquitectura', 'arrastrar', 'arriba',
  

In [103]:
pipeline.score(xts, yts)

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


0.27906976744186046

In [104]:
pred = pd.DataFrame({'Review': ['favorito']})
pred

Unnamed: 0,Review
0,favorito


In [105]:
pipeline.predict_proba(pred)

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


array([[0.12870395, 0.16333828, 0.20812274, 0.17812203, 0.32171299]])

In [106]:
from joblib import dump
# dump vectorizer
dump(pipeline['vectorizer'], '../models/vf7.joblib')
dump(pipeline, '../models/svm7.joblib')

['../models/svm7.joblib']

In [114]:
from joblib import load

vecto = load('../models/vf7.joblib')
new_pipe = load('../models/svm7.joblib')
new_pipe['vectorizer'].vectorizer_algorithm.get_feature_names_out()

2024-04-20 13:30:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-20 13:30:15 INFO: Downloaded file to C:\Users\alvar\stanza_resources\resources.json
2024-04-20 13:30:15 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| mwt       | ancora          |
| pos       | ancora_charlm   |
| lemma     | ancora_nocharlm |

2024-04-20 13:30:15 INFO: Using device: cpu
2024-04-20 13:30:15 INFO: Loading: tokenize
2024-04-20 13:30:15 INFO: Loading: mwt
2024-04-20 13:30:16 INFO: Loading: pos
2024-04-20 13:30:17 INFO: Loading: lemma
2024-04-20 13:30:17 INFO: Done loading processors!


array(['10', '1010', '1120', '1230am', '130am', '15', '180', '18092015',
       '19', '20', '200ds', '2013', '25', '30', '300', '34', '45', '715',
       '730', '8pm', '9pm', 'abajo', 'ac', 'acabado', 'acabar', 'acceder',
       'accesible', 'acceso', 'aceptable', 'acer', 'acero',
       'acondicionado', 'acorde', 'actividad', 'adaptar', 'ademas',
       'ademo', 'admirar', 'agosto', 'agradable', 'agrado', 'agua', 'ahi',
       'ahora', 'ahorratar', 'aire', 'alberca', 'alcoholico', 'alfombra',
       'alguno', 'alimento', 'alli', 'alma', 'almohada', 'alojamiento',
       'alojar', 'alrededor', 'alto', 'ama', 'amable', 'amargura',
       'ambiental', 'ambiente', 'ambulante', 'americana', 'americas',
       'amigo', 'amistad', 'amor', 'amurallado', 'angelopoli', 'animal',
       'ano', 'antepasado', 'antiguedad', 'antiguo', 'aparte', 'apestar',
       'apestoso', 'apreciar', 'aprender', 'aquel', 'aqui', 'argumentar',
       'arquitectonicamente', 'arquitectura', 'arrastrar', 'arriba',
  

In [152]:
pred = pd.DataFrame({'Review': ['asombroso  genial espectacular', 'mierda asqueroso horrible']})
pred

Unnamed: 0,Review
0,asombroso genial espectacular
1,mierda asqueroso horrible


In [188]:
probabilidades = new_pipe.predict_proba(pred.copy())

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


In [197]:
# consigue el mapeo de clases
clases = new_pipe['classifier'].classes_
# consigue el índice de mayor probabilidad para cada fila
indice_probable = [int(i) for i in np.argmax(probabilidades, axis=1)]
# mapea indices de mayor probabilidad con clases
clase_probable = [clases[i] for i in indice_probable]
# mapea probabilidades
score_clase = [probabilidades[i][indice_probable[i]] for i in range(len(probabilidades))]

df_respuesta = pd.DataFrame(columns=["Review", "Class", "Score"])
df_respuesta['Review'] = pred['Review']
df_respuesta['Class'] = clase_probable
df_respuesta['Score'] = score_clase
df_respuesta['Class'] = df_respuesta['Class'].astype(int)
df_respuesta['Score'] = df_respuesta['Score'].astype(float)
df_respuesta

Unnamed: 0,Review,Class,Score
0,asombroso genial espectacular,5,0.333038
1,mierda asqueroso horrible,5,0.24445


In [198]:
df_respuesta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Review  2 non-null      object 
 1   Class   2 non-null      int32  
 2   Score   2 non-null      float64
dtypes: float64(1), int32(1), object(1)
memory usage: 172.0+ bytes


In [202]:
df_respuesta.iloc[0]['Class']

5

In [76]:
vecto.vectorizer_algorithm.transform(pd.DataFrame({'Review': ['favorito']}))

<1x2060 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [77]:
new_vecto = TfidfVectorizer(
    decode_error='ignore',
    strip_accents='ascii',
    analyzer='word',
    vocabulary=vecto.features
)
dffit = pd.DataFrame({'Review': ['hola mundo 123 casa', 'prueba de fit']})
vtest = new_vecto.fit_transform(dffit)
dftest = pd.DataFrame(vtest.toarray())
dftest

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2050,2051,2052,2053,2054,2055,2056,2057,2058,2059
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
res = new_fitted_vecto.transform(pd.DataFrame({'Review': ['zona']}))
print(res[0])



In [80]:
ultra_pipe = load('../models/svm6.joblib')

2024-04-20 10:45:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-20 10:45:54 INFO: Downloaded file to C:\Users\alvar\stanza_resources\resources.json
2024-04-20 10:45:55 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| mwt       | ancora          |
| pos       | ancora_charlm   |
| lemma     | ancora_nocharlm |

2024-04-20 10:45:55 INFO: Using device: cpu
2024-04-20 10:45:55 INFO: Loading: tokenize
2024-04-20 10:45:55 INFO: Loading: mwt
2024-04-20 10:45:55 INFO: Loading: pos
2024-04-20 10:45:57 INFO: Loading: lemma
2024-04-20 10:45:57 INFO: Done loading processors!


In [82]:
ppp = ultra_pipe.predict(pd.DataFrame({'Review': ['hola mundo']}))

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


In [87]:
ppp[0]

5