In [54]:
import stanza

from sklearn.base import BaseEstimator, TransformerMixin


class Preprocesser(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X):
        alfa = 0.05
        beta = 0.05
        
        # eliminación de nulos
        df_work = X.dropna()
        # eliminación de duplicados
        df_work = df_work.drop_duplicates()
        
        # eliminación de longitudes atípicas
        df_work['Conteo'] = [len(x.split()) for x in df_work['Review']]
        upper = df_work['Conteo'].quantile(1-beta)
        lower = df_work['Conteo'].quantile(alfa)
        # menores que percentil mayor
        df_work = df_work[df_work['Conteo'] <= upper]
        # mayores que percentil menor
        df_work = df_work[df_work['Conteo'] >= lower]

        df_work = df_work.drop(['Conteo'], axis=1)
        
        return df_work


In [55]:
class Lemmatizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.stanza_pipeline = stanza.Pipeline(lang="es", processors="tokenize,mwt,pos,lemma")
    
    
    def fit(self, X, y=None):
        return self
    
    
    def stanza_preprocessing(self, words):
        doc = self.stanza_pipeline(words)
        lemmas = [w.lemma for w in doc.sentences[0].words]
        return lemmas
    
    def transform(self, X):
        print("Lemmatize transformation...")
        df_work = X
        df_work['Review'] = df_work['Review'].apply(self.stanza_preprocessing)
        return df_work
    
    def __getstate__(self):
        # Exclude the stanza_pipeline from serialization
        state = self.__dict__.copy()
        del state['stanza_pipeline']
        return state

    def __setstate__(self, state):
        # Restore the stanza_pipeline during deserialization
        self.__dict__.update(state)
        self.stanza_pipeline = stanza.Pipeline(lang="es", processors="tokenize,mwt,pos,lemma")
    

In [56]:
import nltk
import re
import unicodedata

def clean_text(words):
    """
    Cleans the list of words by performing various operations.

    :param words: (list) List of words to be cleaned.
    :returns: (list) Cleaned list of words.
    """
    words = [w.lower() for w in words]  # Lowercase
    words = [re.sub(r'[^\w\s]', '', word) for word in words if word is not None]  # Remove punctuation
    words = [unicodedata.normalize('NFKD', word).encode('utf-8', 'ignore').decode('utf-8', 'ignore') for word in words]  # Remove non-encoded characters
    languages = ['spanish']
    stopword = nltk.corpus.stopwords.words(languages)
    return [w for w in words if w not in stopword]  # Remove stopwords


class PostCleaner(BaseEstimator, TransformerMixin):
    
    def __init__(self, cleaning_func):
        self.cleaning_func = cleaning_func
    
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X):
        print("Post lemmatization cleaning transformation...")
        df_work = X.copy()
        df_work['Review'] = df_work['Review'].apply(self.cleaning_func)
        return df_work
    

In [57]:
from copy import deepcopy
from sklearn.feature_extraction.text import TfidfVectorizer


class Vectorizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.vectorizer_algorithm = TfidfVectorizer(
            decode_error='ignore',
            strip_accents='ascii',
            analyzer='word',
            max_features=10000
        )
        self.features = None
        
        # remiendo. En la primera pasada, se genera el vocabulario
        self.already_fit = False
        
    
    def fit(self, X, y=None):
        return self
    
    
    def transform(self, X):
        data_stringfied = deepcopy(X)
        data_stringfied['Review'] = data_stringfied['Review'].apply(lambda x: ' '.join(map(str, x)))
        
        x_data = data_stringfied['Review']
        
        if self.already_fit:
            print("Vocabulario ya definido...")        
            x_data_vectorized_matrix = self.vectorizer_algorithm.transform(x_data)
        else:
            print("Creando vocabulario...")
            x_data_vectorized_matrix = self.vectorizer_algorithm.fit_transform(x_data)
            
        x_data_vectorized_df = pd.DataFrame(x_data_vectorized_matrix.toarray())  # ... for additional features from csr_matrix
        
        # obtiene el arreglo de palabras con columnas
        self.features = self.vectorizer_algorithm.get_feature_names_out()
        self.already_fit = True
        
        res = pd.concat([x_data_vectorized_df], axis=1)
        
        return res
    

In [66]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    steps=[
        ('lemmatizer', Lemmatizer()),
        ('post-cleaner', PostCleaner(cleaning_func=clean_text)),
        ('vectorizer', Vectorizer()),
        ('classifier', SVC(kernel='rbf', C=1))
    ]
)

2024-04-20 10:36:32 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-20 10:36:33 INFO: Downloaded file to C:\Users\alvar\stanza_resources\resources.json
2024-04-20 10:36:33 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| mwt       | ancora          |
| pos       | ancora_charlm   |
| lemma     | ancora_nocharlm |

2024-04-20 10:36:33 INFO: Using device: cpu
2024-04-20 10:36:33 INFO: Loading: tokenize
2024-04-20 10:36:33 INFO: Loading: mwt
2024-04-20 10:36:33 INFO: Loading: pos
2024-04-20 10:36:34 INFO: Loading: lemma
2024-04-20 10:36:34 INFO: Done loading processors!


In [67]:
from sklearn.model_selection import train_test_split
import pandas as pd

df_big = pd.read_csv('../data/raw/tipo2_entrenamiento_estudiantes.csv', sep=',', encoding = 'utf-8')
df = df_big.sample(frac=0.10)

df = Preprocesser().fit_transform(df)

xtr, xts, ytr, yts = train_test_split(df['Review'], df['Class'], test_size=0.2, random_state=42069)
xtr = pd.DataFrame(xtr)
xts = pd.DataFrame(xts)
print(xtr.shape, type(xtr))
print(ytr.shape, type(ytr))
print(xts.shape, type(xts))
print(yts.shape, type(yts))

(568, 1) <class 'pandas.core.frame.DataFrame'>
(568,) <class 'pandas.core.series.Series'>
(142, 1) <class 'pandas.core.frame.DataFrame'>
(142,) <class 'pandas.core.series.Series'>


In [68]:
pipeline.fit(xtr, ytr)

Lemmatize transformation...
Post lemmatization cleaning transformation...
Creando vocabulario...


In [69]:
train_features = pipeline[2].features
train_features

array(['0130', '10', '100010', ..., 'zocalo', 'zona', 'zoologico'],
      dtype=object)

In [70]:
pipeline.score(xts, yts)

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


0.2887323943661972

In [71]:
pred = pd.DataFrame({'Review': ['favorito']})
pred

Unnamed: 0,Review
0,favorito


In [72]:
pipeline.predict(pred)

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


array([5], dtype=int64)

In [73]:
from joblib import dump
# dump vectorizer
dump(pipeline['vectorizer'], '../models/vf6.joblib')
dump(pipeline, '../models/svm6.joblib')

['../models/svm6.joblib']

In [74]:
from joblib import load

vecto = load('../models/vf6.joblib')
new_pipe = load('../models/svm6.joblib')
new_pipe['vectorizer'].vectorizer_algorithm.get_feature_names_out()

2024-04-20 10:42:09 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-20 10:42:10 INFO: Downloaded file to C:\Users\alvar\stanza_resources\resources.json
2024-04-20 10:42:11 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| mwt       | ancora          |
| pos       | ancora_charlm   |
| lemma     | ancora_nocharlm |

2024-04-20 10:42:11 INFO: Using device: cpu
2024-04-20 10:42:11 INFO: Loading: tokenize
2024-04-20 10:42:11 INFO: Loading: mwt
2024-04-20 10:42:11 INFO: Loading: pos
2024-04-20 10:42:12 INFO: Loading: lemma
2024-04-20 10:42:12 INFO: Done loading processors!


array(['0130', '10', '100010', ..., 'zocalo', 'zona', 'zoologico'],
      dtype=object)

In [75]:
pred = pd.DataFrame({'Review': ['favorito']})
pred

Unnamed: 0,Review
0,favorito


In [79]:
new_pipe.predict(pred)

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


array([5], dtype=int64)

In [76]:
vecto.vectorizer_algorithm.transform(pd.DataFrame({'Review': ['favorito']}))

<1x2060 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [77]:
new_vecto = TfidfVectorizer(
    decode_error='ignore',
    strip_accents='ascii',
    analyzer='word',
    vocabulary=vecto.features
)
dffit = pd.DataFrame({'Review': ['hola mundo 123 casa', 'prueba de fit']})
vtest = new_vecto.fit_transform(dffit)
dftest = pd.DataFrame(vtest.toarray())
dftest

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2050,2051,2052,2053,2054,2055,2056,2057,2058,2059
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
res = new_fitted_vecto.transform(pd.DataFrame({'Review': ['zona']}))
print(res[0])



In [80]:
ultra_pipe = load('../models/svm6.joblib')

2024-04-20 10:45:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-20 10:45:54 INFO: Downloaded file to C:\Users\alvar\stanza_resources\resources.json
2024-04-20 10:45:55 INFO: Loading these models for language: es (Spanish):
| Processor | Package         |
-------------------------------
| tokenize  | ancora          |
| mwt       | ancora          |
| pos       | ancora_charlm   |
| lemma     | ancora_nocharlm |

2024-04-20 10:45:55 INFO: Using device: cpu
2024-04-20 10:45:55 INFO: Loading: tokenize
2024-04-20 10:45:55 INFO: Loading: mwt
2024-04-20 10:45:55 INFO: Loading: pos
2024-04-20 10:45:57 INFO: Loading: lemma
2024-04-20 10:45:57 INFO: Done loading processors!


In [82]:
ppp = ultra_pipe.predict(pd.DataFrame({'Review': ['hola mundo']}))

Lemmatize transformation...
Post lemmatization cleaning transformation...
Vocabulario ya definido...


In [87]:
ppp[0]

5