### Preprocesamiento

In [1]:
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

stop_words = stopwords.words('spanish')

# Definimos un tokenizador con Stemming
class StemmerTokenizer:
    def __init__(self):
        self.ps = SnowballStemmer('spanish')
    def __call__(self, doc):
        doc_tok = word_tokenize(doc)
        doc_tok = [t for t in doc_tok if t not in stop_words]
        return [self.ps.stem(t) for t in doc_tok]

# Inicializamos tokenizador
tokenizador = StemmerTokenizer()

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

bog = CountVectorizer(
    tokenizer= StemmerTokenizer(),
    ngram_range=(1,2)
    )

preprocessing = ColumnTransformer(
    transformers=[('bag-of-words',bog,'comment')]
)

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import Pipeline

bog_pipeline = Pipeline(
    [("Preprocessing", preprocessing),
    ("Selection", SelectPercentile(f_classif, percentile=90)),
    ("Clasificador", MultinomialNB())]
)

### Cargar datos

In [4]:
from utils.cargar import df_caso

caso = 'alicia'
df = df_caso(caso)

df = df[df['comment'].notna()]
df = df[df['sel'].notna()]
df = df.drop(df[df['max_num'] > 6].index)

df = df.drop(columns=['user_id','team_id','gender','df','title','opt_left','opt_right','max_num','phase','time','curso'])

df.head()

Unnamed: 0_level_0,sel,comment
id,Unnamed: 1_level_1,Unnamed: 2_level_1
20310,1,entrega de producto defectuoso no cumple con l...
20311,1,se debe recuperar tiempo perdido
20314,3,"Una contexto como es el de pandemia, es algo m..."
20315,3,Debiese priorizar cumplir con los criterios té...
20400,4,Lo mejor sería intentar apurar un poco el proy...


In [5]:
from sklearn.model_selection import train_test_split

df_train, df_test, y_train, y_test = train_test_split(df, df['sel'], test_size=.25, stratify=df['sel'])

In [6]:
bog_pipeline.fit(df_train,y_train)



In [7]:
from sklearn.metrics import classification_report

y_pred = bog_pipeline.predict(df_test)

print("Resultados clasificador Naive-Bayes multinomial")
print(classification_report(y_test, y_pred))

Resultados clasificador Naive-Bayes multinomial
              precision    recall  f1-score   support

           1       0.59      0.31      0.41       540
           2       0.42      0.89      0.57       911
           3       0.48      0.11      0.17       427
           4       0.43      0.07      0.13       270
           5       0.44      0.24      0.31       263
           6       1.00      0.01      0.02        87

    accuracy                           0.44      2498
   macro avg       0.56      0.27      0.27      2498
weighted avg       0.49      0.44      0.37      2498

