# Modelos lineales

In [1]:
import sys
sys.path.insert(0, '..')
from utils.preprocesamiento import StemmerTokenizer, df_to_list, process_df, procesar_adela
from utils.cargar import df_caso
import pandas as pd

# Tokenizer shared by every text-processing step below
# (configured with stem=False and rmv_punctuation=True).
tokenizador = StemmerTokenizer(rmv_punctuation=True, stem=False)

# Load the selected case, run the project preprocessing on the 'comment' /
# 'sel' columns, and discard the metadata columns that are not used below.
caso = 'alicia'
df = (
    process_df(df_caso(caso), 'comment', 'sel', verbose=False)
    .drop(columns=[
        'user_id', 'team_id', 'gender', 'df', 'title', 'opt_left',
        'opt_right', 'max_num', 'phase', 'time', 'curso',
    ])
)

# The train/test partition is stored on disk as one positional index per row
# (single column, no header).
train_indices = pd.read_csv(f'../utils/splits/train_indices_{caso}', header=None)[0].to_list()
test_indices = pd.read_csv(f'../utils/splits/test_indices_{caso}', header=None)[0].to_list()
print("Partición train-test cargada")

# Positional selection of each split.
df_train = df.iloc[train_indices]
df_test = df.iloc[test_indices]

# Tokenize every comment of each split.
tokenized_corpus = list(map(tokenizador, df_train['comment']))
tokenized_test = list(map(tokenizador, df_test['comment']))

Partición train-test cargada


## LDA

In [2]:
import gensim
from gensim import corpora

# Vocabulary built from the tokenized training comments only
dictionary = corpora.Dictionary(documents=tokenized_corpus)

# Bag-of-words encoding of every training document against that vocabulary
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_corpus))

In [3]:
sys.path.insert(0, '../topic-modelling/')
from TM_utils import get_lda_embeddings

# Number of topics; it is part of the saved model's file name.
num_topic = 25

# Build the model path once so the load call and the log line cannot drift apart.
ruta_modelo_lda = '../topic-modelling/modelos/LDA-{}_{}.gensim'.format(caso, num_topic)

# Announce the load before performing it, so the message is still shown
# (and names the offending file) if loading fails.
print("Cargando modelo pre-ajustado: '{}".format(ruta_modelo_lda))
lda_model = gensim.models.LdaModel.load(ruta_modelo_lda)

# NOTE(review): `dictionary` is rebuilt from the training split earlier in this
# notebook rather than taken from the saved model; confirm that its token ids
# match the vocabulary the model was fitted with.
arr_train_lda = get_lda_embeddings(lda_model, tokenizador, dictionary, df_train, 'comment')
arr_test_lda = get_lda_embeddings(lda_model, tokenizador, dictionary, df_test, 'comment')

Cargando modelo pre-ajustado: '../topic-modelling/modelos/LDA-alicia_25.gensim


## BERTopic

In [4]:
from bertopic import BERTopic

# Load the BERTopic model stored on disk for this case.
BT_model = BERTopic.load(f"../topic-modelling/modelos/BT-{caso}.bertopic")

# Keep the full result of transform() for each split; the cells that fit and
# evaluate the classifiers use element [1] of these results as features.
output_train = BT_model.transform(documents=df_to_list(df_train, 'comment', tokenizador))
output_test = BT_model.transform(documents=df_to_list(df_test, 'comment', tokenizador))

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


## Modelo lineal regularizado

**Clasificación**

In [5]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier (alpha=1) fitted on the LDA embeddings of the training
# split, using the original 'sel' labels as target.
clf_lda = RidgeClassifier(alpha=1)
clf_lda.fit(X=arr_train_lda, y=df_train['sel'])

In [6]:
# Same classifier configuration, fitted on element [1] of the BERTopic
# transform output for the training split.
clf_bt = RidgeClassifier(alpha=1)
clf_bt.fit(X=output_train[1], y=df_train['sel'])

In [7]:
from sklearn.metrics import classification_report

# Predict the test split with each multiclass model.
y_pred_lda = clf_lda.predict(arr_test_lda)
y_pred_bt = clf_bt.predict(output_test[1])

# zero_division=0 is passed explicitly: precision is undefined for any class
# that is never predicted, and this reports it as 0 without going through the
# warning path of the default setting.
print("Resultados clasificador lineal con tópicos LDA")
print(classification_report(df_test['sel'], y_pred_lda, zero_division=0))
print("\nResultados clasificador lineal con tópicos BERTopic")
print(classification_report(df_test['sel'], y_pred_bt, zero_division=0))

Resultados clasificador lineal con tópicos LDA
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       434
           2       0.36      1.00      0.53       729
           3       0.00      0.00      0.00       341
           4       0.00      0.00      0.00       216
           5       0.00      0.00      0.00       210
           6       0.00      0.00      0.00        70

    accuracy                           0.36      2000
   macro avg       0.06      0.17      0.09      2000
weighted avg       0.13      0.36      0.19      2000


Resultados clasificador lineal con tópicos BERTopic
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       434
           2       0.36      0.99      0.53       729
           3       0.00      0.00      0.00       341
           4       0.00      0.00      0.00       216
           5       0.00      0.00      0.00       210
           6       0.00      0.0

**Clasificación binaria**

In [8]:
# Work on a copy so the labels in `df` are left untouched.
df_bin = df.copy()

# Collapse the 'sel' labels into two groups: 2 and 3 are relabelled as 1,
# 4 and 5 are relabelled as 6 (1 and 6 keep their value).
df_bin.loc[df_bin['sel'].isin([2, 3]), 'sel'] = 1
df_bin.loc[df_bin['sel'].isin([4, 5]), 'sel'] = 6

# Reuse the same positional train/test partition as the multiclass setup.
df_train_bin = df_bin.iloc[train_indices]
df_test_bin = df_bin.iloc[test_indices]

# Displayed as the cell output: size of each of the two groups.
df_bin['sel'].value_counts()

sel
1    7512
6    2479
Name: count, dtype: int64

In [9]:
# NOTE: this rebinds `clf_lda` and `clf_bt`, replacing the multiclass models
# fitted earlier in the notebook with models trained on the binary labels.
clf_lda = RidgeClassifier(alpha=1.0)
clf_lda.fit(X=arr_train_lda, y=df_train_bin['sel'])

clf_bt = RidgeClassifier(alpha=1.0)
# The trailing semicolon suppresses the estimator repr in the cell output.
clf_bt.fit(X=output_train[1], y=df_train_bin['sel']);

In [10]:
# Predict the test split with each binary model.
y_pred_lda = clf_lda.predict(arr_test_lda)
y_pred_bt = clf_bt.predict(output_test[1])

# zero_division=0 is passed explicitly: precision is undefined for a class
# that is never predicted, and this reports it as 0 without going through the
# warning path of the default setting.
print("Resultados clasificador lineal con tópicos LDA")
print(classification_report(df_test_bin['sel'], y_pred_lda, zero_division=0))
print("\nResultados clasificador lineal con tópicos BERTopic")
print(classification_report(df_test_bin['sel'], y_pred_bt, zero_division=0))

Resultados clasificador lineal con tópicos LDA
              precision    recall  f1-score   support

           1       0.75      1.00      0.86      1504
           6       0.00      0.00      0.00       496

    accuracy                           0.75      2000
   macro avg       0.38      0.50      0.43      2000
weighted avg       0.57      0.75      0.65      2000


Resultados clasificador lineal con tópicos BERTopic
              precision    recall  f1-score   support

           1       0.75      1.00      0.86      1504
           6       0.29      0.00      0.01       496

    accuracy                           0.75      2000
   macro avg       0.52      0.50      0.43      2000
weighted avg       0.64      0.75      0.65      2000



**Regresión**

In [11]:
from sklearn.linear_model import Ridge