The objective of this notebook is to create files containing the vectorized representation of the messages using three different techniques (BOW, TF-IDF and Word2Vec), in order to save time and resources on my PC.

### Libraries

In [1]:
import pandas as pd
import numpy as np  
from tqdm.notebook import tqdm
tqdm.pandas()
import os
import joblib

from sklearn.model_selection import train_test_split

In [2]:
# Read the preprocessed messages and display the resulting frame.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, presumably an
# index written by an earlier to_csv call — consider index_col=0 if it is not needed.
PREPROCESSED_CSV = "../../data/silver/df_preprocessed.csv"
df = pd.read_csv(PREPROCESSED_CSV)
df

Unnamed: 0,Category,Message,word_count,char_count
0,0,go jurong point crazy available bugis n great ...,20,111
1,0,ok lar joking wif u oni,6,29
2,1,free entry number wkly comp win fa cup final t...,28,155
3,0,u dun say early hor u c already say,11,49
4,0,nah dont think go usf life around though,13,61
...,...,...,...,...
5149,1,numbernd time tried number contact u u poundnu...,30,160
5150,0,b going esplanade fr home,8,36
5151,0,pity mood soany suggestion,10,57
5152,0,guy bitching acted like id interested buying s...,26,125


In [3]:
# Features are the message texts, labels are the category column.
# A CSV round-trip turns empty strings into NaN, which neither the scikit-learn
# vectorizers nor the tokenizer can handle, so missing messages become empty text.
X = df['Message'].fillna('')
y = df['Category']

# 80/10/10 train/validation/test split: 20% is held out first and then halved.
# Stratifying on the label keeps the class proportions the same in every
# partition, which matters for the small validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)


## BOW

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
def preprocess_bow(X_train, X_val, X_test, ngram_range=(1, 3), max_features=2000):
    """Encode the three text splits as bag-of-words count matrices.

    The vocabulary is learned from ``X_train`` only; the validation and test
    splits are transformed with that fitted vocabulary, so nothing from them
    influences the vectorizer.

    Parameters
    ----------
    X_train, X_val, X_test : iterable of str
        Documents of the training, validation and test splits.
    ngram_range : tuple of (int, int), default (1, 3)
        Smallest and largest n-gram sizes to extract (unigrams to trigrams
        by default).
    max_features : int or None, default 2000
        Upper bound on the vocabulary size passed to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, X_train_bow, X_val_bow, X_test_bow)``: the fitted
        ``CountVectorizer`` followed by the transformed splits.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=max_features)

    # Fit on the training split only, then reuse the fitted vocabulary.
    X_train_bow = vectorizer.fit_transform(X_train)
    X_val_bow, X_test_bow = (vectorizer.transform(split) for split in (X_val, X_test))

    return vectorizer, X_train_bow, X_val_bow, X_test_bow

## TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [7]:
def preprocess_tfidf(X_train, X_val, X_test, ngram_range=(1, 3), max_features=2000):
    """Encode the three text splits as TF-IDF weighted matrices.

    The vocabulary and IDF weights are learned from ``X_train`` only; the
    validation and test splits are transformed with that fitted vectorizer,
    so nothing from them influences the weights.

    Parameters
    ----------
    X_train, X_val, X_test : iterable of str
        Documents of the training, validation and test splits.
    ngram_range : tuple of (int, int), default (1, 3)
        Smallest and largest n-gram sizes to extract (unigrams to trigrams
        by default).
    max_features : int or None, default 2000
        Upper bound on the vocabulary size passed to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, X_train_tfidf, X_val_tfidf, X_test_tfidf)``: the fitted
        ``TfidfVectorizer`` followed by the transformed splits.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

    # Fit on the training split only, then reuse the fitted vocabulary and weights.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf, X_test_tfidf = (vectorizer.transform(split) for split in (X_val, X_test))

    return vectorizer, X_train_tfidf, X_val_tfidf, X_test_tfidf

## Word2Vec

CountVectorizer and TfidfVectorizer already tokenize the text internally, but Word2Vec does not, so we tokenize it manually.

In [8]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize relies on the Punkt tokenizer data. Recent NLTK releases (3.9+)
# look it up as 'punkt_tab' while older ones use 'punkt', so fetch both to work
# with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/maldu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
def preprocess_word2vec(X_train, X_val, X_test, vector_size=100, window=5, min_count=1, workers=None):
    """Train Word2Vec on the training split and embed every split as mean word vectors.

    Each message is lower-cased and tokenized with NLTK's ``word_tokenize``.
    The Word2Vec model is trained on the training tokens only, and every
    message is then represented by the average of the vectors of its
    in-vocabulary tokens. Messages without any in-vocabulary token (including
    empty or missing messages) are mapped to the zero vector.

    Parameters
    ----------
    X_train, X_val, X_test : iterable of str
        Messages of the training, validation and test splits. Non-string
        entries (e.g. NaN read back from a CSV) are treated as empty messages.
    vector_size : int, default 100
        Dimension of the word vectors; 100 is fine for a small dataset.
    window : int, default 5
        Maximum number of words considered as context of each word.
    min_count : int, default 1
        Minimum frequency of a word in the corpus to be kept; 1 keeps every word.
    workers : int or None, default None
        Number of training threads. ``None`` uses every CPU core but one,
        and never fewer than one.

    Returns
    -------
    tuple
        ``(model, X_train_word2vec, X_val_word2vec, X_test_word2vec)``: the
        trained model followed by one float32 array of shape
        ``(n_messages, vector_size)`` per split.
    """
    if workers is None:
        # os.cpu_count() may return None, and "cores - 1" is 0 on a single-core
        # machine; always leave Word2Vec at least one worker thread.
        workers = max(1, (os.cpu_count() or 1) - 1)

    def tokenize(text):
        # Missing values read back from a CSV are floats (NaN), not strings.
        return word_tokenize(text.lower()) if isinstance(text, str) else []

    corpus_train, corpus_val, corpus_test = (
        [tokenize(text) for text in split] for split in (X_train, X_val, X_test)
    )

    model = Word2Vec(
        sentences=corpus_train,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

    def embed(corpus):
        # Average the word vectors of each message. Pre-allocating keeps the
        # (n_messages, vector_size) shape and float32 dtype even for an empty
        # split or for messages without any known word (rows stay at zero).
        embedded = np.zeros((len(corpus), vector_size), dtype=np.float32)
        for row, tokens in enumerate(corpus):
            vectors = [model.wv[token] for token in tokens if token in model.wv]
            if vectors:
                embedded[row] = np.mean(vectors, axis=0)
        return embedded

    return model, embed(corpus_train), embed(corpus_val), embed(corpus_test)