# Text Preprocessing

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from gensim.models import Word2Vec
import re

class TextPreprocessor:
    """Clean raw text and derive bag-of-words, TF-IDF and Word2Vec features."""

    # Compiled once for reuse: matches every character that is neither an
    # ASCII letter nor whitespace.
    # https://medium.com/@siddharthgov01/regular-expressions-from-a-za-z-88cf9cf0abac
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self):
        # Vectorizers fitted by the most recent get_bow_features /
        # get_tfidf_features call. They are kept so that held-out data can be
        # transformed with the same vocabulary; None until the method is called.
        self.bow_vectorizer = None
        self.tfidf_vectorizer = None

    def clean_text(self, text):
        """Lower-case *text* and drop everything except ASCII letters and whitespace.

        Args:
            text: The raw value to clean. Anything that is not a ``str``
                (for example ``None`` or the float NaN pandas uses for
                missing cells) is treated as missing.

        Returns:
            The cleaned string, or ``""`` for a non-string input.
        """
        if not isinstance(text, str):
            return ""
        return self._NON_LETTER_RE.sub('', text.lower())

    # https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html#bag-of-words-using-scikit-learn
    def get_bow_features(self, texts, max_features=5000):
        """Fit a ``CountVectorizer`` on *texts* and return the transformed matrix.

        Args:
            texts: Iterable of documents to fit on.
            max_features: Passed through to ``CountVectorizer``.

        Returns:
            The result of ``fit_transform(texts)``. The fitted vectorizer is
            also stored on ``self.bow_vectorizer``.
        """
        vectorizer = CountVectorizer(max_features=max_features)
        features = vectorizer.fit_transform(texts)
        self.bow_vectorizer = vectorizer
        return features

    # https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html#tf-idf-encoding
    def get_tfidf_features(self, texts, max_features=5000):
        """Fit a ``TfidfVectorizer`` on *texts* and return the transformed matrix.

        Args:
            texts: Iterable of documents to fit on.
            max_features: Passed through to ``TfidfVectorizer``.

        Returns:
            The result of ``fit_transform(texts)``. The fitted vectorizer is
            also stored on ``self.tfidf_vectorizer``.
        """
        vectorizer = TfidfVectorizer(max_features=max_features)
        features = vectorizer.fit_transform(texts)
        self.tfidf_vectorizer = vectorizer
        return features

    # https://radimrehurek.com/gensim/models/word2vec.html
    def get_word2vec_features(self, texts, vector_size=100, window=5, min_count=1, workers=4):
        """Train a Word2Vec model on the whitespace tokens of *texts*.

        Note that this returns the trained model, not per-document vectors;
        use ``get_text_vector`` to embed an individual document.

        Args:
            texts: Iterable of documents. Entries that are not ``str`` are
                skipped.
            vector_size: Passed through to ``Word2Vec``.
            window: Passed through to ``Word2Vec``.
            min_count: Passed through to ``Word2Vec``.
            workers: Passed through to ``Word2Vec``.

        Returns:
            The trained ``Word2Vec`` model.
        """
        tokenized = [
            self.clean_text(text).split()
            for text in texts
            if isinstance(text, str)
        ]

        return Word2Vec(
            sentences=tokenized,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers,
        )

    def get_text_vector(self, text, word2vec_model):
        """Return the mean embedding of the tokens of *text* known to the model.

        Args:
            text: Raw document; it is cleaned with ``clean_text`` first, so a
                non-string value yields the zero vector.
            word2vec_model: Object exposing ``wv`` (token lookup supporting
                ``in`` and indexing) and ``vector_size``.

        Returns:
            The element-wise mean of the found token vectors, or a zero
            vector of length ``word2vec_model.vector_size`` when no token of
            *text* is in the model's vocabulary.
        """
        keyed_vectors = word2vec_model.wv
        vectors = [
            keyed_vectors[token]
            for token in self.clean_text(text).split()
            if token in keyed_vectors
        ]

        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(word2vec_model.vector_size)

import pandas as pd
# Load both splits from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

preprocessor = TextPreprocessor()

# Normalise the training text column once; every feature extractor below
# consumes this cleaned series.
train_texts = train_df["text"].map(preprocessor.clean_text)

# Each step announces itself before running so progress is visible.
print("bow")
X_bow = preprocessor.get_bow_features(train_texts)

print("tfidf")
X_tfidf = preprocessor.get_tfidf_features(train_texts)

print("w2v")
# NOTE: this is the trained Word2Vec model itself, not a feature matrix.
X_w2v = preprocessor.get_word2vec_features(train_texts)

# Summary: shapes of the two document-term matrices and the model's repr.
print(X_bow.shape, X_tfidf.shape, X_w2v)


bow
tfidf
w2v
(20800, 5000) (20800, 5000) Word2Vec<vocab=178605, vector_size=100, alpha=0.025>
