# Text Preprocessing

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from gensim.models import Word2Vec
import re

class TextPreprocessor:
    def __init__(self):
        pass
        
    def clean_text(self, text):
        if isinstance(text, float):
            return ""
        text = text.lower()
        # https://medium.com/@siddharthgov01/regular-expressions-from-a-za-z-88cf9cf0abac
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text
    

    # https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html#bag-of-words-using-scikit-learn
    def get_bow_features(self, texts, max_features=5000):
        vectorizer = CountVectorizer(max_features=max_features)
        return vectorizer.fit_transform(texts)

    # https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html#tf-idf-encoding
    def get_tfidf_features(self, texts, max_features=5000):
        vectorizer = TfidfVectorizer(max_features=max_features)
        return vectorizer.fit_transform(texts)
 
    # https://radimrehurek.com/gensim/models/word2vec.html
    def get_word2vec_features(self, texts, vector_size=100, window=5, min_count=1, workers=4):
        processed_texts = []
        for text in texts:
            if isinstance(text, str):
                cleaned = self.clean_text(text)
                tokens = cleaned.split()
                processed_texts.append(tokens)

        model = Word2Vec(
            sentences=processed_texts,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers
        )
        
        return model

    def get_text_vector(self, text, word2vec_model):
        tokens = self.clean_text(text).split()
        vectors = []
        for token in tokens:
            if token in word2vec_model.wv:
                vectors.append(word2vec_model.wv[token])
                
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(word2vec_model.vector_size)

import pandas as pd
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# preprocessor = TextPreprocessor()

# train_texts = train_df['text'].apply(preprocessor.clean_text)

# print('bow')
# X_bow = preprocessor.get_bow_features(train_texts)
# print('tfidf')
# X_tfidf = preprocessor.get_tfidf_features(train_texts)
# print('w2v')
# X_w2v = preprocessor.get_word2vec_features(train_texts)

# print(X_bow.shape, X_tfidf.shape, X_w2v)


# Linear Regression with TI-IDF Word Processor

In [3]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

preprocessor = TextPreprocessor()
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

X_train_tfidf = preprocessor.get_tfidf_features(X_train)
X_test_tfidf = preprocessor.get_tfidf_features(X_test)

lr_classifier = LogisticRegression(max_iter=1000, C=5.0, penalty='l2', random_state=42 )


lr_classifier.fit(X_train_tfidf, y_train)


y_pred = lr_classifier.predict(X_test_tfidf)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 50.91%

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.04      0.08      2077
           1       0.51      0.98      0.67      2083

    accuracy                           0.51      4160
   macro avg       0.57      0.51      0.37      4160
weighted avg       0.57      0.51      0.37      4160



# Linear Regression with BOW

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


preprocessor = TextPreprocessor()
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

X_train_tfidf = preprocessor.get_bow_features(X_train)
X_test_tfidf = preprocessor.get_bow_features (X_test)

lr_classifier = LogisticRegression(max_iter=1000, C=5.0, penalty='l2', random_state=42 )



lr_classifier.fit(X_train_tfidf, y_train)


y_pred = lr_classifier.predict(X_test_tfidf)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 59.88%

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.79      0.66      2077
           1       0.66      0.40      0.50      2083

    accuracy                           0.60      4160
   macro avg       0.62      0.60      0.58      4160
weighted avg       0.62      0.60      0.58      4160



# Linear Regression with Word2vec Embedding

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


preprocessor = TextPreprocessor()
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

X_train_text_vectors = []
X_test_text_vectors = []

word2vec_model = preprocessor.get_word2vec_features(X_train)

for text in X_train:
    text_vector = preprocessor.get_text_vector(text, word2vec_model)
    X_train_text_vectors.append(text_vector)
    
for text in X_test:
    text_vector = preprocessor.get_text_vector(text, word2vec_model)
    X_test_text_vectors.append(text_vector)

X_train_text_vectors = pd.DataFrame(X_train_text_vectors)
X_test_text_vectors = pd.DataFrame(X_test_text_vectors)

lr_classifier = LogisticRegression(max_iter=1000, C=5.0, penalty='l2', solver="saga", random_state=42)

lr_classifier.fit(X_train_text_vectors, y_train)

y_train_pred = lr_classifier.predict(X_train_text_vectors)
y_test_pred = lr_classifier.predict(X_test_text_vectors)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Train Accuracy: 89.39%
Test Accuracy: 88.32%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      2077
           1       0.89      0.87      0.88      2083

    accuracy                           0.88      4160
   macro avg       0.88      0.88      0.88      4160
weighted avg       0.88      0.88      0.88      4160





# SVC Model

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']


X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

X_train_tfidf = preprocessor.get_bow_features(X_train)
X_test_tfidf = preprocessor.get_bow_features(X_test)


svm_clf = SVC(kernel='linear', C=5.0, max_iter=100)  
svm_clf.fit(X_train_tfidf, y_train)


y_pred = svm_clf.predict(X_test_tfidf)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))




Accuracy: 47.79%
              precision    recall  f1-score   support

           0       0.49      0.94      0.64      2077
           1       0.24      0.02      0.04      2083

    accuracy                           0.48      4160
   macro avg       0.36      0.48      0.34      4160
weighted avg       0.36      0.48      0.34      4160

