# Text Analytics Pipeline for Text Classification

This notebook demonstrates how to build a text analytics pipeline that includes text processing, feature extraction, classification, and evaluation.


In [None]:
# %pip install pandas numpy nltk emoji scikit-learn

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import emoji

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

import warnings
# Suppress all warnings to keep cell output compact.
# Note: this is a blanket filter, so it also hides e.g. convergence warnings.
warnings.filterwarnings("ignore")

# Download required NLTK resources.
# 'punkt' serves older NLTK releases; 'punkt_tab' is the tokenizer data that
# recent NLTK releases look up when `word_tokenize` is called. Fetching both
# keeps a fresh "Restart & Run All" working regardless of the installed version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Custom Text Preprocessor

The custom transformer below:

 - **Emoji Conversion:** Converts any emojis to their text descriptions.
 - **Normalization:** Lowercases the text.
 - **Punctuation Removal:** Removes punctuation using regex.
 - **Tokenization:** Uses NLTK’s `word_tokenize`, then keeps only purely alphabetic tokens (numbers and mixed alphanumeric tokens are dropped).
 - **Stop-word Removal:** Filters out English stopwords.
 - **Stemming:** Applies Porter stemming.
 
 The transformer implements `fit` and `transform` so that it can be used inside a scikit-learn pipeline.

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that cleans raw text documents.

    Each document goes through, in order: optional emoji-to-text conversion,
    lowercasing, punctuation removal, NLTK tokenization, alphabetic-token
    filtering, optional English stop-word removal and optional Porter
    stemming. The surviving tokens are joined back into a single
    space-separated string so that a vectorizer can consume the result.

    Parameters
    ----------
    do_stemming : bool, default=True
        Apply Porter stemming to every kept token.
    remove_stopwords : bool, default=True
        Drop tokens found in NLTK's English stop-word list.
    do_emoji_conversion : bool, default=True
        Replace emojis with their textual names before any other step.
    """

    def __init__(self, do_stemming=True, remove_stopwords=True, do_emoji_conversion=True):
        self.do_stemming = do_stemming
        self.remove_stopwords = remove_stopwords
        self.do_emoji_conversion = do_emoji_conversion
        # Helper objects (not hyper-parameters); rebuilt on every construction.
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def emoji_to_text(self, text, delimiters=(':', ':')):
        """Convert emojis to their text descriptions.

        Parameters
        ----------
        text : str
            Text that may contain emojis.
        delimiters : tuple of (str, str), default=(':', ':')
            Strings placed before and after each emoji name; forwarded to
            `emoji.demojize`.

        Returns
        -------
        str
            `text` with each emoji replaced by its delimited name.
        """
        return emoji.demojize(text, delimiters=delimiters)

    def tokenize(self, text):
        """Tokenize text after removing punctuation.

        Returns a list of tokens that are purely alphabetic, optionally
        without stop words and optionally stemmed.
        """
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # `\w` also matches "_", so underscores survive the regex above. Turn
        # them into spaces: otherwise multi-word emoji names such as
        # "grinning_face" stay one token and are discarded by the
        # `isalpha()` filter below.
        text = text.replace('_', ' ')
        tokens = word_tokenize(text)
        # Keep only alphabetic tokens
        tokens = [token for token in tokens if token.isalpha()]
        if self.remove_stopwords:
            # NOTE(review): apostrophes were already stripped above, so a
            # contraction reaches this check as e.g. "dont"; confirm whether
            # such forms should be filtered as stop words too.
            tokens = [token for token in tokens if token.lower() not in self.stop_words]
        if self.do_stemming:
            tokens = [self.stemmer.stem(token) for token in tokens]
        return tokens

    def normalize(self, text):
        """Lowercase the text."""
        return text.lower()

    def preprocess(self, text):
        """Perform full preprocessing on one document and return a string."""
        if self.do_emoji_conversion:
            # Whitespace delimiters keep an emoji name from being glued to the
            # neighbouring word (or to the next emoji) once punctuation is
            # stripped in `tokenize`.
            text = self.emoji_to_text(text, delimiters=(' ', ' '))
        text = self.normalize(text)
        tokens = self.tokenize(text)
        # Join tokens back to a single string (vectorizers expect string input)
        return ' '.join(tokens)

    def fit(self, X, y=None):
        """No-op fit; present so the class can be used inside a pipeline."""
        return self

    def transform(self, X, y=None):
        """Preprocess every document in `X`.

        Objects exposing `.apply` (e.g. a pandas Series) are processed through
        it, exactly as before. Any other iterable of strings (list, tuple,
        NumPy array) is processed element by element and returned as a list.
        """
        if hasattr(X, 'apply'):
            return X.apply(self.preprocess)
        return [self.preprocess(text) for text in X]

 ## Data Loading and Train/Test Split
 
 We load the dataset and split it into training and testing sets.

In [None]:
# Load the labelled dataset (the CSV must be in the working directory)
df = pd.read_csv("labeled_data_1.csv")

# Show which columns the file provides
print("Columns in dataset:", df.columns.tolist())

# Keep only the text and label columns, discarding rows with missing values
text_col, label_col = 'Cleaned Text', 'labels_1'
df = df.loc[:, [text_col, label_col]].dropna()
X, y = df[text_col], df[label_col]

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

## Building Various Pipelines
 
 We create several pipelines:
 
 1. **CountVectorizer with Unigrams (Binary Representation):**  
    Uses binary occurrence of words.
 
 2. **CountVectorizer with N-grams (Frequency Count):**  
    Uses unigrams and bigrams.
 
 3. **TfidfVectorizer with Unigrams:**  
    Uses TF-IDF weights for unigrams.
 
 4. **TfidfVectorizer with N-grams:**  
    Uses TF-IDF weights for unigrams and bigrams.
 
 In all pipelines, the `TextPreprocessor` is applied first, followed by feature extraction and finally classification using Logistic Regression.


In [None]:
def _make_text_pipeline(vectorizer):
    """Chain TextPreprocessor -> `vectorizer` -> LogisticRegression(max_iter=1000)."""
    return Pipeline(steps=[
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', vectorizer),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])


# CountVectorizer, unigrams only, binary occurrence features
pipeline_count_unigram = _make_text_pipeline(
    CountVectorizer(binary=True, ngram_range=(1, 1))
)

# CountVectorizer, unigrams + bigrams, raw frequency counts
pipeline_count_ngram = _make_text_pipeline(
    CountVectorizer(ngram_range=(1, 2))
)

# TfidfVectorizer, unigrams only
pipeline_tfidf_unigram = _make_text_pipeline(
    TfidfVectorizer(ngram_range=(1, 1))
)

# TfidfVectorizer, unigrams + bigrams
pipeline_tfidf_ngram = _make_text_pipeline(
    TfidfVectorizer(ngram_range=(1, 2))
)

## Evaluating the Pipelines
 
 We define a helper function `evaluate_pipeline` that fits a given pipeline on the training data, predicts the test data labels, prints out the classification report, and returns the predictions so they can be inspected later.

In [None]:
def evaluate_pipeline(pipeline, X_train, X_test, y_train, y_test):
    """Fit `pipeline`, report its test-set performance and return predictions.

    Parameters
    ----------
    pipeline : estimator exposing `fit(X, y)` and `predict(X)`
        Fitted in place on the training data.
    X_train, y_train : training documents and their labels.
    X_test, y_test : held-out documents and their true labels.

    Returns
    -------
    The value returned by `pipeline.predict(X_test)`.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Print the classification report comparing true and predicted labels
    report = classification_report(y_test, y_pred)
    print(report)
    return y_pred

# (heading, pipeline) pairs, evaluated in the order listed
_evaluation_runs = [
    ("### Evaluation: CountVectorizer with Unigrams (Binary)", pipeline_count_unigram),
    ("### Evaluation: CountVectorizer with N-grams (Frequency Count)", pipeline_count_ngram),
    ("### Evaluation: TfidfVectorizer with Unigrams", pipeline_tfidf_unigram),
    ("### Evaluation: TfidfVectorizer with N-grams", pipeline_tfidf_ngram),
]

_evaluation_outputs = []
for _heading, _pipeline in _evaluation_runs:
    print(_heading)
    _evaluation_outputs.append(
        evaluate_pipeline(_pipeline, X_train, X_test, y_train, y_test)
    )

# Expose each run's predictions under its own name
(
    predictions_count_uni,
    predictions_count_ngram,
    predictions_tfidf_uni,
    predictions_tfidf_ngram,
) = _evaluation_outputs

## Parameter Tuning with GridSearchCV
 
 Here, we perform grid search on a pipeline using `TfidfVectorizer` to tune parameters such as:
 
 - **ngram_range:** Unigrams vs. unigrams+bigrams.
 - **use_idf:** Whether to use the inverse document frequency reweighting.
 - **C:** Regularization strength for Logistic Regression.
 
 The grid search uses 5-fold cross-validation and optimizes for macro F1 score.


In [None]:
# Pipeline whose vectorizer and classifier settings are tuned below
pipeline_grid = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Search space, keyed as <step name>__<parameter name>
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__use_idf': [True, False],
    'classifier__C': [0.1, 1, 10],
}

# Grid search with 5 CV folds, scored by macro F1, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline_grid,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters from Grid Search:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
predictions_best = best_model.predict(X_test)
print("### Evaluation of Best Model from Grid Search")
print(classification_report(y_test, predictions_best))