# Text Analytics Pipeline for Text Classification

This notebook demonstrates how to build a text analytics pipeline that includes text processing, feature extraction, classification, and evaluation.


In [None]:
# %pip install pandas numpy nltk emoji scikit-learn spacy contractions
# !python -m spacy download en_core_web_sm

In [None]:
# Standard library
import re
import string

# Third-party: data handling and NLP helpers
import contractions
import emoji
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
# NOTE: the class is spelled "WordNetLemmatizer"; the previous "WorldNetLemmatizer"
# spelling raised ImportError on a fresh kernel.
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Third-party: scikit-learn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


# Download NLTK resources.
# 'punkt_tab' is the tokenizer data newer NLTK releases look up for word_tokenize;
# 'punkt' is kept for older releases. A resource unknown to the installed NLTK
# version is reported by the downloader and does not abort the cell.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Custom Text Preprocessor

The custom transformer below:

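 - **Cleaning:** Removes URLs and `u/` user mentions, expands contractions, strips non-ASCII characters, and removes digits.
 - **Emoji Conversion:** Converts any emojis to their text descriptions.
 - **Normalization:** Lowercases the text.
 - **Punctuation Removal:** Replaces hyphens and in-word punctuation with spaces using regex; remaining punctuation is dropped during tokenization.
 - **Tokenization:** Uses a spaCy-based tokenizer (with lemmatization) by default, or NLTK’s `word_tokenize` when `use_spacy_tokenizer=False`.
 - **Stop-word Removal:** Filters out English stopwords.
 - **Stemming:** Applies Porter stemming (with the NLTK tokenizer, WordNet lemmatization can be used instead).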
 
 The transformer implements `fit` and `transform` so that it can be used inside a scikit-learn pipeline.

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that cleans raw text.

    Each input string is cleaned, tokenized and returned as a single
    space-joined string of tokens, ready for a downstream vectorizer.
    See ``preprocess`` for the exact order of the cleaning steps.
    """

    # Patterns are compiled once and shared by all instances.
    _URL_RE = re.compile(r'http[s]?://\S+|www\.\S+')
    # The negative look-behind stops the pattern from firing inside ordinary
    # words (e.g. "you/me" must not lose "u/me"); an optional leading slash
    # also covers the "/u/name" spelling.
    _MENTION_RE = re.compile(r'(?<!\w)/?u/\S+')
    _HYPHEN_RE = re.compile(r'[-]')
    # Look-arounds (instead of consuming the neighbours) let every punctuation
    # mark in a run such as "a.b.c" be replaced, not just every other one.
    _INNER_PUNCT_RE = re.compile(r'(?<=\S)[' + re.escape(string.punctuation) + r'](?=\S)')
    _DIGITS_RE = re.compile(r'[0-9]+')
    _NON_WORD_RE = re.compile(r'[^\w\s]')

    def __init__(self, do_stemming=True, do_lemmatization=False, remove_stopwords=True,
                 do_emoji_conversion=True, use_spacy_tokenizer=True):
        """
        Parameters:
        - do_stemming: Apply stemming (reduces words to their root form)
        - do_lemmatization: Apply lemmatization (converts words to their canonical form)
          Note: When using the default (NLTK) tokenizer, if both do_lemmatization and do_stemming are enabled,
          lemmatization takes precedence. This flag is not consulted by the spaCy tokenizer.
        - remove_stopwords: Remove common stopwords
        - do_emoji_conversion: Convert emojis to text descriptions
        - use_spacy_tokenizer: Use a custom spaCy-based tokenizer (which already uses lemmatization);
          stemming, when enabled, is still applied on top of the spaCy lemmas.
        """
        self.do_stemming = do_stemming
        self.do_lemmatization = do_lemmatization
        self.remove_stopwords = remove_stopwords
        self.do_emoji_conversion = do_emoji_conversion
        self.use_spacy_tokenizer = use_spacy_tokenizer
        self.stemmer = PorterStemmer()
        # Built unconditionally so that enabling lemmatization later through
        # set_params(do_lemmatization=True) does not hit a missing attribute.
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Load the spaCy model if using the spaCy tokenizer
        if self.use_spacy_tokenizer:
            self.nlp = spacy.load("en_core_web_sm")

    def remove_links(self, text):
        """Remove URLs (http://, https:// or www.) from text."""
        return self._URL_RE.sub('', text)

    def remove_user_mentions(self, text):
        """Remove ``u/name`` and ``/u/name`` user mentions from text."""
        return self._MENTION_RE.sub('', text)

    def expand_contractions(self, text):
        """Expand contractions in the text."""
        return contractions.fix(text)

    def remove_non_ascii(self, text):
        """Remove non-ASCII characters from the text."""
        return text.encode("ascii", "ignore").decode()

    def remove_punctuations(self, text):
        """
        Remove or adjust punctuation in text.
        Replaces hyphens with space and replaces any punctuation mark that sits
        between two non-space characters with a space. Punctuation next to
        whitespace is left for the tokenizer to drop.
        """
        text = self._HYPHEN_RE.sub(' ', text)
        return self._INNER_PUNCT_RE.sub(' ', text)

    def remove_numbers(self, text):
        """Remove numbers from text."""
        return self._DIGITS_RE.sub('', text)

    def emoji_to_text(self, text):
        """Convert emojis to text descriptions."""
        return emoji.demojize(text)

    def normalize(self, text):
        """Lowercase the text."""
        return text.lower()

    def tokenize(self, text):
        """
        Tokenize text using either a spaCy-based custom tokenizer or the default NLTK tokenizer.

        Returns a list of token strings with stop-word removal and
        stemming/lemmatization applied according to the constructor flags.
        """
        if self.use_spacy_tokenizer:
            # The model is normally loaded in __init__; load it here when the
            # flag was switched on afterwards (e.g. through set_params).
            if getattr(self, 'nlp', None) is None:
                self.nlp = spacy.load("en_core_web_sm")
            doc = self.nlp(text)
            # Named entities come first, each kept as one (possibly multi-word) token
            tokens = [ent.text for ent in doc.ents]
            # Remaining tokens are added by lemma, skipping punctuation and whitespace
            tokens.extend(
                token.lemma_.lower() for token in doc
                if not token.ent_type_ and not token.is_punct and not token.is_space
            )
            if self.remove_stopwords:
                tokens = [token for token in tokens if token.lower() not in self.stop_words]
            if self.do_stemming:
                tokens = [self.stemmer.stem(token) for token in tokens]
            return tokens

        # Default NLTK-based tokenization:
        # Remove punctuation (if any remains) and then tokenize
        text = self._NON_WORD_RE.sub('', text)
        # Keep only alphabetic tokens
        tokens = [token for token in word_tokenize(text) if token.isalpha()]
        if self.remove_stopwords:
            tokens = [token for token in tokens if token.lower() not in self.stop_words]
        # Apply lemmatization if enabled; otherwise, apply stemming if enabled
        if self.do_lemmatization:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        elif self.do_stemming:
            tokens = [self.stemmer.stem(token) for token in tokens]
        return tokens

    def preprocess(self, text):
        """Apply the complete preprocessing pipeline to one text string.

        Returns the cleaned tokens joined by single spaces.
        """
        text = self.remove_links(text)
        text = self.remove_user_mentions(text)
        text = self.expand_contractions(text)
        # Emoji conversion has to run before the non-ASCII filter: emojis are
        # non-ASCII, so stripping first would delete them before they could be
        # turned into their text descriptions.
        if self.do_emoji_conversion:
            text = self.emoji_to_text(text)
        text = self.remove_non_ascii(text)
        text = self.remove_punctuations(text)
        text = self.remove_numbers(text)
        text = self.normalize(text)
        tokens = self.tokenize(text)
        return ' '.join(tokens)

    def fit(self, X, y=None):
        """No-op fit; present so the transformer can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Preprocess every document in X.

        X may be a pandas Series or any 1-D sequence of strings (list, tuple,
        NumPy array); non-Series input is wrapped in a Series first. Returns a
        pandas Series of cleaned strings.
        """
        if not isinstance(X, pd.Series):
            X = pd.Series(X)
        return X.apply(self.preprocess)

 ## Data Loading and Train/Test Split
 
 We load the dataset and split it into training and testing sets.

In [None]:
# Read the dataset (make sure the file is in your working directory)
TEXT_COLUMN = 'Cleaned Text'
LABEL_COLUMN = 'labels_1'

df = pd.read_csv("labeled_data_1.csv")

# Check available columns
print("Columns in dataset:", df.columns.tolist())

# Fail early with a readable message if the expected columns are absent
missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"labeled_data_1.csv is missing required column(s): {missing_columns}")

# Select the important columns and drop any missing values
df = df[[TEXT_COLUMN, LABEL_COLUMN]].dropna()
# Cast to str so that cells pandas parsed as numbers do not break the
# regex-based text preprocessing downstream.
X = df[TEXT_COLUMN].astype(str)
y = df[LABEL_COLUMN]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Building Various Pipelines
 
 We create several pipelines:
 
 1. **CountVectorizer with Unigrams (Binary Representation):**  
    Uses binary occurrence of words.
 
 2. **CountVectorizer with N-grams (Frequency Count):**  
    Uses unigrams and bigrams.
 
 3. **TfidfVectorizer with Unigrams:**  
    Uses TF-IDF weights for unigrams.
 
 4. **TfidfVectorizer with N-grams:**  
    Uses TF-IDF weights for unigrams and bigrams.
 
 For each representation, we create 4 classifiers: Logistic Regression, SVM (using LinearSVC), Random Forest, and Multinomial Naive Bayes.

In [None]:
# Factories (not shared instances) so every pipeline gets its own vectorizer.
VECTORIZER_FACTORIES = {
    # Binary word occurrence, unigrams only
    'count_unigram': lambda: CountVectorizer(binary=True, ngram_range=(1,1)),
    # Frequency counts over unigrams and bigrams
    'count_ngram': lambda: CountVectorizer(ngram_range=(1,2)),
    # TF-IDF weights, unigrams only
    'tfidf_unigram': lambda: TfidfVectorizer(ngram_range=(1,1)),
    # TF-IDF weights over unigrams and bigrams
    'tfidf_ngram': lambda: TfidfVectorizer(ngram_range=(1,2)),
}


def build_text_pipeline(representation, classifier):
    """Return a preprocessor -> vectorizer -> classifier Pipeline.

    Parameters:
    - representation: key of VECTORIZER_FACTORIES selecting the text representation
    - classifier: an unfitted scikit-learn classifier instance

    The step names ('preprocessor', 'vectorizer', 'classifier') are what
    grid-search parameter keys such as 'vectorizer__ngram_range' refer to.
    """
    return Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', VECTORIZER_FACTORIES[representation]()),
        ('classifier', classifier),
    ])


# Logistic Regression Pipelines
pipeline_count_unigram = build_text_pipeline('count_unigram', LogisticRegression(max_iter=1000))
pipeline_count_ngram = build_text_pipeline('count_ngram', LogisticRegression(max_iter=1000))
pipeline_tfidf_unigram = build_text_pipeline('tfidf_unigram', LogisticRegression(max_iter=1000))
pipeline_tfidf_ngram = build_text_pipeline('tfidf_ngram', LogisticRegression(max_iter=1000))

# SVM Pipelines (using LinearSVC)
# random_state is fixed so repeated runs give the same model, matching the
# seeded train/test split and Random Forest used elsewhere in this notebook.
pipeline_svm_count_unigram = build_text_pipeline('count_unigram', LinearSVC(max_iter=1000, random_state=42))
pipeline_svm_count_ngram = build_text_pipeline('count_ngram', LinearSVC(max_iter=1000, random_state=42))
pipeline_svm_tfidf_unigram = build_text_pipeline('tfidf_unigram', LinearSVC(max_iter=1000, random_state=42))
pipeline_svm_tfidf_ngram = build_text_pipeline('tfidf_ngram', LinearSVC(max_iter=1000, random_state=42))

# Random Forest Pipelines
pipeline_rf_count_unigram = build_text_pipeline('count_unigram', RandomForestClassifier(n_estimators=100, random_state=42))
pipeline_rf_count_ngram = build_text_pipeline('count_ngram', RandomForestClassifier(n_estimators=100, random_state=42))
pipeline_rf_tfidf_unigram = build_text_pipeline('tfidf_unigram', RandomForestClassifier(n_estimators=100, random_state=42))
pipeline_rf_tfidf_ngram = build_text_pipeline('tfidf_ngram', RandomForestClassifier(n_estimators=100, random_state=42))

# Naive Bayes (MultinomialNB) Pipelines
pipeline_nb_count_unigram = build_text_pipeline('count_unigram', MultinomialNB())
pipeline_nb_count_ngram = build_text_pipeline('count_ngram', MultinomialNB())
pipeline_nb_tfidf_unigram = build_text_pipeline('tfidf_unigram', MultinomialNB())
pipeline_nb_tfidf_ngram = build_text_pipeline('tfidf_ngram', MultinomialNB())

## Evaluating the Pipelines
 
 We define a helper function that fits a pipeline and returns evaluation metrics:
 
 - **Accuracy**  
 - **Precision** (weighted)  
 - **Recall** (weighted)  
 - **F1 Score** (weighted)
 
 Then, we loop over all pipelines, evaluate them on the test set, and compile the results into a comparison table.

In [None]:
def evaluate_pipeline_metrics(pipeline, X_train, X_test, y_train, y_test):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Returns a dict with the keys "Accuracy", "Precision", "Recall" and
    "F1 Score"; the last three are weighted averages computed with
    ``zero_division=0``.
    """
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    scores = {"Accuracy": accuracy_score(y_test, y_pred)}
    # The three remaining metrics share the same averaging options
    weighted_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in weighted_scorers:
        scores[label] = scorer(y_test, y_pred, average='weighted', zero_division=0)
    return scores

# Dictionary of all pipelines (every representation x classifier combination
# defined above, including the Random Forest and Naive Bayes ones)
pipelines = {
    "LR_Count_Binary_Unigram": pipeline_count_unigram,
    "LR_Count_Freq_Ngram": pipeline_count_ngram,
    "LR_Tfidf_Unigram": pipeline_tfidf_unigram,
    "LR_Tfidf_Ngram": pipeline_tfidf_ngram,
    "SVM_Count_Binary_Unigram": pipeline_svm_count_unigram,
    "SVM_Count_Freq_Ngram": pipeline_svm_count_ngram,
    "SVM_Tfidf_Unigram": pipeline_svm_tfidf_unigram,
    "SVM_Tfidf_Ngram": pipeline_svm_tfidf_ngram,
    "RF_Count_Binary_Unigram": pipeline_rf_count_unigram,
    "RF_Count_Freq_Ngram": pipeline_rf_count_ngram,
    "RF_Tfidf_Unigram": pipeline_rf_tfidf_unigram,
    "RF_Tfidf_Ngram": pipeline_rf_tfidf_ngram,
    "NB_Count_Binary_Unigram": pipeline_nb_count_unigram,
    "NB_Count_Freq_Ngram": pipeline_nb_count_ngram,
    "NB_Tfidf_Unigram": pipeline_nb_tfidf_unigram,
    "NB_Tfidf_Ngram": pipeline_nb_tfidf_ngram,
}

# Evaluate each pipeline and store one result row per pipeline
results = [
    {"Pipeline": name, **evaluate_pipeline_metrics(pipe, X_train, X_test, y_train, y_test)}
    for name, pipe in pipelines.items()
]

# Create a DataFrame of results sorted by F1 Score (best first).
# Built as one chain so re-running the cell never mutates an existing frame.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="F1 Score", ascending=False)
    .reset_index(drop=True)
)
print("### Model Comparison Table")
# Last expression of the cell: rendered as a rich table by the notebook
results_df

## Parameter Tuning with GridSearchCV [TO BE COMPLETED]
 
 Here, we perform grid search on a pipeline using `TfidfVectorizer` to tune parameters such as:
 
 - **ngram_range:** Unigrams vs. unigrams+bigrams.
 - **use_idf:** Whether to use the inverse document frequency reweighting.
 - **C:** Regularization strength for Logistic Regression.
 
 The grid search uses 5-fold cross-validation and optimizes for macro F1 score.


In [None]:
# Pipeline whose hyper-parameters are tuned below
pipeline_grid = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Search space; each key is "<pipeline step name>__<parameter name>"
param_grid = dict(
    vectorizer__ngram_range=[(1,1), (1,2)],
    vectorizer__use_idf=[True, False],
    classifier__C=[0.1, 1, 10],
)

# Grid search with 5 cross-validation folds, scored by macro-averaged F1
grid_search = GridSearchCV(
    estimator=pipeline_grid,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters from Grid Search:", grid_search.best_params_)

# Score the winning configuration on the held-out test set
best_model = grid_search.best_estimator_
predictions_best = best_model.predict(X_test)
print("### Evaluation of Best Model from Grid Search")
print(classification_report(y_test, predictions_best))