# Text Analytics Pipeline for Text Classification

This notebook demonstrates how to build a text analytics pipeline that includes text processing, feature extraction, classification, and evaluation.


In [None]:
# import pandas as pd

# # Read the dataset
# df = pd.read_csv("../Data/labelled_data_1.csv")

# # Define the classes in the desired order (adjust if needed)
# classes = ['neutral', 'negative', 'positive']

# # Total number of records desired
# total_records = 500
# n_classes = len(classes)

# # Calculate base count per class and remainder
# base_count = total_records // n_classes      # e.g. 500 // 3 = 166
# remainder = total_records % n_classes          # e.g. 500 % 3 = 2

# selected_dfs = []
# for i, cls in enumerate(classes):
#     # Filter records for the current class
#     cls_df = df[df['label_1'] == cls].copy()
    
#     # Sort by the similarity field in descending order (highest similarity first)
#     cls_df = cls_df.sort_values(by='similarity', ascending=False)
    
#     # Determine number of records to pick for this class.
#     # For the first "remainder" classes, add one extra record.
#     n_records = base_count + (1 if i < remainder else 0)
    
#     # Take the top n_records from the sorted dataframe
#     selected_dfs.append(cls_df.head(n_records))

# # Combine the subsets for each class into one dataframe
# balanced_subset = pd.concat(selected_dfs)

# # Optionally, shuffle the combined dataframe
# balanced_subset = balanced_subset.sample(frac=1, random_state=42).reset_index(drop=True)

# print("Balanced subset shape:", balanced_subset.shape)
# print(balanced_subset.head())
# balanced_subset.to_csv("../Data/TestDataOnly.csv", index=False)


In [None]:
# %pip install pandas numpy nltk emoji scikit-learn
# !python -m spacy download en_core_web_sm

In [5]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer  # corrected typo: WorldNetLemmatizer -> WordNetLemmatizer
from nltk.corpus import stopwords
import emoji
import spacy
import contractions

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import TruncatedSVD

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Custom Text Preprocessor

The custom transformer below:

 - **Emoji Conversion:** Converts any emojis to their text descriptions.
 - **Normalization:** Lowercases the text.
 - **Punctuation Removal:** Removes punctuation using regex.
 - **Tokenization:** Uses NLTK’s `word_tokenize`.
 - **Stop-word Removal:** Filters out English stopwords.
 - **Stemming:** Applies Porter stemming.
 
 The transformer implements `fit` and `transform` so that it can be used inside a scikit-learn pipeline.

In [6]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, do_stemming=True, do_lemmatization=False, remove_stopwords=True, 
                 do_emoji_conversion=True, use_spacy_tokenizer=True):
        """
        Parameters:
        - do_stemming: Apply stemming (reduces words to their root form)
        - do_lemmatization: Apply lemmatization (converts words to their canonical form)
          Note: When using the default (NLTK) tokenizer, if both do_lemmatization and do_stemming are enabled,
          lemmatization takes precedence.
        - remove_stopwords: Remove common stopwords
        - do_emoji_conversion: Convert emojis to text descriptions
        - use_spacy_tokenizer: Use a custom spaCy-based tokenizer (which already uses lemmatization)
        """
        self.do_stemming = do_stemming
        self.do_lemmatization = do_lemmatization
        self.remove_stopwords = remove_stopwords
        self.do_emoji_conversion = do_emoji_conversion
        self.use_spacy_tokenizer = use_spacy_tokenizer
        self.stemmer = PorterStemmer()
        if self.do_lemmatization:
            self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
        # Load the spaCy model if using the spaCy tokenizer
        if self.use_spacy_tokenizer:
            self.nlp = spacy.load("en_core_web_sm")
    
    def remove_links(self, text):
        """Remove URLs from text."""
        return re.sub(r'http[s]?://\S+|www\.\S+', '', text)
    
    def remove_user_mentions(self, text):
        """Remove user mentions from text."""
        return re.sub(r'u/\S+', '', text)
    
    def expand_contractions(self, text):
        """Expand contractions in the text."""
        return contractions.fix(text)
    
    def remove_non_ascii(self, text):
        """Remove non-ASCII characters from the text."""
        return text.encode("ascii", "ignore").decode()
    
    def remove_punctuations(self, text):
        """
        Remove or adjust punctuation in text.
        Replaces hyphens with space and ensures separation around punctuation.
        """
        text = re.sub(r'[-]', ' ', text)
        text = re.sub(r'(\S)[' + re.escape(string.punctuation) + r'](\S)', r'\1 \2', text)
        return text
    
    def remove_numbers(self, text):
        """Remove numbers from text."""
        return re.sub(r'[0-9]+', '', text)
    
    def emoji_to_text(self, text):
        """Convert emojis to text descriptions."""
        return emoji.demojize(text)
    
    def normalize(self, text):
        """Lowercase the text."""
        return text.lower()
    
    def tokenize(self, text):
        """
        Tokenize text using either a spaCy-based custom tokenizer or the default NLTK tokenizer.
        """
        if self.use_spacy_tokenizer:
            # Use spaCy's custom tokenization logic:
            doc = self.nlp(text)
            tokens = []
            # Add named entities as tokens
            for ent in doc.ents:
                tokens.append(ent.text)
            # Add non-entity tokens using their lemma
            non_entity_tokens = [token.lemma_.lower() for token in doc if not token.ent_type_ 
                                 and not token.is_punct and not token.is_space]
            tokens.extend(non_entity_tokens)
            if self.remove_stopwords:
                tokens = [token for token in tokens if token.lower() not in self.stop_words]
            if self.do_stemming:
                tokens = [self.stemmer.stem(token) for token in tokens]
            return tokens
        else:
            # Default NLTK-based tokenization:
            # Remove punctuation (if any remains) and then tokenize
            text = re.sub(r'[^\w\s]', '', text)
            tokens = word_tokenize(text)
            # Keep only alphabetic tokens
            tokens = [token for token in tokens if token.isalpha()]
            if self.remove_stopwords:
                tokens = [token for token in tokens if token.lower() not in self.stop_words]
            # Apply lemmatization if enabled; otherwise, apply stemming if enabled
            if self.do_lemmatization:
                tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
            elif self.do_stemming:
                tokens = [self.stemmer.stem(token) for token in tokens]
            return tokens
    
    def preprocess(self, text):
        """Apply the complete preprocessing pipeline to the text."""
        text = self.remove_links(text)
        text = self.remove_user_mentions(text)
        text = self.expand_contractions(text)
        text = self.remove_non_ascii(text)
        text = self.remove_punctuations(text)
        text = self.remove_numbers(text)
        if self.do_emoji_conversion:
            text = self.emoji_to_text(text)
        text = self.normalize(text)
        tokens = self.tokenize(text)
        return ' '.join(tokens)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.apply(self.preprocess)

 ## Data Loading and Train/Test Split
 
 We load the dataset and split it into training and testing sets.

In [9]:
# Read the dataset (make sure the file is in your working directory)
df = pd.read_csv("../Data/TestDataOnly.csv")

# Check available columns
print("Columns in dataset:", df.columns.tolist())

# Select the important columns and drop any missing values
df = df[['text', 'label_1']].dropna()
X = df['text']
y = df['label_1']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Columns in dataset: ['post_id', 'subreddit', 'post_title', 'post_body', 'number_of_comments', 'readable_datetime', 'post_author', 'number_of_upvotes', 'query', 'text', 'comment_id', 'comment_body', 'comment_author', 'Cleaned Text', 'similarity', 'label_1', 'score_1']


## Building Various Pipelines
 
 We create several pipelines:
 
 1. **CountVectorizer with Unigrams (Binary Representation):**  
    Uses binary occurrence of words.

 2. **CountVectorizer with Unigrams (Binary Representation):**  
    Uses binary occurrence of words.
 
 3. **TfidfVectorizer with Unigrams:**  
    Uses TF-IDF weights for unigrams.
 
 3. **TfidfVectorizer with N-grams:**  
    Uses TF-IDF weights for unigrams and bigrams.
 
 For each representation, we create 4 classifiers: Logistic Regression, SVM (using LinearSVC), RandomForest and Multtinomial Naive Bayes

In [10]:
#  Model Pipelines (Binary and TF-IDF, with and without SVD)


# --- Logistic Regression Pipelines ---
pipeline_lr_count_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('classifier', LogisticRegression(max_iter=1000))
])

pipeline_lr_count_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

pipeline_lr_tfidf_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('classifier', LogisticRegression(max_iter=1000))
])

pipeline_lr_tfidf_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('classifier', LogisticRegression(max_iter=1000))
])


# --- SVM Pipelines (using LinearSVC) ---
pipeline_svm_count_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('classifier', LinearSVC(max_iter=1000))
])

pipeline_svm_count_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', LinearSVC(max_iter=1000))
])

pipeline_svm_tfidf_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('classifier', LinearSVC(max_iter=1000))
])

pipeline_svm_tfidf_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('classifier', LinearSVC(max_iter=1000))
])


# --- Random Forest Pipelines ---
pipeline_rf_count_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline_rf_count_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline_rf_tfidf_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline_rf_tfidf_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])


# --- Naive Bayes Pipelines ---
pipeline_nb_count_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('classifier', MultinomialNB())
])

pipeline_nb_count_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', MultinomialNB())
])

pipeline_nb_tfidf_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('classifier', MultinomialNB())
])

pipeline_nb_tfidf_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('classifier', MultinomialNB())
])

### In this section we will define and add SVD to the pipelines (TruncatedSVD) for dimensionality reduction


In [None]:
# Define Helper to Insert SVD
# ---------------------------
def add_svd(pipeline, n_components=100):
    """
    Inserts a TruncatedSVD step right after the vectorizer.
    Assumes the pipeline has steps: preprocessor, vectorizer, classifier.
    """
    steps = pipeline.steps.copy()
    # Insert SVD at position 2 (right after vectorizer)
    steps.insert(2, ('svd', TruncatedSVD(n_components=n_components)))
    return Pipeline(steps)


# Create pipelines without SVD for NB
pipelines_no_svd = {
    # For models that don't use SVD (or for NB)
    "NB_Count_Binary_Unigram": pipeline_nb_count_unigram,
    "NB_Count_Binary_Ngram": pipeline_nb_count_ngram,
    "NB_Tfidf_Unigram": pipeline_nb_tfidf_unigram,
    "NB_Tfidf_Ngram": pipeline_nb_tfidf_ngram,
}

# For models other than MultinomialNB, with their SVD versions
other_pipelines = {
    "LR_Count_Binary_Unigram": pipeline_lr_count_unigram,
    "LR_Count_Binary_Ngram": pipeline_lr_count_ngram,
    "LR_Tfidf_Unigram": pipeline_lr_tfidf_unigram,
    "LR_Tfidf_Ngram": pipeline_lr_tfidf_ngram,
    
    "SVM_Count_Binary_Unigram": pipeline_svm_count_unigram,
    "SVM_Count_Binary_Ngram": pipeline_svm_count_ngram,
    "SVM_Tfidf_Unigram": pipeline_svm_tfidf_unigram,
    "SVM_Tfidf_Ngram": pipeline_svm_tfidf_ngram,
    
    "RF_Count_Binary_Unigram": pipeline_rf_count_unigram,
    "RF_Count_Binary_Ngram": pipeline_rf_count_ngram,
    "RF_Tfidf_Unigram": pipeline_rf_tfidf_unigram,
    "RF_Tfidf_Ngram": pipeline_rf_tfidf_ngram,
}

# Create SVD versions for non-NB pipelines
svd_pipelines = {name + "_SVD": add_svd(pipe) for name, pipe in other_pipelines.items()}

# Combine all pipelines
all_pipelines = {}
all_pipelines.update(pipelines_no_svd)
all_pipelines.update(other_pipelines)
all_pipelines.update(svd_pipelines)

## Evaluating the Pipelines
 
 We define a helper function that fits a pipeline and returns evaluation metrics:
 
 - **Accuracy**  
 - **Precision** (weighted)  
 - **Recall** (weighted)  
 - **F1 Score** (weighted)
 
 Then, we loop over all pipelines, evaluate them on the test set, and compile the results into a comparison table.

In [14]:
def evaluate_pipeline_metrics(pipeline, X_train, X_test, y_train, y_test):
    """Train the pipeline and return evaluation metrics."""
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    metrics = {
        "Accuracy": accuracy_score(y_test, predictions),
        "Precision": precision_score(y_test, predictions, average='weighted', zero_division=0),
        "Recall": recall_score(y_test, predictions, average='weighted', zero_division=0),
        "F1 Score": f1_score(y_test, predictions, average='weighted', zero_division=0)
    }
    return metrics


# Evaluate each pipeline and store results
results = []
for name, pipe in all_pipelines.items():
    metrics = evaluate_pipeline_metrics(pipe, X_train, X_test, y_train, y_test)
    row = {"Pipeline": name}
    row.update(metrics)
    results.append(row)

# Create a DataFrame of results and sort by F1 Score
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1 Score", ascending=False)
print("### Model Comparison Table")
results_df.reset_index(drop=True, inplace=True)
print(results_df)

### Model Comparison Table
                        Pipeline  Accuracy  Precision  Recall  F1 Score
0   SVM_Count_Binary_Unigram_SVD      0.57   0.583846    0.57  0.569560
1          NB_Count_Binary_Ngram      0.56   0.564354    0.56  0.561556
2               RF_Tfidf_Unigram      0.57   0.654931    0.57  0.559442
3      LR_Count_Binary_Ngram_SVD      0.55   0.593645    0.55  0.555728
4        NB_Count_Binary_Unigram      0.54   0.553021    0.54  0.543837
5    LR_Count_Binary_Unigram_SVD      0.54   0.547857    0.54  0.534145
6     SVM_Count_Binary_Ngram_SVD      0.53   0.547191    0.53  0.528756
7                SVM_Tfidf_Ngram      0.52   0.545145    0.52  0.521176
8           LR_Tfidf_Unigram_SVD      0.52   0.533534    0.52  0.513862
9               NB_Tfidf_Unigram      0.51   0.527618    0.51  0.512978
10            LR_Tfidf_Ngram_SVD      0.50   0.546250    0.50  0.504473
11           SVM_Tfidf_Ngram_SVD      0.50   0.538706    0.50  0.501718
12         SVM_Tfidf_Unigram_SVD     

## Cross-Validation Evaluation on Training Set

In [None]:
print("\n### 5-Fold Cross-Validation (F1 Macro) Scores") # Just kept F1 - Macro, can be changed to other metrics
for name, pipe in all_pipelines.items():
    cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=1)
    print(f"{name:35s}: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")


### 5-Fold Cross-Validation (F1 Macro) Scores
NB_Count_Binary_Unigram            : 0.5339 (+/- 0.0273)
NB_Count_Binary_Ngram              : 0.5152 (+/- 0.0274)
NB_Tfidf_Unigram                   : 0.4880 (+/- 0.0379)
NB_Tfidf_Ngram                     : 0.4810 (+/- 0.0198)
LR_Count_Binary_Unigram            : 0.5744 (+/- 0.0305)
LR_Count_Binary_Ngram              : 0.5410 (+/- 0.0370)
LR_Tfidf_Unigram                   : 0.5400 (+/- 0.0276)
LR_Tfidf_Ngram                     : 0.5392 (+/- 0.0373)
SVM_Count_Binary_Unigram           : 0.5314 (+/- 0.0178)
SVM_Count_Binary_Ngram             : 0.5082 (+/- 0.0261)
SVM_Tfidf_Unigram                  : 0.5401 (+/- 0.0496)
SVM_Tfidf_Ngram                    : 0.5338 (+/- 0.0301)
RF_Count_Binary_Unigram            : 0.5215 (+/- 0.0389)
RF_Count_Binary_Ngram              : 0.4559 (+/- 0.0419)
RF_Tfidf_Unigram                   : 0.5353 (+/- 0.0591)
RF_Tfidf_Ngram                     : 0.3814 (+/- 0.0629)
LR_Count_Binary_Unigram_SVD        : 0.57

## Parameter Tuning with GridSearchCV [TO BE VERIFIED]
 
 Here, we perform grid search on a pipeline using `TfidfVectorizer` to tune parameters such as:
 
 - **ngram_range:** Unigrams vs. unigrams+bigrams.
 - **use_idf:** Whether to use the inverse document frequency reweighting.
 - **C:** Regularization strength for Logistic Regression.
 
 The grid search uses 5-fold cross-validation and optimizes for macro F1 score.


In [19]:
pipeline_grid = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),  # we'll tune ngram_range and use_idf here
    ('svd', 'passthrough'),             # optional dimensionality reduction
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define a list of parameter grids, one for each classifier type
param_grid = [
    {
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': [TruncatedSVD(n_components=100), 'passthrough'],
        'classifier': [LogisticRegression(max_iter=1000)],
        'classifier__C': [0.1, 1, 10]
    },
    {
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': [TruncatedSVD(n_components=100), 'passthrough'],
        'classifier': [LinearSVC(max_iter=1000)],
        'classifier__C': [0.1, 1, 10]
    },
    {
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': [TruncatedSVD(n_components=100), 'passthrough'],
        'classifier': [RandomForestClassifier(n_estimators=100, random_state=42)],
        'classifier__max_depth': [None, 10, 20]
    },
    {
        # For MultinomialNB, force SVD to passthrough to avoid negative values.
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': ['passthrough'],  # Do not apply SVD
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.5, 1.0, 1.5]
    }
]

# Perform Grid Search with n_jobs=1 to avoid pickling issues.
grid_search = GridSearchCV(pipeline_grid, param_grid, cv=5, scoring='f1_macro', n_jobs=1, verbose=1)
grid_search.fit(X_train, y_train)

print("\nBest Parameters from Grid Search:")
print(grid_search.best_params_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions_best = best_model.predict(X_test)
print("\n### Evaluation of Best Model from Grid Search")
print(classification_report(y_test, predictions_best))


Fitting 5 folds for each of 84 candidates, totalling 420 fits

Best Parameters from Grid Search:
{'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 1, 'svd': TruncatedSVD(n_components=100), 'vectorizer__ngram_range': (1, 1), 'vectorizer__use_idf': False}

### Evaluation of Best Model from Grid Search
              precision    recall  f1-score   support

    negative       0.54      0.37      0.44        35
     neutral       0.46      0.83      0.59        29
    positive       0.71      0.47      0.57        36

    accuracy                           0.54       100
   macro avg       0.57      0.56      0.53       100
weighted avg       0.58      0.54      0.53       100

