# Text Analytics Pipeline for Text Classification

This notebook demonstrates how to build a text analytics pipeline that includes text processing, feature extraction, classification, and evaluation.


In [29]:
# %pip install pandas numpy nltk emoji scikit-learn
# !python -m spacy download en_core_web_sm

In [30]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer  
from nltk.corpus import stopwords
import emoji
import spacy
import contractions

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import TruncatedSVD
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Custom Text Preprocessor

The custom transformer below:

 - **Emoji Conversion:** Converts any emojis to their text descriptions.
 - **Normalization:** Lowercases the text.
 - **Punctuation Removal:** Removes punctuation using regex.
 - **Tokenization:** Uses NLTK’s `word_tokenize`.
 - **Stop-word Removal:** Filters out English stopwords.
 - **Stemming:** Applies Porter stemming.
 
 The transformer implements `fit` and `transform` so that it can be used inside a scikit-learn pipeline.

In [46]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, do_stemming=True, do_lemmatization=False, remove_stopwords=True, 
                 do_emoji_conversion=True, use_spacy_tokenizer=True):
        """
        Parameters:
        - do_stemming: Apply stemming (reduces words to their root form)
        - do_lemmatization: Apply lemmatization (converts words to their canonical form)
          Note: When using the default (NLTK) tokenizer, if both do_lemmatization and do_stemming are enabled,
          lemmatization takes precedence.
        - remove_stopwords: Remove common stopwords
        - do_emoji_conversion: Convert emojis to text descriptions
        - use_spacy_tokenizer: Use a custom spaCy-based tokenizer (which already uses lemmatization)
        """
        self.do_stemming = do_stemming
        self.do_lemmatization = do_lemmatization
        self.remove_stopwords = remove_stopwords
        self.do_emoji_conversion = do_emoji_conversion
        self.use_spacy_tokenizer = use_spacy_tokenizer
        self.stemmer = PorterStemmer()
        if self.do_lemmatization:
            self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
        # Load the spaCy model if using the spaCy tokenizer
        if self.use_spacy_tokenizer:
            self.nlp = spacy.load("en_core_web_sm")
    
    def remove_links(self, text):
        """Remove URLs from text."""
        return re.sub(r'http[s]?://\S+|www\.\S+', '', text)
    
    def remove_user_mentions(self, text):
        """Remove user mentions from text."""
        return re.sub(r'u/\S+', '', text)
    
    def expand_contractions(self, text):
        """Expand contractions in the text."""
        return contractions.fix(text)
    
    def remove_non_ascii(self, text):
        """Remove non-ASCII characters from the text."""
        return text.encode("ascii", "ignore").decode()
    
    def remove_punctuations(self, text):
        """
        Remove or adjust punctuation in text.
        Replaces hyphens with space and ensures separation around punctuation.
        """
        text = re.sub(r'[-]', ' ', text)
        text = re.sub(r'(\S)[' + re.escape(string.punctuation) + r'](\S)', r'\1 \2', text)
        return text
    
    def remove_numbers(self, text):
        """Remove numbers from text."""
        return re.sub(r'[0-9]+', '', text)
    
    def emoji_to_text(self, text):
        """Convert emojis to text descriptions."""
        return emoji.demojize(text)
    
    def normalize(self, text):
        """Lowercase the text."""
        return text.lower()
    
    def tokenize(self, text):
        """
        Tokenize text using either a spaCy-based custom tokenizer or the default NLTK tokenizer.
        """
        if self.use_spacy_tokenizer:
            # Use spaCy's custom tokenization logic:
            doc = self.nlp(text)
            tokens = []
            # Add named entities as tokens
            for ent in doc.ents:
                tokens.append(ent.text)
            # Add non-entity tokens using their lemma
            non_entity_tokens = [token.lemma_.lower() for token in doc if not token.ent_type_ 
                                 and not token.is_punct and not token.is_space]
            tokens.extend(non_entity_tokens)
            if self.remove_stopwords:
                tokens = [token for token in tokens if token.lower() not in self.stop_words]
            if self.do_stemming:
                tokens = [self.stemmer.stem(token) for token in tokens]
            return tokens
        else:
            # Default NLTK-based tokenization:
            # Remove punctuation (if any remains) and then tokenize
            text = re.sub(r'[^\w\s]', '', text)
            tokens = word_tokenize(text)
            # Keep only alphabetic tokens
            tokens = [token for token in tokens if token.isalpha()]
            if self.remove_stopwords:
                tokens = [token for token in tokens if token.lower() not in self.stop_words]
            # Apply lemmatization if enabled; otherwise, apply stemming if enabled
            if self.do_lemmatization:
                tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
            elif self.do_stemming:
                tokens = [self.stemmer.stem(token) for token in tokens]
            return tokens
    
    def preprocess(self, text):
        """Apply the complete preprocessing pipeline to the text."""
        text = self.remove_links(text)
        text = self.remove_user_mentions(text)
        text = self.expand_contractions(text)
        text = self.remove_non_ascii(text)
        text = self.remove_punctuations(text)
        text = self.remove_numbers(text)
        if self.do_emoji_conversion:
            text = self.emoji_to_text(text)
        text = self.normalize(text)
        tokens = self.tokenize(text)
        return ' '.join(tokens)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.apply(self.preprocess)

GloVe-based Word embeddings representation

In [32]:
class GloveVectorizer(BaseEstimator, TransformerMixin):
    """
    Loading pre-trained GloVe embeddings and returns the average embedding vector for each document.
    """
    def __init__(self, glove_file='glove.twitter.27B.50d.txt', embedding_dim=50):
        self.glove_file = glove_file
        self.embedding_dim = embedding_dim

    def fit(self, X, y=None):
        self.embeddings_index = {}
        with open(self.glove_file, encoding="utf8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = coefs
        return self

    def transform(self, X):
        vectors = []
        for doc in X:
            # Since TextPreprocessor returns a space-separated string of tokens,
            # we can simply split on spaces.
            tokens = doc.split()
            token_vecs = [self.embeddings_index[token] for token in tokens if token in self.embeddings_index]
            if token_vecs:
                doc_vec = np.mean(token_vecs, axis=0)
            else:
                doc_vec = np.zeros(self.embedding_dim)
            vectors.append(doc_vec)
        return np.array(vectors)

 ## Data Loading and Train/Test Split
 
 We load the dataset and split it into training and testing sets.

In [33]:
# Read the dataset (make sure the file is in your working directory)
df = pd.read_csv("../Data/labelled_data.csv")

# Check available columns
print("Columns in dataset:", df.columns.tolist())

# Select the important columns and drop any missing values
df = df[['text', 'label']].dropna()
X = df['text']
y = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Columns in dataset: ['post_id', 'subreddit', 'post_title', 'post_body', 'number_of_comments', 'readable_datetime', 'post_author', 'number_of_upvotes', 'query', 'text', 'comment_id', 'comment_body', 'comment_author', 'label']


## Building Various Pipelines
 
 We create several pipelines:
 
 1. **CountVectorizer with Unigrams (Binary Representation):**  
    Uses binary occurrence of words.

 2. **CountVectorizer with Unigrams (Binary Representation):**  
    Uses binary occurrence of words.
 
 3. **TfidfVectorizer with Unigrams:**  
    Uses TF-IDF weights for unigrams.
 
 3. **TfidfVectorizer with N-grams:**  
    Uses TF-IDF weights for unigrams and bigrams.
 
 For each representation, we create 4 classifiers: Logistic Regression, SVM (using LinearSVC), RandomForest and Multtinomial Naive Bayes

In [34]:
#  Model Pipelines (Binary and TF-IDF, with and without SVD)


# --- Logistic Regression Pipelines (with weighted balancing) ---
pipeline_lr_count_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

pipeline_lr_count_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

pipeline_lr_tfidf_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

pipeline_lr_tfidf_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])


# --- SVM Pipelines (using LinearSVC with weighted balancing) ---
pipeline_svm_count_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('classifier', LinearSVC(max_iter=1000, class_weight='balanced'))
])

pipeline_svm_count_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', LinearSVC(max_iter=1000, class_weight='balanced'))
])

pipeline_svm_tfidf_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('classifier', LinearSVC(max_iter=1000, class_weight='balanced'))
])

pipeline_svm_tfidf_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('classifier', LinearSVC(max_iter=1000, class_weight='balanced'))
])


# --- Random Forest Pipelines (with weighted balancing) ---
pipeline_rf_count_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

pipeline_rf_count_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

pipeline_rf_tfidf_unigram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

pipeline_rf_tfidf_ngram = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])


# --- Naive Bayes Pipelines (with Random Under-Sampling) ---
pipeline_nb_count_unigram = ImbPipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,1))),
    ('sampler', RandomUnderSampler(random_state=42)),
    ('classifier', MultinomialNB())
])

pipeline_nb_count_ngram = ImbPipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('sampler', RandomUnderSampler(random_state=42)),
    ('classifier', MultinomialNB())
])

pipeline_nb_tfidf_unigram = ImbPipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1))),
    ('sampler', RandomUnderSampler(random_state=42)),
    ('classifier', MultinomialNB())
])

pipeline_nb_tfidf_ngram = ImbPipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,2))),
    ('sampler', RandomUnderSampler(random_state=42)),
    ('classifier', MultinomialNB())
])


### In this section we will define and add SVD to the pipelines (TruncatedSVD) for dimensionality reduction


In [35]:
# Define Helper to Insert SVD
# ---------------------------
def add_svd(pipeline, n_components=100):
    """
    Inserts a TruncatedSVD step right after the vectorizer.
    Assumes the pipeline has steps: preprocessor, vectorizer, classifier.
    """
    steps = pipeline.steps.copy()
    # Insert SVD at position 2 (right after vectorizer)
    steps.insert(2, ('svd', TruncatedSVD(n_components=n_components)))
    return Pipeline(steps)


# Create pipelines without SVD for NB
pipelines_no_svd = {
    # For models that don't use SVD (or for NB)
    "NB_Count_Binary_Unigram": pipeline_nb_count_unigram,
    "NB_Count_Binary_Ngram": pipeline_nb_count_ngram,
    "NB_Tfidf_Unigram": pipeline_nb_tfidf_unigram,
    "NB_Tfidf_Ngram": pipeline_nb_tfidf_ngram,
}

# For models other than MultinomialNB, with their SVD versions
other_pipelines = {
    "LR_Count_Binary_Unigram": pipeline_lr_count_unigram,
    "LR_Count_Binary_Ngram": pipeline_lr_count_ngram,
    "LR_Tfidf_Unigram": pipeline_lr_tfidf_unigram,
    "LR_Tfidf_Ngram": pipeline_lr_tfidf_ngram,
    
    "SVM_Count_Binary_Unigram": pipeline_svm_count_unigram,
    "SVM_Count_Binary_Ngram": pipeline_svm_count_ngram,
    "SVM_Tfidf_Unigram": pipeline_svm_tfidf_unigram,
    "SVM_Tfidf_Ngram": pipeline_svm_tfidf_ngram,
    
    "RF_Count_Binary_Unigram": pipeline_rf_count_unigram,
    "RF_Count_Binary_Ngram": pipeline_rf_count_ngram,
    "RF_Tfidf_Unigram": pipeline_rf_tfidf_unigram,
    "RF_Tfidf_Ngram": pipeline_rf_tfidf_ngram,
}

# Create SVD versions for non-NB pipelines
svd_pipelines = {name + "_SVD": add_svd(pipe) for name, pipe in other_pipelines.items()}

# Combine all pipelines
all_pipelines = {}
all_pipelines.update(pipelines_no_svd)
all_pipelines.update(other_pipelines)
all_pipelines.update(svd_pipelines)

### Pipelines Using GloVe Word Embeddings

In [36]:
# They return a fixed-length embedding for each document (by averaging word embeddings).

pipeline_glove_lr = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('glove', GloveVectorizer(glove_file='glove.twitter.27B.50d.txt', embedding_dim=50)),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

pipeline_glove_svm = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('glove', GloveVectorizer(glove_file='glove.twitter.27B.50d.txt', embedding_dim=50)),
    ('classifier', LinearSVC(max_iter=1000, class_weight='balanced'))
])

pipeline_glove_rf = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('glove', GloveVectorizer(glove_file='glove.twitter.27B.50d.txt', embedding_dim=50)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

# Add the Glove pipelines to our all_pipelines dictionary
all_pipelines["Glove_LR"] = pipeline_glove_lr
all_pipelines["Glove_SVM"] = pipeline_glove_svm
all_pipelines["Glove_RF"] = pipeline_glove_rf

## Evaluating the Pipelines
 
 We define a helper function that fits a pipeline and returns evaluation metrics:
 
 - **Accuracy**  
 - **Precision** (weighted)  
 - **Recall** (weighted)  
 - **F1 Score** (weighted)
 
 Then, we loop over all pipelines, evaluate them on the test set, and compile the results into a comparison table.

In [37]:
def evaluate_pipeline_metrics(pipeline, X_train, X_test, y_train, y_test):
    """Train the pipeline and return evaluation metrics."""
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    metrics = {
        "Accuracy": accuracy_score(y_test, predictions),
        "Precision": precision_score(y_test, predictions, average='weighted', zero_division=0),
        "Recall": recall_score(y_test, predictions, average='weighted', zero_division=0),
        "F1 Score": f1_score(y_test, predictions, average='weighted', zero_division=0)
    }
    return metrics


# Evaluate each pipeline and store results
results = []
for name, pipe in all_pipelines.items():
    metrics = evaluate_pipeline_metrics(pipe, X_train, X_test, y_train, y_test)
    row = {"Pipeline": name}
    row.update(metrics)
    results.append(row)

# Create a DataFrame of results and sort by F1 Score
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1 Score", ascending=False)
print("### Model Comparison Table")
results_df.reset_index(drop=True, inplace=True)
print(results_df)

### Model Comparison Table
                        Pipeline  Accuracy  Precision    Recall  F1 Score
0                SVM_Tfidf_Ngram  0.762849   0.742287  0.762849  0.738133
1          LR_Count_Binary_Ngram  0.747520   0.728635  0.747520  0.732432
2                 LR_Tfidf_Ngram  0.732191   0.729445  0.732191  0.730574
3              SVM_Tfidf_Unigram  0.738503   0.725265  0.738503  0.730212
4         SVM_Count_Binary_Ngram  0.746619   0.721449  0.746619  0.722890
5        LR_Count_Binary_Unigram  0.715059   0.723156  0.715059  0.718613
6       SVM_Count_Binary_Unigram  0.712353   0.707297  0.712353  0.709597
7               LR_Tfidf_Unigram  0.696123   0.722199  0.696123  0.706270
8          SVM_Tfidf_Unigram_SVD  0.699729   0.674637  0.699729  0.682471
9                      Glove_SVM  0.689811   0.667190  0.689811  0.675243
10  SVM_Count_Binary_Unigram_SVD  0.688909   0.660859  0.688909  0.667307
11           SVM_Tfidf_Ngram_SVD  0.687106   0.650704  0.687106  0.659032
12    SVM_C

## Parameter Tuning with GridSearchCV [TO BE VERIFIED]
 
 Here, we perform grid search on a pipeline using `TfidfVectorizer` to tune parameters such as:
 
 - **ngram_range:** Unigrams vs. unigrams+bigrams.
 - **use_idf:** Whether to use the inverse document frequency reweighting.
 - **C:** Regularization strength for Logistic Regression.
 
 The grid search uses 5-fold cross-validation and optimizes for macro F1 score.


In [48]:
pipeline_grid = ImbPipeline([
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),  # We'll tune various vectorizer parameters here
    ('svd', 'passthrough'),             # Optional dimensionality reduction step
    ('sampler', 'passthrough'),         # Placeholder; will be set to RandomUnderSampler for NB in grid search
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define a parameter grid that explores various options including different classifiers and balancing.
param_grid = [
    {
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': [TruncatedSVD(n_components=100), 'passthrough'],
        'classifier': [LogisticRegression(max_iter=1000)],
        'classifier__C': [0.1, 1, 10],
        'classifier__class_weight': ['balanced']
    },
    {
        # Parameters for LinearSVC (using class_weight balancing)
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': [TruncatedSVD(n_components=100), 'passthrough'],
        'classifier': [LinearSVC(max_iter=1000)],
        'classifier__C': [0.1, 1, 10],
        'classifier__class_weight': ['balanced']
    },
    {
        # Parameters for RandomForestClassifier (using class_weight balancing)
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': [TruncatedSVD(n_components=100), 'passthrough'],
        'classifier': [RandomForestClassifier(n_estimators=100, random_state=42)],
        'classifier__max_depth': [None, 10, 20],
        'classifier__class_weight': ['balanced']
    },
    {
        # Parameters for MultinomialNB (avoiding SVD to prevent negative values)
        # Use RandomUnderSampler for balancing
        'vectorizer__ngram_range': [(1,1), (1,2)],
        'vectorizer__use_idf': [True, False],
        'svd': ['passthrough'],  # No SVD for NB
        'sampler': [RandomUnderSampler(random_state=42)],  # Apply under-sampling to balance classes
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.5, 1.0, 1.5],
        
    }
]

# Create the GridSearchCV object using 5-fold cross-validation on the training set

grid_search = GridSearchCV(pipeline_grid, param_grid, cv=5, scoring='f1_macro', n_jobs=1, verbose=1)
grid_search.fit(X_train, y_train)
print("\nBest Parameters from Grid Search:")
print(grid_search.best_params_)

Fitting 5 folds for each of 84 candidates, totalling 420 fits





Best Parameters from Grid Search:
{'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 1, 'classifier__class_weight': 'balanced', 'svd': 'passthrough', 'vectorizer__ngram_range': (1, 1), 'vectorizer__use_idf': True}


### Evaluating the best parameters on 20% test set (from 80 - 20 split)

In [49]:
print("\nBest Parameters from Grid Search:")
print(grid_search.best_params_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions_best = best_model.predict(X_test)
print("\n### Evaluation of Best Model from Grid Search")
print(classification_report(y_test, predictions_best))


Best Parameters from Grid Search:
{'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 1, 'classifier__class_weight': 'balanced', 'svd': 'passthrough', 'vectorizer__ngram_range': (1, 1), 'vectorizer__use_idf': True}

### Evaluation of Best Model from Grid Search
              precision    recall  f1-score   support

          -1       0.50      0.56      0.53       229
           0       0.83      0.76      0.79       779
           1       0.38      0.53      0.44       101

    accuracy                           0.70      1109
   macro avg       0.57      0.62      0.59      1109
weighted avg       0.72      0.70      0.71      1109

