# Model Development
We want to develop and compare different models

In [1]:
# Built-in
import re
import pickle
import random

# Utils
from tqdm import tqdm

# Data science utils
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# NLP
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize

In [2]:
# UNCOMMMENT the line below if your system doesn't have wordnet
# nltk.download('wordnet')

## Preprocessing

In [3]:
# Import data
df = pd.read_json("../data/News_Category_Dataset_v3.json", orient = "records", lines = True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
# Selected only relevant columns
df_filtered = df[["headline", "category", "short_description"]].copy()

# Concatenate headline and short description together
df_filtered.loc[:, "text"] = df_filtered["headline"] + " " + df_filtered["short_description"]
df_filtered.drop(["headline","short_description"],axis=1,inplace=True)
df_filtered.head()

Unnamed: 0,category,text
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li..."
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
3,PARENTING,The Funniest Tweets From Parents This Week (Se...
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...


In [5]:
# Count the frequency in each category
def count_category_frequency(df, column_name):
    freq = df[column_name].value_counts()
    print(f"There are {len(freq)} categories")
    print(freq)

In [6]:
count_category_frequency(df_filtered, "category")

There are 42 categories
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS  

There are many unique categories in the dataset that are not commonly found. These include categories such as "Weird news," "Green," and "Fifty." Additionally, there is confusion about when a category should be classified as "World news" versus "World post," and when a category should be classified as "Money" versus "Business." To improve the applicability of the machine learning model to general datasets, **we will only retain categories that have more than 4000 articles**.

In [7]:
def filter_categories(df, threshold):
    # Make a copy of the dataframe
    data = df.copy()
    
    # Create a new column "count" that stores the count of each category
    data["count"] = data.groupby("category")["category"].transform("count")
    # Keep only the rows where the "count" column is greater than the specified threshold
    
    data = data[data["count"] > threshold]
    
    # Drop the "count" column
    data.drop(columns=["count"], inplace=True)
    
    return data

In [8]:
df_over_4000 = filter_categories(df_filtered, 4000)

In [9]:
# source for this func: https://www.kaggle.com/code/tarunchaubey/news-category-classification-machine-learning
# since preprocessing always look the same, we will just utilize existing code instead of reinvent the wheel

# preprocess text (removing stopwords and tokenizing)
def process_text(text):
    # convert text to lowercase, remove newlines and carriage returns, and strip leading/trailing whitespace
    text = text.lower().replace('\n',' ').replace('\r','').strip()
    
    # replace multiple spaces with single space
    text = re.sub(' +', ' ', text)
    
    # remove non-alphanumeric characters and digits
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'[0-9]','',text)
    
    # create set of english stopwords
    stop_words = set(stopwords.words('english')) 
    
    # tokenize text into words
    word_tokens = word_tokenize(text)
    
    # if word not in stops_words, add word to filtered_sentence
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    text = " ".join(filtered_sentence)
    return text

In [10]:
df_over_4000["text"] = df_over_4000["text"].apply(lambda x:process_text(x))

In [35]:
old_length = df_filtered["text"].apply(len).sum()
new_length = df_over_4000["text"].apply(len).sum()

In [39]:
print(f"Before: {old_length} characters")
print(f"After: {new_length} characters")
print(f"Total lost of: {old_length - new_length} characters")

Before: 36378921 characters
After: 17948453 characters
Total lost of: 18430468 characters


In [11]:
df_over_4000.head()

Unnamed: 0,category,text
2,COMEDY,funniest tweets cats dogs week sept dog dont u...
3,PARENTING,funniest tweets parents week sept accidentally...
17,SPORTS,maury wills basestealing shortstop dodgers die...
20,ENTERTAINMENT,golden globes returning nbc january year offai...
21,POLITICS,biden says us forces would defend taiwan china...


In [12]:
count_category_frequency(df_over_4000, "category")

There are 14 categories
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
Name: category, dtype: int64


The data is still imbalanced through. There are many different technique to counter this: 
- **Oversampling**: This technique involves duplicating instances of the underrepresented class(es) in the dataset to balance the class distribution. This can be done by randomly repeating instances of the underrepresented class(es) until the number of instances of each class is roughly equal.
- **Undersampling**: This technique involves removing instances of the overrepresented class(es) in the dataset to balance the class distribution. This can be done by randomly removing instances of the overrepresented class(es) until the number of instances of each class is roughly equal.
- **SMOTE**: (Synthetic Minority Over-sampling Technique) This is a more sophisticated oversampling technique that generates synthetic instances of the underrepresented class(es) by interpolating between existing instances of that class.
- **Data augmentation**: This technique involves generating new instances of the underrepresented class(es) by applying data transformation techniques such as synonym replacement, random insertion, random deletion, or random swap to the existing instances of the underrepresented class(es).

For our use case, **undersampling** is not suitable since we would lose a lot of valuable data in the categories "politics", "wellness" and "entertainment". **SMOTE** is not a suitable technique either because it operates in feature space, which means it generates synthetic data by interpolating between existing instances of the underrepresented class. However, in NLP problems, the feature space is large and high-dimensional, which makes it difficult for the KNN algorithm used by SMOTE to effectively identify similar instances. Additionally, the generated synthetic data may not be representative of real text data in the feature space, which may negatively impact the performance of the model. **Oversampling** can lead to overfitting, as the model becomes more sensitive to the duplicated instances of the minority class. Because of those reasons **we will use data augmentation with synonym replacement!**

In [20]:
# get number of samples for each category
class_counts = df_over_4000["category"].value_counts()

# threshold for sample
threshold = 10000

# get the list of categories that has less sample than threshold
underrepresented_classes = class_counts[class_counts < threshold].index.tolist()

print(f"There are {len(underrepresented_classes)} categories that have less than {threshold} samples")
print(underrepresented_classes)

There are 11 categories that have less than 10000 samples
['TRAVEL', 'STYLE & BEAUTY', 'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK', 'BUSINESS', 'COMEDY', 'SPORTS', 'BLACK VOICES', 'HOME & LIVING']


In [25]:
def generate_new_instance(text, label):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # error handling when text is not tokenizable
    if len(tokens) == 0:
        return text, label
    
    # Select a random token
    random_token = random.choice(tokens)
    
    # Synonym dict
    synonyms = wordnet.synsets(random_token)
    
    # when arent any synonyms for the token, then dont modify the text
    if len(synonyms) == 0:
        return text, label
    else:
        # Select a random synonym of the selected token
        random_synonym = random.choice(synonyms).lemmas()[0].name()
        # Replace the selected token with its synonym in the text
        new_text = text.replace(random_token, random_synonym)
        return new_text, label

In [26]:
# Initialize an empty list to store the additional instances
additional_instances = []

# Iterate through all the underrepresented classes
for label in tqdm(underrepresented_classes):
    
    # Determine the number of instances to generate for this class
    num_to_generate = threshold - class_counts[label]
    
    # Generate the new instances
    for i in range(num_to_generate):
        # select random text from the underrepresented class
        text = df_over_4000[df_over_4000["category"] == label]["text"].sample(n=1).values[0]
        # generate a new instance and label
        new_text, new_label = generate_new_instance(text, label)
        # append the new instance and label to the additional_instances list
        additional_instances.append((new_text, new_label))

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [10:25<00:00, 56.88s/it]


In [27]:
# Append the additional instances to the original dataset
df_aug = df_over_4000.append(pd.DataFrame(additional_instances, columns=["text", "category"]), ignore_index=True)

In [32]:
count_category_frequency(df_aug, "category")

There are 14 categories
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
COMEDY            10000
PARENTING         10000
SPORTS            10000
BUSINESS          10000
STYLE & BEAUTY    10000
FOOD & DRINK      10000
QUEER VOICES      10000
HOME & LIVING     10000
BLACK VOICES      10000
TRAVEL            10000
HEALTHY LIVING    10000
Name: category, dtype: int64


In [34]:
print(f"Number of rows previous {len(df_over_4000)}")
print(f"Number of rows after {len(df_aug)}")
print(f"In total we augmented {len(df_aug) - len(df_over_4000)} rows")

Number of rows previous 144167
Number of rows after 180909
In total we augmented 36742 rows


While inspecting the process, there are some disadvantages from data augmentation which could affect the model performance:
- **grammatical errors**: ...make things worse people started pitying -> ...make things bad people started pitying
- **nonsense**: rory mcilroy pulls olympics... -> rory mcilroy puff olympics...

But those disadvantages are minimal so its fine for our use case

In [41]:
# Run the line below to save the dataframe after preprocessing
# df_aug.to_csv("df_aug.csv", index=False, mode="wb")

## Linear model

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [44]:
X = df_aug["text"]
y = df_aug["category"]

In [45]:
# Splitting data into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2 ,random_state = 123)

### Naives Bayes
First model we try is a Naives Bayer Classifier. It is based on the assumption of independence between features, which makes it a "naive" classifier

In [46]:
# source: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
nb = Pipeline([("vect", CountVectorizer()),
               ("tfidf", TfidfTransformer()),
               ("clf", MultinomialNB()),
              ])
nb.fit(X_train, y_train)

- **The first step vect** uses the CountVectorizer class to convert the training data (X_train) into a sparse matrix of token counts. This step is also known as text vectorization. Text vectorization is the process of converting raw text data into a numerical format
- **The second step tfidf** uses the TfidfTransformer class to transform the token counts into the Tf-idf representation of the data. Tf-idf (term frequency-inverse document frequency) is a measure of the importance of a word in a document, with respect to an entire corpus of documents.
- **The final step clf**  uses the MultinomialNB class on the Tf-idf representation of the data

In [50]:
y_pred = nb.predict(X_test)
print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6089492012602952
                precision    recall  f1-score   support

  BLACK VOICES       0.92      0.22      0.36      1990
      BUSINESS       0.94      0.21      0.34      2014
        COMEDY       0.87      0.24      0.37      1971
 ENTERTAINMENT       0.61      0.75      0.67      3582
  FOOD & DRINK       0.87      0.74      0.80      1995
HEALTHY LIVING       0.95      0.04      0.07      2022
 HOME & LIVING       0.91      0.75      0.82      1991
     PARENTING       0.84      0.35      0.49      1993
      POLITICS       0.48      0.98      0.64      7000
  QUEER VOICES       0.97      0.35      0.51      2003
        SPORTS       0.91      0.56      0.69      2006
STYLE & BEAUTY       0.89      0.66      0.76      2011
        TRAVEL       0.89      0.56      0.69      2014
      WELLNESS       0.45      0.87      0.59      3590

      accuracy                           0.61     36182
     macro avg       0.82      0.52      0.56     36182
  weighted avg    

**Average accuracy is 0.61**! Since we have 14 categories, **the baseline would be 1 / 14 = 0.07**, when the model just randomly guesses! So the model itself is pretty good! Furthermore categories like Entertainment (0.67), Food & Drink (0.80) and Home & Living (0.82) seems easy to recognize. Let try to improve the model with hyperparameter tuning!

In [51]:
# Display hyperparameters that we can change
nb.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', MultinomialNB())],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'clf': MultinomialNB(),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__alpha': 1.0,
 'clf__class_prior': None,
 'clf__fit_prior': True}

In [52]:
parameters = {"vect__ngram_range": [(1, 1), (1, 2)],
              "tfidf__use_idf": (True, False),
              "clf__alpha": (1e-2, 1e-3),
}

# GridSearchCV performs an exhaustive search over all possible combinations
# This is much longer than RandomizedSearchCV
gs_clf = GridSearchCV(nb, parameters, cv=5)
gs_clf = gs_clf.fit(X_train, y_train)

- **vect__ngram_range**: Controls the range of n-grams that the vectorizer should consider
- **tfidf__use_idf**: Controls whether or not the Tf-idf transformer should use the idf weighting scheme.
- **clf__alpha**: Controls the regularization strength of the Naive Bayes classifier. Regularization is a technique used to prevent overfitting by adding a penalty term to the loss function.

In [53]:
y_pred = gs_clf.predict(X_test)
print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.8400585926703886
                precision    recall  f1-score   support

  BLACK VOICES       0.89      0.87      0.88      1990
      BUSINESS       0.90      0.80      0.85      2014
        COMEDY       0.86      0.80      0.83      1971
 ENTERTAINMENT       0.81      0.79      0.80      3582
  FOOD & DRINK       0.89      0.91      0.90      1995
HEALTHY LIVING       0.84      0.58      0.68      2022
 HOME & LIVING       0.92      0.95      0.94      1991
     PARENTING       0.79      0.72      0.75      1993
      POLITICS       0.85      0.93      0.89      7000
  QUEER VOICES       0.91      0.84      0.87      2003
        SPORTS       0.93      0.93      0.93      2006
STYLE & BEAUTY       0.88      0.82      0.85      2011
        TRAVEL       0.85      0.80      0.82      2014
      WELLNESS       0.66      0.85      0.75      3590

      accuracy                           0.84     36182
     macro avg       0.86      0.83      0.84     36182
  weighted avg    

An average accuracy of 0.84 is pretty good! There is an accuracy of at least 0.75 for each category

In [69]:
# Test the model by passing a chosen sentence
test_sentence = "The Funniest Tweets From Parents This Week"
test_sentence_process = process_text(test_sentence)
category_pred = gs_clf.predict([test_sentence_process])

print("Original sentence", test_sentence)
print("Test sentence after preprocessing:", test_sentence_process)
print("Predicted category:", category_pred[0])

Original sentence The Funniest Tweets From Parents This Week
Test sentence after preprocessing: funniest tweets parents week
Predicted category: PARENTING


The sentence wasnt randomly chosen. It could belong to the category parenting but **it would fit much better to comedy**. The model probably see the word parent and automatically assign it to "parenting" category. This is a weakness of Naives Bayes.

In [65]:
# Run the code below to export the model
# with open("naiveBayes.pkl", "wb") as f:
#     pickle.dump(gs_clf.best_estimator_, f)

### Linear SVM
Next linear model we will try is **Linear Support Vector Machine** . Linear Support Vector Machine (SVM) is a model that finds the best boundary to separate different classes in the feature space, by maximizing the distance between the boundary and the closest data points from each class. The SGDClassifier is a linear classifier that can also be used as a linear SVM by setting the loss parameter to "hinge" and the penalty parameter to "l2" and it learns from training data incrementally using stochastic gradient descent optimization.

In [72]:
# source: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
sgd = Pipeline([("vect", CountVectorizer()),
                ("tfidf", TfidfTransformer()),
                ("clf", SGDClassifier(loss="hinge", penalty="l2",alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

In [73]:
y_pred = sgd.predict(X_test)

print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6865844895251783
                precision    recall  f1-score   support

  BLACK VOICES       0.71      0.47      0.56      1990
      BUSINESS       0.76      0.48      0.59      2014
        COMEDY       0.68      0.37      0.48      1971
 ENTERTAINMENT       0.73      0.60      0.66      3582
  FOOD & DRINK       0.72      0.83      0.77      1995
HEALTHY LIVING       0.69      0.12      0.20      2022
 HOME & LIVING       0.73      0.82      0.77      1991
     PARENTING       0.69      0.68      0.68      1993
      POLITICS       0.62      0.94      0.75      7000
  QUEER VOICES       0.81      0.72      0.77      2003
        SPORTS       0.78      0.77      0.77      2006
STYLE & BEAUTY       0.71      0.79      0.75      2011
        TRAVEL       0.77      0.72      0.74      2014
      WELLNESS       0.62      0.72      0.66      3590

      accuracy                           0.69     36182
     macro avg       0.72      0.64      0.65     36182
  weighted avg    

**Average score is 0.69** which is 8% better than Naive Bayes without hyperparameter tuning. Let apply hyperparameter tuning to increase the accuracy.

In [74]:
# Display hyperparameters that we can change
sgd.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None))],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'clf': SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__alpha': 0.001,
 'clf__average': False,
 'clf__class_weight': None,
 'clf__early_stoppin

In [75]:
# Define the parameter grid for the SGDClassifier
param_grid = {'clf__alpha': [0.001, 0.01, 0.1],
              'clf__max_iter': [5, 10, 15],
              'clf__tol': [None, 1e-3, 1e-4]
             }

# Create the GridSearchCV object
gs_sgd = GridSearchCV(sgd, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the training data
gs_sgd.fit(X_train, y_train)

- **clf_alpha**: Controls the regularization strength. Regularization is a technique that helps to prevent overfitting by adding a penalty term to the loss function that the model is trying to minimize. The alpha parameter determines the strength of this penalty term. A smaller alpha means a stronger regularization, and a larger alpha means a weaker regularization. By setting alpha to different values, you can control the trade-off between fitting the training data well and preventing overfitting.
- **clf_max_iter**: Maximum number of iteration before the model stops.
- **clf_tol**: is the tolerance for stopping criterion. It is used to control the stopping criteria of the optimizer. When the loss or score is not improving by at least tol for n_iter_no_change consecutive iterations, the training process is stopped.

In [79]:
print("Best set of parameters: ", gs_sgd.best_params_)

Best set of parameters:  {'clf__alpha': 0.001, 'clf__max_iter': 10, 'clf__tol': 0.001}


In [76]:
y_pred = gs_sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6859764523796363
                precision    recall  f1-score   support

  BLACK VOICES       0.71      0.48      0.57      1990
      BUSINESS       0.76      0.48      0.59      2014
        COMEDY       0.69      0.37      0.48      1971
 ENTERTAINMENT       0.74      0.59      0.66      3582
  FOOD & DRINK       0.72      0.83      0.77      1995
HEALTHY LIVING       0.68      0.12      0.20      2022
 HOME & LIVING       0.72      0.82      0.77      1991
     PARENTING       0.69      0.68      0.68      1993
      POLITICS       0.61      0.94      0.74      7000
  QUEER VOICES       0.82      0.72      0.77      2003
        SPORTS       0.78      0.77      0.77      2006
STYLE & BEAUTY       0.72      0.79      0.75      2011
        TRAVEL       0.77      0.71      0.74      2014
      WELLNESS       0.61      0.73      0.66      3590

      accuracy                           0.69     36182
     macro avg       0.72      0.64      0.65     36182
  weighted avg    

**No improvement at all**! The default parameters are already the best. The only improvement is on software engineering site and the model is only 13mb big. Naive bayes before is 342mb big.

In [81]:
# Run the code below to export the model
# with open("linearSVM.pkl", "wb") as f:
#     pickle.dump(gs_sgd.best_estimator_, f)

In [84]:
# Let test the model
# Test the model by passing a chosen sentence
test_sentence = "The Funniest Tweets From Parents This Week"
test_sentence_process = process_text(test_sentence)
category_pred = gs_sgd.predict([test_sentence_process])

print("Original sentence", test_sentence)
print("Test sentence after preprocessing:", test_sentence_process)
print("Predicted category:", category_pred[0])

Original sentence The Funniest Tweets From Parents This Week
Test sentence after preprocessing: funniest tweets parents week
Predicted category: PARENTING


The model didn't make an improvement either

## Deep Learning
The first model we will build is a **sequential model with a fully connected neural network in combination with BOW (Bag of Words)**. With BOW a text document gets converted into a numerical vector, where each dimension of the vector represents a word (or n-gram) from the vocabulary and the value in the dimension represents the frequency of that word in the document. The BOW model generates a vocabulary from the text corpus and then represent each document as a vector of the word counts. The BOW model is simple and easy to implement, but **it does not take into account the order of words in the text**, which may be important for some tasks.

In [116]:
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras.utils import np_utils

In [130]:
df_aug.head()

Unnamed: 0,category,text
0,COMEDY,funniest tweets cats dogs week sept dog dont u...
1,PARENTING,funniest tweets parents week sept accidentally...
2,SPORTS,maury wills basestealing shortstop dodgers die...
3,ENTERTAINMENT,golden globes returning nbc january year offai...
4,POLITICS,biden says us forces would defend taiwan china...


In [131]:
print("Total length of df:", len(df_aug))

Total length of df: 180909


In [132]:
# Use 80% of the dataset for training
train_size = int(len(df_aug) * 0.8)
train_size

144727

In [133]:
# Splitting df into train and test set
train_text = df_aug["text"][:train_size]
train_category = df_aug["category"][:train_size]

test_text = df_aug["text"][train_size:]
test_category = df_aug["category"][train_size:]

### NN with BOW

In [134]:
# Set the maximum number of words to be included in the vocabulary
max_words = 1000
# Initialize the tokenizer
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
# Fit the tokenizer only on the train text to create the vocabulary
tokenize.fit_on_texts(train_text)

- **max_words** sets the maximum number of words to be included in the vocabulary used for tokenization. It has a big impact onto the data! If it is set too low, the model might not be able to learn enough about the underlying patterns in the data, as it would have an incomplete vocabulary.  On the other hand, if num_words is set too high, the model will have to learn from a larger vocabulary and might be more prone to overfitting, since it would be memorizing noise from the data rather than generalizing.
- **char_level** sets to false means that the tokenizer will split the text into words and create a vocabulary of all the unique words. Each word will be represented as a single token, and the tokenizer will take into account the word-level information. If it is set to true, the tokenizer will split the text into individual characters and create a vocabulary of all the unique characters


In [135]:
# Convert train text into numerical feature vectors using the vocabulary created earlier
x_train = tokenize.texts_to_matrix(train_text)
x_test = tokenize.texts_to_matrix(test_text)

In [136]:
# Initialize the label encoder 
# convert categorical variables (strings) into numerical variables (integers)
encoder = LabelEncoder()
# Fit the encoder on the train category to learn the mapping from the categories to integer labels
encoder.fit(train_category)

# Transform the category into integer labels using the mapping learned earlier
y_train = encoder.transform(train_category)
y_test = encoder.transform(test_category)

In [137]:
# Get the number of classes by finding the maximum label value in the train set and adding 1
# why? Current data look like: 1="Comedy", 2="POLITIC"
num_classes = np.max(y_train) + 1

# Data looks like [0 0 1] after one hot encoding
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

In [148]:
# source: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
# Build the model
model_nn = Sequential()
model_nn.add(Dense(512, input_shape=(max_words,)))
model_nn.add(Activation("relu"))
model_nn.add(Dropout(0.5))
model_nn.add(Dense(num_classes))
model_nn.add(Activation("softmax"))

What does the dropout layer do? The Dropout layer is a regularization technique for reducing overfitting in neural networks. It works by randomly "dropping out" or setting to zero a certain percentage of the neurons (specified by the dropout rate) during the training process. This means that these neurons will not be updated or participate in the forward or backward pass during training. By doing this, the model is forced to learn multiple independent representations of the data, rather than relying too heavily on any one neuron. This in turn reduces the chance of the model overfitting to the training data, and improves its generalization performance on unseen data.

In [149]:
model_nn.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

- **categorical crossentropy** is used as loss function. Loss function is used to measure the model's performance and categorical crossentropy computes the cross-entropy loss between true labels and predicted labels.
- **adam** is the optimizer and it adapts the learning rate for each parameter

In [150]:
# Number of samples per gradient update
batch_size = 32
# Number of times the model will cycle through the data
epochs = 4

In [151]:
history = model_nn.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [152]:
score = model_nn.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print("Test accuracy:", score[1])

Test accuracy: 0.49842461943626404


Accuracy of around 50% is **in comparison to Naives Bayes pretty bad**. It's probably because of batch_size and epochs are too small. Max_words with 1000 is also too small

In [157]:
# Let test the model

# Tokenize the example text
example_text = tokenize.texts_to_matrix(["The Funniest Tweets From Parents This Week"])

# Make a prediction using the model
predictions = model_nn.predict(example_text)

# Print the predictions
print(predictions)

[[0.02625877 0.01205577 0.16106945 0.14356118 0.00325501 0.04972285
  0.00556945 0.4435004  0.05745911 0.01528602 0.02839412 0.03697094
  0.00397735 0.01291951]]


In [158]:
# Get the index of the class with the highest probability
predicted_class_index = np.argmax(predictions)

# Get the corresponding class label from the label encoder
predicted_class_label = encoder.classes_[predicted_class_index]

# Get the maximum probability
predicted_class_prob = predictions[0][predicted_class_index]

# Print the predicted class label and its probability
print(predicted_class_label, predicted_class_prob)

PARENTING 0.4435004


At least the model gives the sample output like naive bayes and svm on the example sentence

In [166]:
# Uncomment the code below to export the model, tokenizer and encoder

# model_nn.save("model_nn.h5")
# with open("tokenizer_nn.pickle", "wb") as handle:
#     pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open("encoder_nn.pickle", "wb") as handle:
#     pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

### CNN with Embedding
The next model we will develop is a **CNN**. One of the key features of CNNs is that they have a hidden vector, which acts as a short-term memory, allowing them to consider the sequence of words in a sentence. However, this advantage comes at a cost of slower sequential processing compared to other methods. The code in the beginning with Tokenizing is almost the same like with but I will copy it here again, so we can run each model development independently from each other.

In [248]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import SimpleRNN, Bidirectional

In [267]:
# Use 80% of the dataset for training
train_size = int(len(df_aug) * 0.8)
train_size

144727

In [268]:
# Splitting df into train and test set
train_text = df_aug["text"][:train_size]
train_category = df_aug["category"][:train_size]

test_text = df_aug["text"][train_size:]
test_category = df_aug["category"][train_size:]

In [269]:
# We will increase this from 1000 -> 20000
max_words = 20000
# Initialize the tokenizer
tokenize_cnn = text.Tokenizer(num_words=max_words, char_level=False)

# Fit the tokenizer only on the train text to create the vocabulary
tokenize_cnn.fit_on_texts(train_text)

In [270]:
# required for Embedding
maxlen = 110

Why do we need to define maxlen?
<br>maxlen is used to define the maximum length of the input sequences that will be passed to the Embedding layer. The Embedding layer expects a fixed-length input, and maxlen specifies the maximum length of the input sequences.
<br>If the input sequence is shorter than maxlen, it will be padded with zeros to reach the specified length. If it is longer than maxlen, it will be truncated to maxlen words.
<br>This ensures that all input sequences passed to the Embedding layer have the same length and can be used as input to the model.
This is important because, in the given code, the Embedding layer is the first layer of the model and it expects a fixed-length input, so it is necessary to define the maxlen so that the Embedding layer can correctly handle the input.

In [271]:
# turn words into a list of sequences, where each sequence
# is a list of integers representing the words in the text
train_text_seq = tokenize_cnn.texts_to_sequences(train_text)
# pad the sequences to a fixed length of 110
train_text_padseq = pad_sequences(train_text_seq, maxlen=maxlen)

# same for test data
test_text_seq = tokenize_cnn.texts_to_sequences(test_text)
test_text_padseq = pad_sequences(test_text_seq, maxlen=maxlen)

Why do we need pad_sequences? pad_sequences ensures that all sequences have the same length and can be used as input to the CNN model. It also helps to handle variable length of text.

In [272]:
# Initialize the label encoder 
# convert categorical variables (strings) into numerical variables (integers)
encoder = LabelEncoder()
# Fit the encoder on the train category to learn the mapping from the categories to integer labels
encoder.fit(train_category)

# Transform the category into integer labels using the mapping learned earlier
y_train = encoder.transform(train_category)
y_test = encoder.transform(test_category)

In [273]:
# Get the number of classes by finding the maximum label value in the train set and adding 1
# why? Current data look like: 1="Comedy", 2="POLITIC"
num_classes = np.max(y_train) + 1

# Data looks like [0 0 1] after one hot encoding
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

In [274]:
# delete later
total_words = len(tokenize_cnn.word_index)
total_words

89936

In [275]:
# inspired by: https://www.kaggle.com/code/avikumart/nlp-news-articles-classif-wordembeddings-rnn

# basline model using embedding layers and simpleRNN
model_cnn = Sequential()
# 50 represents the number of dimensions in the embedding space.
# This means that each word in the vocabulary will be represented by a vector of 50 numbers
model_cnn.add(Embedding(max_words, 50, input_length=maxlen))
model_cnn.add(Bidirectional(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.20, activation="tanh", return_sequences=True)))
model_cnn.add(Bidirectional(SimpleRNN(64, dropout=0.3, recurrent_dropout=0.30, activation="tanh", return_sequences=True)))
model_cnn.add(SimpleRNN(32, activation="tanh"))
model_cnn.add(Dropout(0.4))
model_cnn.add(Dense(num_classes))
model_cnn.add(Activation("softmax"))
model_cnn.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 110, 50)           1000000   
                                                                 
 bidirectional_22 (Bidirecti  (None, 110, 128)         14720     
 onal)                                                           
                                                                 
 bidirectional_23 (Bidirecti  (None, 110, 128)         24704     
 onal)                                                           
                                                                 
 simple_rnn_35 (SimpleRNN)   (None, 32)                5152      
                                                                 
 dropout_15 (Dropout)        (None, 32)                0         
                                                                 
 dense_19 (Dense)            (None, 14)              

In [276]:
model_cnn.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [277]:
# Number of samples per gradient update
# larger batch size -> faster training -> overfitting
batch_size = 64
# Number of times the model will cycle through the data
epochs = 4

In [278]:
# fit model to the data
history = model_cnn.fit(train_text_padseq, y_train, 
                        batch_size=batch_size, 
                        epochs=epochs, 
                        validation_split=0.2)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [279]:
score = model_cnn.evaluate(test_text_padseq, y_test,
                       batch_size=batch_size, verbose=1)
print("Test accuracy:", score[1])

Test accuracy: 0.2380465418100357


**Accuracy of 0.23** is not good at all. This is pretty like guessing. Train accuracy of 0.6 and test accuracy of 0.23 means there is an overfitting. Dropout is already pretty high but we could increase it even more to prevent overfitting. Other options are decrease batch size and increase epochs. Also we could use LSTM or GRU but we will need to design a whole new architecture then.

In [280]:
# Test the model
sentence = "The Funniest Tweets From Parents This Week"

# Tokenize the sentence
sentence_seq = tokenize_cnn.texts_to_sequences([sentence])

# Pad the sentence to the same length as the training data
sentence_padseq = pad_sequences(sentence_seq, maxlen=maxlen)

# Make a prediction using the model
predictions = model_cnn.predict(sentence_padseq)

# Get the class with the highest probability
predicted_class = np.argmax(predictions)

# Convert the integer label back to the original category
predicted_category = encoder.inverse_transform([predicted_class])

# Print the result
print("Predicted category:", predicted_category[0])

Predicted category: HEALTHY LIVING


In [281]:
# Uncomment the code below to export the model, tokenizer and encoder
# model_cnn.save("model_cnn.h5")
# with open("tokenizer_cnn.pickle", "wb") as handle:
#     pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open("encoder_cnn.pickle", "wb") as handle:
#     pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Conclusion
Overall our use case performs better with a simpler model like Naive Bayes compared to a deep learning model like CNN. The poor performance of the CNN could be due to overfitting, which can be addressed by adjusting parameters such as the dropout rate. Futhermore we could optimise our cnn model by increasing max_len. Max_len of 110 is pretty small for our use case. For NN we could increase the max_word. 1000 was probably too small it probably leads to a loss of important information and result in less accurate predictions. Additionally, the data augmentation we performed in the beginning may not have had a positive impact on the model's performance and it may be more effective to train the model directly on the original dataset.