# Model Development
We want to develop and compare different models

In [1]:
# Built-in
import re
import pickle
import random

# Utils
from tqdm import tqdm

# Data science utils
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# NLP
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize

In [2]:
# UNCOMMMENT the line below if your system doesn't have wordnet
# nltk.download('wordnet')

## Preprocessing

In [3]:
# Import data
df = pd.read_json("../data/News_Category_Dataset_v3.json", orient = "records", lines = True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
# Selected only relevant columns
df_filtered = df[["headline", "category", "short_description"]].copy()

# Concatenate headline and short description together
df_filtered.loc[:, "text"] = df_filtered["headline"] + " " + df_filtered["short_description"]
df_filtered.drop(["headline","short_description"],axis=1,inplace=True)
df_filtered.head()

Unnamed: 0,category,text
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li..."
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
3,PARENTING,The Funniest Tweets From Parents This Week (Se...
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...


In [5]:
# Count the frequency in each category
def count_category_frequency(df, column_name):
    freq = df[column_name].value_counts()
    print(f"There are {len(freq)} categories")
    print(freq)

In [6]:
count_category_frequency(df_filtered, "category")

There are 42 categories
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS  

There are many unique categories in the dataset that are not commonly found. These include categories such as "Weird news," "Green," and "Fifty." Additionally, there is confusion about when a category should be classified as "World news" versus "World post," and when a category should be classified as "Money" versus "Business." To improve the applicability of the machine learning model to general datasets, **we will only retain categories that have more than 4000 articles**.

In [7]:
def filter_categories(df, threshold):
    # Make a copy of the dataframe
    data = df.copy()
    
    # Create a new column "count" that stores the count of each category
    data["count"] = data.groupby("category")["category"].transform("count")
    # Keep only the rows where the "count" column is greater than the specified threshold
    
    data = data[data["count"] > threshold]
    
    # Drop the "count" column
    data.drop(columns=["count"], inplace=True)
    
    return data

In [8]:
df_over_4000 = filter_categories(df_filtered, 4000)

In [9]:
# source for this func: https://www.kaggle.com/code/tarunchaubey/news-category-classification-machine-learning
# since preprocessing always look the same, we will just utilize existing code instead of reinvent the wheel

# preprocess text (removing stopwords and tokenizing)
def process_text(text):
    # convert text to lowercase, remove newlines and carriage returns, and strip leading/trailing whitespace
    text = text.lower().replace('\n',' ').replace('\r','').strip()
    
    # replace multiple spaces with single space
    text = re.sub(' +', ' ', text)
    
    # remove non-alphanumeric characters and digits
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'[0-9]','',text)
    
    # create set of english stopwords
    stop_words = set(stopwords.words('english')) 
    
    # tokenize text into words
    word_tokens = word_tokenize(text)
    
    # if word not in stops_words, add word to filtered_sentence
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    text = " ".join(filtered_sentence)
    return text

In [10]:
df_over_4000["text"] = df_over_4000["text"].apply(lambda x:process_text(x))

In [35]:
old_length = df_filtered["text"].apply(len).sum()
new_length = df_over_4000["text"].apply(len).sum()

In [39]:
print(f"Before: {old_length} characters")
print(f"After: {new_length} characters")
print(f"Total lost of: {old_length - new_length} characters")

Before: 36378921 characters
After: 17948453 characters
Total lost of: 18430468 characters


In [11]:
df_over_4000.head()

Unnamed: 0,category,text
2,COMEDY,funniest tweets cats dogs week sept dog dont u...
3,PARENTING,funniest tweets parents week sept accidentally...
17,SPORTS,maury wills basestealing shortstop dodgers die...
20,ENTERTAINMENT,golden globes returning nbc january year offai...
21,POLITICS,biden says us forces would defend taiwan china...


In [12]:
count_category_frequency(df_over_4000, "category")

There are 14 categories
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
Name: category, dtype: int64


The data is still imbalanced through. There are many different technique to counter this: 
- **Oversampling**: This technique involves duplicating instances of the underrepresented class(es) in the dataset to balance the class distribution. This can be done by randomly repeating instances of the underrepresented class(es) until the number of instances of each class is roughly equal.
- **Undersampling**: This technique involves removing instances of the overrepresented class(es) in the dataset to balance the class distribution. This can be done by randomly removing instances of the overrepresented class(es) until the number of instances of each class is roughly equal.
- **SMOTE**: (Synthetic Minority Over-sampling Technique) This is a more sophisticated oversampling technique that generates synthetic instances of the underrepresented class(es) by interpolating between existing instances of that class.
- **Data augmentation**: This technique involves generating new instances of the underrepresented class(es) by applying data transformation techniques such as synonym replacement, random insertion, random deletion, or random swap to the existing instances of the underrepresented class(es).

For our use case, **undersampling** is not suitable since we would lose a lot of valuable data in the categories "politics", "wellness" and "entertainment". **SMOTE** is not a suitable technique either because it operates in feature space, which means it generates synthetic data by interpolating between existing instances of the underrepresented class. However, in NLP problems, the feature space is large and high-dimensional, which makes it difficult for the KNN algorithm used by SMOTE to effectively identify similar instances. Additionally, the generated synthetic data may not be representative of real text data in the feature space, which may negatively impact the performance of the model. **Oversampling** can lead to overfitting, as the model becomes more sensitive to the duplicated instances of the minority class. Because of those reasons **we will use data augmentation with synonym replacement!**

In [20]:
# get number of samples for each category
class_counts = df_over_4000["category"].value_counts()

# threshold for sample
threshold = 10000

# get the list of categories that has less sample than threshold
underrepresented_classes = class_counts[class_counts < threshold].index.tolist()

print(f"There are {len(underrepresented_classes)} categories that have less than {threshold} samples")
print(underrepresented_classes)

There are 11 categories that have less than 10000 samples
['TRAVEL', 'STYLE & BEAUTY', 'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK', 'BUSINESS', 'COMEDY', 'SPORTS', 'BLACK VOICES', 'HOME & LIVING']


In [25]:
def generate_new_instance(text, label):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # error handling when text is not tokenizable
    if len(tokens) == 0:
        return text, label
    
    # Select a random token
    random_token = random.choice(tokens)
    
    # Synonym dict
    synonyms = wordnet.synsets(random_token)
    
    # when arent any synonyms for the token, then dont modify the text
    if len(synonyms) == 0:
        return text, label
    else:
        # Select a random synonym of the selected token
        random_synonym = random.choice(synonyms).lemmas()[0].name()
        # Replace the selected token with its synonym in the text
        new_text = text.replace(random_token, random_synonym)
        return new_text, label

In [26]:
# Initialize an empty list to store the additional instances
additional_instances = []

# Iterate through all the underrepresented classes
for label in tqdm(underrepresented_classes):
    
    # Determine the number of instances to generate for this class
    num_to_generate = threshold - class_counts[label]
    
    # Generate the new instances
    for i in range(num_to_generate):
        # select random text from the underrepresented class
        text = df_over_4000[df_over_4000["category"] == label]["text"].sample(n=1).values[0]
        # generate a new instance and label
        new_text, new_label = generate_new_instance(text, label)
        # append the new instance and label to the additional_instances list
        additional_instances.append((new_text, new_label))

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [10:25<00:00, 56.88s/it]


In [27]:
# Append the additional instances to the original dataset
df_aug = df_over_4000.append(pd.DataFrame(additional_instances, columns=["text", "category"]), ignore_index=True)

In [32]:
count_category_frequency(df_aug, "category")

There are 14 categories
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
COMEDY            10000
PARENTING         10000
SPORTS            10000
BUSINESS          10000
STYLE & BEAUTY    10000
FOOD & DRINK      10000
QUEER VOICES      10000
HOME & LIVING     10000
BLACK VOICES      10000
TRAVEL            10000
HEALTHY LIVING    10000
Name: category, dtype: int64


In [34]:
print(f"Number of rows previous {len(df_over_4000)}")
print(f"Number of rows after {len(df_aug)}")
print(f"In total we augmented {len(df_aug) - len(df_over_4000)} rows")

Number of rows previous 144167
Number of rows after 180909
In total we augmented 36742 rows


While inspecting the process, there are some disadvantages from data augmentation which could affect the model performance:
- **grammatical errors**: ...make things worse people started pitying -> ...make things bad people started pitying
- **nonsense**: rory mcilroy pulls olympics... -> rory mcilroy puff olympics...

But those disadvantages are minimal so its fine for our use case

In [41]:
# Run the line below to save the dataframe after preprocessing
# df_aug.to_csv("df_aug.csv", index=False, mode="wb")

## Linear model

In [70]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [44]:
X = df_aug["text"]
y = df_aug["category"]

In [45]:
# Splitting data into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2 ,random_state = 123)

First model we try is a Naives Bayer Classifier. It is based on the assumption of independence between features, which makes it a "naive" classifier

In [46]:
# source: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

- **The first step vect** uses the CountVectorizer class to convert the training data (X_train) into a sparse matrix of token counts. This step is also known as text vectorization. Text vectorization is the process of converting raw text data into a numerical format
- **The second step tfidf** uses the TfidfTransformer class to transform the token counts into the Tf-idf representation of the data. Tf-idf (term frequency-inverse document frequency) is a measure of the importance of a word in a document, with respect to an entire corpus of documents.
- **The final step clf**  uses the MultinomialNB class on the Tf-idf representation of the data

In [50]:
y_pred = nb.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6089492012602952
                precision    recall  f1-score   support

  BLACK VOICES       0.92      0.22      0.36      1990
      BUSINESS       0.94      0.21      0.34      2014
        COMEDY       0.87      0.24      0.37      1971
 ENTERTAINMENT       0.61      0.75      0.67      3582
  FOOD & DRINK       0.87      0.74      0.80      1995
HEALTHY LIVING       0.95      0.04      0.07      2022
 HOME & LIVING       0.91      0.75      0.82      1991
     PARENTING       0.84      0.35      0.49      1993
      POLITICS       0.48      0.98      0.64      7000
  QUEER VOICES       0.97      0.35      0.51      2003
        SPORTS       0.91      0.56      0.69      2006
STYLE & BEAUTY       0.89      0.66      0.76      2011
        TRAVEL       0.89      0.56      0.69      2014
      WELLNESS       0.45      0.87      0.59      3590

      accuracy                           0.61     36182
     macro avg       0.82      0.52      0.56     36182
  weighted avg    

**Average accuracy is 0.61**! Since we have 14 categories, **the baseline would be 1 / 14 = 0.07**, when the model just randomly guesses! So the model itself is pretty good! Furthermore categories like Entertainment (0.67), Food & Drink (0.80) and Home & Living (0.82) seems easy to recognize. Let try to improve the model with hyperparameter tuning!

In [51]:
# Display hyperparameters that we can change
nb.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', MultinomialNB())],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'clf': MultinomialNB(),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__alpha': 1.0,
 'clf__class_prior': None,
 'clf__fit_prior': True}

In [52]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

# GridSearchCV performs an exhaustive search over all possible combinations
# This is much longer than RandomizedSearchCV
gs_clf = GridSearchCV(nb, parameters, cv=5)
gs_clf = gs_clf.fit(X_train, y_train)

- **vect__ngram_range**: Controls the range of n-grams that the vectorizer should consider
- **tfidf__use_idf**: Controls whether or not the Tf-idf transformer should use the idf weighting scheme.
- **clf__alpha**: Controls the regularization strength of the Naive Bayes classifier. Regularization is a technique used to prevent overfitting by adding a penalty term to the loss function.

In [53]:
y_pred = gs_clf.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.8400585926703886
                precision    recall  f1-score   support

  BLACK VOICES       0.89      0.87      0.88      1990
      BUSINESS       0.90      0.80      0.85      2014
        COMEDY       0.86      0.80      0.83      1971
 ENTERTAINMENT       0.81      0.79      0.80      3582
  FOOD & DRINK       0.89      0.91      0.90      1995
HEALTHY LIVING       0.84      0.58      0.68      2022
 HOME & LIVING       0.92      0.95      0.94      1991
     PARENTING       0.79      0.72      0.75      1993
      POLITICS       0.85      0.93      0.89      7000
  QUEER VOICES       0.91      0.84      0.87      2003
        SPORTS       0.93      0.93      0.93      2006
STYLE & BEAUTY       0.88      0.82      0.85      2011
        TRAVEL       0.85      0.80      0.82      2014
      WELLNESS       0.66      0.85      0.75      3590

      accuracy                           0.84     36182
     macro avg       0.86      0.83      0.84     36182
  weighted avg    

An average accuracy of 0.84 is pretty good! There is an accuracy of at least 0.75 for each category

In [69]:
# Test the model by passing a chosen sentence
test_sentence = "The Funniest Tweets From Parents This Week"
test_sentence_process = process_text(test_sentence)
category_pred = gs_clf.predict([test_sentence_process])

print("Original sentence", test_sentence)
print("Test sentence after preprocessing:", test_sentence_process)
print("Predicted category:", category_pred[0])

Original sentence The Funniest Tweets From Parents This Week
Test sentence after preprocessing: funniest tweets parents week
Predicted category: PARENTING


The sentence wasnt randomly chosen. It could belong to the category parenting but **it would fit much better to comedy**. The model probably see the word parent and automatically assign it to "parenting" category. This is a weakness of Naives Bayes.

In [65]:
# Run the code below to export the model
# with open("naiveBayes.pkl", "wb") as f:
#     pickle.dump(gs_clf.best_estimator_, f)

Next linear model we will try is **Linear Support Vector Machine** . Linear Support Vector Machine (SVM) is a model that finds the best boundary to separate different classes in the feature space, by maximizing the distance between the boundary and the closest data points from each class. The SGDClassifier is a linear classifier that can also be used as a linear SVM by setting the loss parameter to "hinge" and the penalty parameter to "l2" and it learns from training data incrementally using stochastic gradient descent optimization.

In [72]:
# source: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

In [73]:
y_pred = sgd.predict(X_test)

print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6865844895251783
                precision    recall  f1-score   support

  BLACK VOICES       0.71      0.47      0.56      1990
      BUSINESS       0.76      0.48      0.59      2014
        COMEDY       0.68      0.37      0.48      1971
 ENTERTAINMENT       0.73      0.60      0.66      3582
  FOOD & DRINK       0.72      0.83      0.77      1995
HEALTHY LIVING       0.69      0.12      0.20      2022
 HOME & LIVING       0.73      0.82      0.77      1991
     PARENTING       0.69      0.68      0.68      1993
      POLITICS       0.62      0.94      0.75      7000
  QUEER VOICES       0.81      0.72      0.77      2003
        SPORTS       0.78      0.77      0.77      2006
STYLE & BEAUTY       0.71      0.79      0.75      2011
        TRAVEL       0.77      0.72      0.74      2014
      WELLNESS       0.62      0.72      0.66      3590

      accuracy                           0.69     36182
     macro avg       0.72      0.64      0.65     36182
  weighted avg    

**Average score is 0.69** which is 8% better than Naive Bayes without hyperparameter tuning. Let apply hyperparameter tuning to increase the accuracy.

In [74]:
# Display hyperparameters that we can change
sgd.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None))],
 'verbose': False,
 'vect': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'clf': SGDClassifier(alpha=0.001, max_iter=5, random_state=42, tol=None),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__alpha': 0.001,
 'clf__average': False,
 'clf__class_weight': None,
 'clf__early_stoppin

In [75]:
# Define the parameter grid for the SGDClassifier
param_grid = {'clf__alpha': [0.001, 0.01, 0.1],
              'clf__max_iter': [5, 10, 15],
              'clf__tol': [None, 1e-3, 1e-4]
             }

# Create the GridSearchCV object
gs_sgd = GridSearchCV(sgd, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the training data
gs_sgd.fit(X_train, y_train)

- **clf_alpha**: Controls the regularization strength. Regularization is a technique that helps to prevent overfitting by adding a penalty term to the loss function that the model is trying to minimize. The alpha parameter determines the strength of this penalty term. A smaller alpha means a stronger regularization, and a larger alpha means a weaker regularization. By setting alpha to different values, you can control the trade-off between fitting the training data well and preventing overfitting.
- **clf_max_iter**: Maximum number of iteration before the model stops.
- **clf_tol**: is the tolerance for stopping criterion. It is used to control the stopping criteria of the optimizer. When the loss or score is not improving by at least tol for n_iter_no_change consecutive iterations, the training process is stopped.

In [79]:
print("Best set of parameters: ", gs_sgd.best_params_)

Best set of parameters:  {'clf__alpha': 0.001, 'clf__max_iter': 10, 'clf__tol': 0.001}


In [76]:
y_pred = gs_sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6859764523796363
                precision    recall  f1-score   support

  BLACK VOICES       0.71      0.48      0.57      1990
      BUSINESS       0.76      0.48      0.59      2014
        COMEDY       0.69      0.37      0.48      1971
 ENTERTAINMENT       0.74      0.59      0.66      3582
  FOOD & DRINK       0.72      0.83      0.77      1995
HEALTHY LIVING       0.68      0.12      0.20      2022
 HOME & LIVING       0.72      0.82      0.77      1991
     PARENTING       0.69      0.68      0.68      1993
      POLITICS       0.61      0.94      0.74      7000
  QUEER VOICES       0.82      0.72      0.77      2003
        SPORTS       0.78      0.77      0.77      2006
STYLE & BEAUTY       0.72      0.79      0.75      2011
        TRAVEL       0.77      0.71      0.74      2014
      WELLNESS       0.61      0.73      0.66      3590

      accuracy                           0.69     36182
     macro avg       0.72      0.64      0.65     36182
  weighted avg    

**No improvement at all**! The default parameters are already the best. The only improvement is on software engineering site and the model is only 13mb big. Naive bayes before is 342mb big.

In [81]:
# Run the code below to export the model
# with open("linearSVM.pkl", "wb") as f:
#     pickle.dump(gs_sgd.best_estimator_, f)

In [84]:
# Let test the model
# Test the model by passing a chosen sentence
test_sentence = "The Funniest Tweets From Parents This Week"
test_sentence_process = process_text(test_sentence)
category_pred = gs_sgd.predict([test_sentence_process])

print("Original sentence", test_sentence)
print("Test sentence after preprocessing:", test_sentence_process)
print("Predicted category:", category_pred[0])

Original sentence The Funniest Tweets From Parents This Week
Test sentence after preprocessing: funniest tweets parents week
Predicted category: PARENTING


The model didn't make an improvement either

### Try CNN

In [None]:
# Converts a collection of raw documents to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
# Tokenize the text into sequences of integers
tokenizer = Tokenizer()

In [None]:
# Get max length to calculate pad sequences later
max_len = df_filtered["text"].apply(lambda x:len(x.split())).max()
max_len

Why do we need pad sequences?
<br>Pad sequences is used to ensure that all input sequences have the same length. This is important because many machine learning models, such as neural networks, expect input data to have a fixed size. If the input sequences have different lengths, they need to be padded or truncated so that they all have the same length.

In [None]:
def TF_IDF_ML(X,y):
    # Fit the tokenizer on the input text data
    tokenizer.fit_on_texts(X)
    # Create a mapping of words to their corresponding index in the vocabulary
    word_index = tokenizer.word_index
    # Determine the size of the vocabulary by adding 1 to the length of the word index
    vocab_size = len(tokenizer.word_index) + 1
    
    # Split the input data into train and test sets, with 30% of the data being used for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    
    # Pad the sequences of integers obtained by tokenizing the text to the same length
    X_train = pad_sequences(tokenizer.texts_to_sequences(X_train),
                        maxlen = max_len)
    X_test = pad_sequences(tokenizer.texts_to_sequences(X_test),
                       maxlen = max_len)
    
    # Return the train and test sets for both the features and labels
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = TF_IDF_ML(df_filtered["text"], df_filtered["category"])

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier()

In [None]:
model.fit(X_train,y_train)

In [None]:
print(classification_report(y_test,model.predict(X_test)))

In [None]:
# Tokenize and pad the input string
input_string = ["The Funniest Tweets From Parents This Week"]
input_string = pad_sequences(tokenizer.texts_to_sequences(input_string), maxlen = max_len)

In [None]:
print(model.predict(input_string))

In [None]:
# Save the tokenizer and model to a file
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open("model.pickle", "wb") as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)