# Model Development
We want to compare different models

In [39]:
# Built-in
import re
import pickle

# Data science utils
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report

# NLP
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Import data
df = pd.read_json("../data/News_Category_Dataset_v3.json", orient = "records", lines = True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
df_filtered = df[["headline", "category", "short_description"]].copy()
df_filtered.loc[:, "text"] = df_filtered["headline"] + df_filtered["short_description"]
df_filtered.drop(["headline","short_description"],axis=1,inplace=True)
df_filtered.head()

Unnamed: 0,category,text
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li..."
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
3,PARENTING,The Funniest Tweets From Parents This Week (Se...
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...


In [4]:
print("Number of unique categories", df["category"].nunique())

Number of unique categories 42


In [5]:
# preprocess text (removing stopwords and tokenizing)
def process_text(text):
    # convert text to lowercase, remove newlines and carriage returns, and strip leading/trailing whitespace
    text = text.lower().replace('\n',' ').replace('\r','').strip()
    # replace multiple spaces with single space
    text = re.sub(' +', ' ', text)
    # remove non-alphanumeric characters and digits
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'[0-9]','',text)
    # create set of english stopwords
    stop_words = set(stopwords.words('english')) 
    # tokenize text into words
    word_tokens = word_tokenize(text)
    # if word not in stops_words, add word to filtered_sentence
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
    
    text = " ".join(filtered_sentence)
    return text

In [6]:
df_filtered["text"] = df_filtered["text"].apply(lambda x:process_text(x))

In [7]:
# Converts a collection of raw documents to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
# Tokenize the text into sequences of integers
tokenizer = Tokenizer()

In [8]:
# Get max length to calculate pad sequences later
max_len = df_filtered["text"].apply(lambda x:len(x.split())).max()
max_len

140

Why do we need pad sequences?
<br>Pad sequences is used to ensure that all input sequences have the same length. This is important because many machine learning models, such as neural networks, expect input data to have a fixed size. If the input sequences have different lengths, they need to be padded or truncated so that they all have the same length.

In [25]:
def TF_IDF_ML(X,y):
    # Fit the tokenizer on the input text data
    tokenizer.fit_on_texts(X)
    # Create a mapping of words to their corresponding index in the vocabulary
    word_index = tokenizer.word_index
    # Determine the size of the vocabulary by adding 1 to the length of the word index
    vocab_size = len(tokenizer.word_index) + 1
    
    # Split the input data into train and test sets, with 30% of the data being used for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    
    # Pad the sequences of integers obtained by tokenizing the text to the same length
    X_train = pad_sequences(tokenizer.texts_to_sequences(X_train),
                        maxlen = max_len)
    X_test = pad_sequences(tokenizer.texts_to_sequences(X_test),
                       maxlen = max_len)
    
    # Return the train and test sets for both the features and labels
    return X_train, X_test, y_train, y_test

In [26]:
X_train, X_test, y_train, y_test = TF_IDF_ML(df_filtered["text"], df_filtered["category"])

In [28]:
from sklearn.tree import DecisionTreeClassifier

In [30]:
model = DecisionTreeClassifier()

In [31]:
model.fit(X_train,y_train)

In [33]:
print(classification_report(y_test,model.predict(X_test)))

                precision    recall  f1-score   support

          ARTS       0.04      0.03      0.03       438
ARTS & CULTURE       0.03      0.03      0.03       388
  BLACK VOICES       0.03      0.03      0.03      1378
      BUSINESS       0.05      0.05      0.05      1796
       COLLEGE       0.01      0.02      0.01       297
        COMEDY       0.07      0.07      0.07      1620
         CRIME       0.04      0.04      0.04      1094
CULTURE & ARTS       0.02      0.02      0.02       309
       DIVORCE       0.06      0.06      0.06      1013
     EDUCATION       0.01      0.01      0.01       315
 ENTERTAINMENT       0.16      0.16      0.16      5148
   ENVIRONMENT       0.04      0.04      0.04       442
         FIFTY       0.04      0.04      0.04       413
  FOOD & DRINK       0.19      0.20      0.19      1896
     GOOD NEWS       0.02      0.02      0.02       402
         GREEN       0.03      0.03      0.03       781
HEALTHY LIVING       0.08      0.08      0.08  

In [36]:
# Tokenize and pad the input string
input_string = ["The Funniest Tweets From Parents This Week"]
input_string = pad_sequences(tokenizer.texts_to_sequences(input_string), maxlen = max_len)

In [38]:
print(model.predict(input_string))

['ARTS']


In [40]:
# Save the tokenizer and model to a file
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open("model.pickle", "wb") as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)