## News Category Classification

### Import Libraries

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import spacy
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

### Initialization

In [3]:
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm")
# Get the stop words list
stop_words = nlp.Defaults.stop_words
# Get the punctuation list
punctuation_list = list(string.punctuation)

#### Dataset 

Dataset Link: https://www.kaggle.com/code/hengzheng/news-category-classifier-val-acc-0-65

In [45]:
# Load the json file
df = pd.read_json(r"news_category.json", lines=True)

df_original = df

# Illustrate the top 5 column
df.head(5)

Unnamed: 0,short_description,headline,date,link,authors,category
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,2018-05-26,https://www.huffingtonpost.com/entry/will-smit...,Andy McDonald,ENTERTAINMENT
2,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 57,2018-05-26,https://www.huffingtonpost.com/entry/hugh-gran...,Ron Dicker,ENTERTAINMENT
3,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,2018-05-26,https://www.huffingtonpost.com/entry/jim-carre...,Ron Dicker,ENTERTAINMENT
4,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,2018-05-26,https://www.huffingtonpost.com/entry/julianna-...,Ron Dicker,ENTERTAINMENT


In [10]:
# Count the each category
df.category.value_counts()

category
POLITICS          32739
ENTERTAINMENT     14257
HEALTHY LIVING     6694
QUEER VOICES       4995
BUSINESS           4254
SPORTS             4167
COMEDY             3971
PARENTS            3955
BLACK VOICES       3858
THE WORLDPOST      3664
WOMEN              3490
CRIME              2893
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
IMPACT             2602
WORLDPOST          2579
RELIGION           2556
STYLE              2254
WORLD NEWS         2177
TRAVEL             2145
TASTE              2096
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
SCIENCE            1381
ARTS & CULTURE     1339
TECH               1231
COLLEGE            1144
LATINO VOICES      1129
EDUCATION          1004
Name: count, dtype: int64

### Preprocessing

In [41]:
# Get unique label names from category
unique_labels = df.category.unique()
# Define the numerical labels for each category
numerical_labels = [x for x in range(len(unique_labels))]
# Convert text category into numerical category
numerical_target_dict = dict(zip(unique_labels, numerical_labels))
print("Numerical Labels: ", numerical_target_dict)

# Add the new numerical label into the dataframe
df['category_label'] = df['category'].map(numerical_target_dict)

Numerical Labels:  {'CRIME': 0, 'ENTERTAINMENT': 1, 'WORLD NEWS': 2, 'IMPACT': 3, 'POLITICS': 4, 'WEIRD NEWS': 5, 'BLACK VOICES': 6, 'WOMEN': 7, 'COMEDY': 8, 'QUEER VOICES': 9, 'SPORTS': 10, 'BUSINESS': 11, 'TRAVEL': 12, 'MEDIA': 13, 'TECH': 14, 'RELIGION': 15, 'SCIENCE': 16, 'LATINO VOICES': 17, 'EDUCATION': 18, 'COLLEGE': 19, 'PARENTS': 20, 'ARTS & CULTURE': 21, 'STYLE': 22, 'GREEN': 23, 'TASTE': 24, 'HEALTHY LIVING': 25, 'THE WORLDPOST': 26, 'GOOD NEWS': 27, 'WORLDPOST': 28, 'FIFTY': 29, 'ARTS': 30}


In [19]:
# Remove the redundant words from dataset
df['headline_adjusted'] = df['headline'].apply(lambda x: x.split())
# Remove the stop words
df['headline_adjusted'] = df['headline_adjusted'].apply(lambda x: [word for word in x if word.lower() not in stop_words])
# Remove the punctuations 
df['headline_adjusted'] = df['headline_adjusted'].apply(lambda x: [word for word in x if word.lower() not in punctuation_list])
df['headline_adjusted'] = df['headline_adjusted'].apply(' '.join)

df.head(5)

Unnamed: 0,short_description,headline,date,link,authors,category,category_label,headline_adjusted
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME,0,"2 Mass Shootings Texas Week, 1 TV"
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,2018-05-26,https://www.huffingtonpost.com/entry/will-smit...,Andy McDonald,ENTERTAINMENT,1,Smith Joins Diplo Nicky Jam 2018 World Cup's O...
2,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 57,2018-05-26,https://www.huffingtonpost.com/entry/hugh-gran...,Ron Dicker,ENTERTAINMENT,1,Hugh Grant Marries Time Age 57
3,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,2018-05-26,https://www.huffingtonpost.com/entry/jim-carre...,Ron Dicker,ENTERTAINMENT,1,Jim Carrey Blasts 'Castrato' Adam Schiff Democ...
4,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,2018-05-26,https://www.huffingtonpost.com/entry/julianna-...,Ron Dicker,ENTERTAINMENT,1,Julianna Margulies Uses Donald Trump Poop Bags...


### Training wo/Sampling

#### 1-Gram

In [28]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df["headline_adjusted"], df['category_label'], test_size=0.2, random_state=331, stratify=df.category_label)

# Create pipeline
textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,1))),
('mnb', MultinomialNB(alpha =1.0))
])

# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.56      0.54       579
           1       0.48      0.84      0.61      2851
           2       0.49      0.09      0.16       435
           3       0.38      0.10      0.15       520
           4       0.56      0.93      0.69      6548
           5       0.42      0.21      0.28       534
           6       0.52      0.26      0.34       772
           7       0.51      0.20      0.28       698
           8       0.65      0.26      0.37       794
           9       0.66      0.56      0.61       999
          10       0.72      0.58      0.64       833
          11       0.55      0.35      0.43       851
          12       0.76      0.32      0.45       429
          13       0.72      0.25      0.37       563
          14       0.71      0.11      0.19       246
          15       0.74      0.28      0.41       511
          16       0.83      0.22      0.35       276
          17       0.80    

#### 2-Gram

In [29]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df["headline_adjusted"], df['category_label'], test_size=0.2, random_state=331, stratify=df.category_label)

# Create pipeline
textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
('mnb', MultinomialNB(alpha =1.0))
])

# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.30      0.42       579
           1       0.45      0.84      0.59      2851
           2       0.57      0.04      0.07       435
           3       0.57      0.02      0.04       520
           4       0.41      0.98      0.58      6548
           5       0.58      0.07      0.12       534
           6       0.69      0.11      0.19       772
           7       0.75      0.08      0.14       698
           8       0.83      0.11      0.20       794
           9       0.84      0.36      0.51       999
          10       0.81      0.38      0.52       833
          11       0.78      0.15      0.25       851
          12       0.83      0.15      0.26       429
          13       0.80      0.09      0.15       563
          14       0.60      0.02      0.05       246
          15       0.81      0.12      0.20       511
          16       0.88      0.05      0.10       276
          17       1.00    

#### 3-Gram

In [30]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df["headline_adjusted"], df['category_label'], test_size=0.2, random_state=331, stratify=df.category_label)

# Create pipeline
textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
('mnb', MultinomialNB(alpha =1.0))
])

# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.26      0.37       579
           1       0.46      0.83      0.59      2851
           2       0.56      0.03      0.06       435
           3       0.63      0.02      0.04       520
           4       0.39      0.99      0.56      6548
           5       0.67      0.05      0.10       534
           6       0.68      0.09      0.16       772
           7       0.79      0.07      0.12       698
           8       0.82      0.10      0.18       794
           9       0.86      0.33      0.48       999
          10       0.83      0.33      0.48       833
          11       0.80      0.12      0.21       851
          12       0.84      0.14      0.25       429
          13       0.87      0.08      0.15       563
          14       0.62      0.02      0.04       246
          15       0.84      0.10      0.18       511
          16       0.78      0.05      0.10       276
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Training w/Oversampling

#### 1-Gram

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df["headline_adjusted"], df['category_label'], test_size=0.2, random_state=331, stratify=df.category_label)

textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,1))),
('tfidf', TfidfTransformer()),
('rndsmplr', RandomOverSampler()),
('mnb', MultinomialNB(alpha =1.0))
])
# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.42      0.68      0.52       579
           1       0.76      0.57      0.65      2851
           2       0.25      0.35      0.29       435
           3       0.23      0.31      0.26       520
           4       0.89      0.57      0.69      6548
           5       0.31      0.33      0.32       534
           6       0.40      0.45      0.42       772
           7       0.35      0.35      0.35       698
           8       0.38      0.45      0.41       794
           9       0.63      0.62      0.62       999
          10       0.67      0.69      0.68       833
          11       0.42      0.44      0.43       851
          12       0.49      0.57      0.53       429
          13       0.39      0.59      0.47       563
          14       0.28      0.46      0.35       246
          15       0.46      0.55      0.50       511
          16       0.35      0.43      0.38       276
          17       0.27    

#### 2-Gram

In [35]:
X_train, X_test, y_train, y_test = train_test_split(df["headline_adjusted"], df['category_label'], test_size=0.2, random_state=331, stratify=df.category_label)

textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
('tfidf', TfidfTransformer()),
('rndsmplr', RandomOverSampler()),
('mnb', MultinomialNB(alpha =1.0))
])
# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.41      0.70      0.52       579
           1       0.78      0.60      0.68      2851
           2       0.27      0.37      0.31       435
           3       0.26      0.31      0.29       520
           4       0.89      0.59      0.71      6548
           5       0.32      0.33      0.32       534
           6       0.44      0.51      0.47       772
           7       0.37      0.36      0.36       698
           8       0.39      0.47      0.43       794
           9       0.65      0.69      0.67       999
          10       0.68      0.72      0.70       833
          11       0.48      0.44      0.46       851
          12       0.50      0.58      0.54       429
          13       0.40      0.60      0.48       563
          14       0.31      0.50      0.39       246
          15       0.47      0.52      0.49       511
          16       0.36      0.44      0.40       276
          17       0.25    

#### 3-Gram

In [36]:
X_train, X_test, y_train, y_test = train_test_split(df["headline_adjusted"], df['category_label'], test_size=0.2, random_state=331, stratify=df.category_label)

textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
('tfidf', TfidfTransformer()),
('rndsmplr', RandomOverSampler()),
('mnb', MultinomialNB(alpha =1.0))
])
# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.42      0.71      0.53       579
           1       0.79      0.60      0.68      2851
           2       0.26      0.38      0.31       435
           3       0.26      0.31      0.29       520
           4       0.89      0.59      0.71      6548
           5       0.33      0.34      0.33       534
           6       0.44      0.51      0.47       772
           7       0.36      0.36      0.36       698
           8       0.39      0.47      0.42       794
           9       0.65      0.68      0.66       999
          10       0.69      0.73      0.71       833
          11       0.48      0.44      0.46       851
          12       0.51      0.59      0.54       429
          13       0.39      0.62      0.48       563
          14       0.30      0.49      0.37       246
          15       0.48      0.53      0.50       511
          16       0.36      0.46      0.40       276
          17       0.27    

### Training w/Undersampling

In [51]:
# Create empty dataframe
df_resampled = pd.DataFrame()
# Set minimum number of sample
min_samples = 1004 


for label in unique_labels:
    df_new = df[df.category==label].sample(min_samples, random_state=331)
    df_resampled = pd.concat([df_resampled,df_new],axis=0)

# Convert text category into numerical category
numerical_target_dict = dict(zip(unique_labels, numerical_labels))

# Add the new numerical label into the dataframe
df_resampled['category_label'] = df_resampled['category'].map(numerical_target_dict)

df_resampled['headline_adjusted'] = df_resampled['headline'].apply(lambda x: x.split())
# Remove the stop words
df_resampled['headline_adjusted'] = df_resampled['headline_adjusted'].apply(lambda x: [word for word in x if word.lower() not in stop_words])
# Remove the punctuations 
df_resampled['headline_adjusted'] = df_resampled['headline_adjusted'].apply(lambda x: [word for word in x if word.lower() not in punctuation_list])
df_resampled['headline_adjusted'] = df_resampled['headline_adjusted'].apply(' '.join)

df_resampled.head(5)

Unnamed: 0,short_description,headline,date,link,authors,category,category_label,headline_adjusted
92276,,Prisoner In Van Said Freddie Gray Was ‘Banging...,2015-04-30,https://www.huffingtonpost.com/entry/prisoner-...,,CRIME,0,Prisoner Van Said Freddie Gray ‘Banging Walls'...
56315,“The one thing that we can say is that this is...,New Details Emerge About Deadliest Mass Shooti...,2016-06-12,https://www.huffingtonpost.com/entry/terror-sh...,"Sebastian Murdock, Andy Campbell, Roque Planas...",CRIME,0,New Details Emerge Deadliest Mass Shooting U.S...
39620,The child was sitting in the back of his grand...,"Road-Rage Shooting Leaves 3-Year-Old Boy Dead,...",2016-12-18,https://www.huffingtonpost.com/entry/toddler-k...,Nina Golgowski,CRIME,0,"Road-Rage Shooting Leaves 3-Year-Old Boy Dead,..."
100579,,1-Month-Old's Face Mauled By Ferrets In Philad...,2015-01-24,https://www.huffingtonpost.com/entry/ferrets-a...,,CRIME,0,1-Month-Old's Face Mauled Ferrets Philadelphia...
114843,"American prisons foster a culture of violence,...",The Dirt Wars: An Intimate Look at Convict Cul...,2014-08-13,https://www.huffingtonpost.com/entry/the-dirt-...,"Christopher Zoukis, ContributorAuthor, Federal...",CRIME,0,Dirt Wars: Intimate Look Convict Culture Ameri...


#### 1-Gram

In [55]:
X_train, X_test, y_train, y_test = train_test_split(df_resampled["headline_adjusted"], df_resampled['category_label'], test_size=0.2, random_state=42, stratify=df_resampled.category_label)

textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,1))),
('mnb', MultinomialNB(alpha =1.0))
])
# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.44      0.74      0.55       201
           1       0.40      0.39      0.39       201
           2       0.42      0.44      0.43       201
           3       0.24      0.19      0.21       201
           4       0.38      0.47      0.42       201
           5       0.39      0.27      0.32       201
           6       0.51      0.43      0.47       201
           7       0.37      0.35      0.36       201
           8       0.42      0.50      0.46       200
           9       0.54      0.58      0.56       200
          10       0.61      0.60      0.61       201
          11       0.45      0.37      0.40       201
          12       0.52      0.52      0.52       201
          13       0.52      0.55      0.53       201
          14       0.52      0.64      0.57       201
          15       0.62      0.47      0.54       200
          16       0.52      0.47      0.50       201
          17       0.63    

#### 2-Gram

In [56]:
X_train, X_test, y_train, y_test = train_test_split(df_resampled["headline_adjusted"], df_resampled['category_label'], test_size=0.2, random_state=42, stratify=df_resampled.category_label)

textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
('mnb', MultinomialNB(alpha =1.0))
])
# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.73      0.54       201
           1       0.41      0.43      0.42       201
           2       0.43      0.47      0.45       201
           3       0.24      0.18      0.21       201
           4       0.37      0.49      0.42       201
           5       0.37      0.26      0.31       201
           6       0.53      0.50      0.52       201
           7       0.37      0.33      0.35       201
           8       0.43      0.50      0.46       200
           9       0.55      0.59      0.57       200
          10       0.62      0.64      0.63       201
          11       0.43      0.35      0.39       201
          12       0.52      0.49      0.50       201
          13       0.51      0.59      0.55       201
          14       0.52      0.65      0.58       201
          15       0.66      0.48      0.56       200
          16       0.56      0.47      0.51       201
          17       0.61    

#### 3-Gram

In [58]:
X_train, X_test, y_train, y_test = train_test_split(df_resampled["headline_adjusted"], df_resampled['category_label'], test_size=0.2, random_state=42, stratify=df_resampled.category_label)

textclassifier =Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
('mnb', MultinomialNB(alpha =1.0))
])

# Start the training
textclassifier.fit(X_train, y_train)

# Test the classifier
y_pred = textclassifier.predict(X_test)

# Print the result
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.39      0.72      0.51       201
           1       0.41      0.41      0.41       201
           2       0.41      0.46      0.43       201
           3       0.25      0.18      0.21       201
           4       0.37      0.49      0.42       201
           5       0.39      0.25      0.30       201
           6       0.54      0.49      0.51       201
           7       0.37      0.34      0.36       201
           8       0.43      0.50      0.46       200
           9       0.53      0.60      0.56       200
          10       0.62      0.64      0.63       201
          11       0.46      0.35      0.40       201
          12       0.53      0.50      0.52       201
          13       0.51      0.60      0.55       201
          14       0.52      0.66      0.58       201
          15       0.65      0.48      0.55       200
          16       0.54      0.47      0.51       201
          17       0.60    