Importing the [Dataset](https://www.kaggle.com/datasets/rmisra/news-category-dataset)

In [None]:
!unzip "/content/News_Category_Dataset_v3.json.zip"

Archive:  /content/News_Category_Dataset_v3.json.zip
  inflating: News_Category_Dataset_v3.json  


Installing the necessary packages

In [None]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/163.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m143.4/163.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-trans

Importing required packages

In [1]:
import pickle
import re

import pandas as pd
import jsonlines
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

Loading the dataset

In [2]:
# Parse every JSON-lines record while the file is open, then build the
# DataFrame from the resulting list of dicts and preview the first rows.
with jsonlines.open('/content/News_Category_Dataset_v3.json') as json_reader:
  df = pd.DataFrame(list(json_reader))
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
# Number of articles per category, most frequent first (the defaults, spelled out).
df['category'].value_counts(sort=True, ascending=False)

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [None]:
# (number of rows, number of columns) of the full dataset
len(df.index), len(df.columns)

(209527, 6)

Selecting only the three largest categories so that there are fewer rows and training is quicker

In [4]:
# Keep only the text and label columns for the three chosen categories and
# drop rows with a missing value, as one chain that yields a fresh frame.
df_data = (
    df.loc[
        df['category'].isin(['POLITICS', 'WELLNESS', 'ENTERTAINMENT']),
        ['headline', 'category'],
    ]
    .dropna()
)
df_data.shape

(70909, 2)

In [5]:
# Pull the text and label columns out as two parallel Python lists (same row order).
headlines, categories = (
    df_data[column].tolist() for column in ('headline', 'category')
)

Basic text cleaning

In [6]:
# Matches every run of characters other than ASCII letters, digits, spaces and periods.
_DISALLOWED_CHARS = re.compile(r'[^A-Za-z0-9 .]+')


def text_clean(text):
  """Return `text` with all characters except A-Z, a-z, 0-9, space and '.' removed.

  Removed characters are deleted outright (not replaced by a space), so words
  joined by punctuation such as a hyphen or apostrophe are merged.
  """
  return _DISALLOWED_CHARS.sub('', text)

# Clean every headline, keeping the original order.
headlines = list(map(text_clean, headlines))

Using a BERT-based SentenceTransformer model to convert each headline into a sentence embedding

In [7]:
# Load the pretrained sentence-embedding model. No device is hard-coded: when
# `device` is omitted, SentenceTransformer uses a GPU if one is available and
# otherwise falls back to the CPU, so this cell also runs on CPU-only runtimes.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode every cleaned headline into a fixed-length vector (one row per headline).
headlineEmbeddings = model.encode(headlines)

# Inspect the embedding of the first headline.
headlineEmbeddings[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


array([-7.58076966e-01,  1.19756505e-01,  1.53652668e+00,  1.94916278e-01,
        5.19288599e-01, -1.04765809e+00,  8.65096867e-01,  1.07103162e-01,
       -6.53860569e-01, -1.72367081e-01,  2.32030600e-01, -1.11791289e+00,
        1.00537109e+00,  3.97259623e-01, -5.63177705e-01,  4.29334305e-02,
       -5.17203271e-01,  2.84594476e-01,  4.21533138e-01, -1.01046181e+00,
        2.85030715e-02, -2.22282141e-01,  8.09958298e-03, -9.39288735e-02,
        1.06959248e+00, -7.56412983e-01, -4.23515558e-01, -1.18424380e+00,
       -3.83749902e-01,  2.52491515e-02, -9.02195871e-01,  3.69470596e-01,
       -5.92041016e-01, -1.92641139e-01,  3.77276868e-01,  2.82131881e-01,
       -1.33229816e+00,  4.60853785e-01,  4.14373577e-02, -1.94462821e-01,
        2.76312739e-01, -6.49961114e-01, -8.10949728e-02, -6.24468923e-01,
       -1.77573609e+00, -8.08284134e-02,  3.97548050e-01,  4.34037149e-01,
        1.08887470e+00, -1.66679752e+00, -1.05264151e+00,  4.38503206e-01,
        9.69271839e-01, -

In [8]:
# (number of headlines, embedding dimension) of the 2-D embedding array
headlineEmbeddings.shape

(70909, 768)

Train and test split

In [9]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    headlineEmbeddings,
    categories,
    test_size=0.2,
    random_state=42,
)

Training a basic Random Forest Classifier

In [None]:
# Train a 100-tree Random Forest on the headline embeddings.
# n_jobs=-1 builds the trees (and predicts) on all available CPU cores, which
# shortens training on the high-dimensional embeddings; random_state keeps
# the fitted forest reproducible.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train, y_train)

# Predicted category for every held-out headline.
y_pred = rf_classifier.predict(X_test)

In [12]:
# Per-class precision / recall / F1 on the held-out set. The report is a
# multi-line string, so it is printed rather than shown as a repr.
report_text = classification_report(y_test, y_pred)
print(report_text)

               precision    recall  f1-score   support

ENTERTAINMENT       0.84      0.72      0.78      3463
     POLITICS       0.86      0.92      0.89      7177
     WELLNESS       0.84      0.84      0.84      3542

     accuracy                           0.85     14182
    macro avg       0.85      0.83      0.84     14182
 weighted avg       0.85      0.85      0.85     14182



Saving the trained model to a pickle file so that we can load it later for predictions.

In [13]:
# Persist the trained Random Forest classifier. The object to save is
# `rf_classifier`; `model` is the SentenceTransformer encoder, not the classifier.
# NOTE: only unpickle this file from a trusted source, since loading a pickle
# can execute arbitrary code.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_classifier, file)