In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Read the line-delimited JSON dataset and keep only the two columns used in
# this notebook: the headline text and its category label.
df = pd.read_json(
    'News_Category_Dataset_v3.json',
    lines=True,
)[['headline', 'category']]
df.head()

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [3]:
# Class distribution across every category in the raw dataset.
df['category'].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [4]:
# Keep only the four categories the classifier is trained to tell apart.
desired_categories = ['CRIME', 'BUSINESS', 'TRAVEL', 'SPORTS']
df_new = df.loc[df['category'].isin(desired_categories)]
df_new.head()

Unnamed: 0,headline,category
17,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS
26,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS
61,Boston Marathon To Make Race More Inclusive Fo...,SPORTS
62,"Anthony Varvaro, MLB Pitcher Turned Transit Co...",SPORTS
67,Carlos Alcaraz Wins U.S. Open For 1st Slam Tit...,SPORTS


In [5]:
# Per-category row counts after filtering; the smallest class bounds the balanced sample size.
df_new['category'].value_counts()

category
TRAVEL      9900
BUSINESS    5992
SPORTS      5077
CRIME       3562
Name: count, dtype: int64

In [6]:
# Size of the smallest selected category. Derived from the data instead of being
# hard-coded (it is 3562, the CRIME count, for this dataset) so the cell keeps
# producing balanced classes if the dataset or the category list changes.
min_samples = int(df_new['category'].value_counts().min())


def sample_category(frame, category, n, random_state=2022):
    """Return a reproducible random sample of ``n`` rows of one category.

    Args:
        frame: DataFrame with a ``category`` column.
        category: Category label whose rows are sampled.
        n: Number of rows to draw (without replacement).
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        A DataFrame with ``n`` rows of ``frame`` whose category equals ``category``.
    """
    return frame[frame['category'] == category].sample(n, random_state=random_state)


# Down-sample every category to the same size; the same seed is used for each draw.
df_business = sample_category(df_new, "BUSINESS", min_samples)
df_sports = sample_category(df_new, "SPORTS", min_samples)
df_crime = sample_category(df_new, "CRIME", min_samples)
df_travel = sample_category(df_new, "TRAVEL", min_samples)

In [7]:
# Stack the four equally sized samples into one balanced frame. No index reset
# is requested, so each row keeps its label from the original dataset.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_travel],
    axis=0,
)
df_balanced['category'].value_counts()

category
BUSINESS    3562
SPORTS      3562
CRIME       3562
TRAVEL      3562
Name: count, dtype: int64

In [8]:
# Summarize the balanced frame: row count, column dtypes and non-null counts.
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14248 entries, 181516 to 155157
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  14248 non-null  object
 1   category  14248 non-null  object
dtypes: object(2)
memory usage: 333.9+ KB


In [9]:
# Peek at the first rows; they come from the first concatenated sample (BUSINESS).
df_balanced.head()

Unnamed: 0,headline,category
181516,Entrepreneurism: Lots of Little Traumas and No...,BUSINESS
58552,Tesla Just Unveiled The Quickest Car You Can A...,BUSINESS
155102,90 Percent Of Employers Tie Workers' Pay To Co...,BUSINESS
71725,One Of Ben Carson's Craziest Ideas Is Coming True,BUSINESS
80455,CEO Who Price Gouged HIV Drug Arrested For Sec...,BUSINESS


In [11]:
import spacy

# Load the `en_core_web_sm` spaCy English pipeline once; the shared `nlp` object
# is reused by preprocess_text_spacy for every headline.
nlp = spacy.load('en_core_web_sm')

def preprocess_text_spacy(text: str) -> str:
    """Normalize a headline into a space-separated string of lemmas.

    The text is lowercased, processed with the module-level spaCy pipeline
    ``nlp``, and reduced to the lemmas of the tokens that are not stop words,
    punctuation, or whitespace.

    Args:
        text: Raw headline text.

    Returns:
        The remaining lemmas joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Lowercase before tokenizing so the output vocabulary is case-insensitive.
    doc = nlp(text.lower())

    # Runs of extra whitespace come back from spaCy as tokens of their own.
    # They are neither stop words nor punctuation, so they must be excluded
    # explicitly or their "lemmas" leak into the cleaned text as stray blanks.
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return ' '.join(lemmas)

# Clean every headline, one spaCy call per row, and store the result as a new column.
df_balanced['cleaned_headline'] = [
    preprocess_text_spacy(headline) for headline in df_balanced['headline']
]

# Compare raw and cleaned text side by side for the first few rows.
df_balanced[['headline', 'cleaned_headline']].head()

Unnamed: 0,headline,cleaned_headline
181516,Entrepreneurism: Lots of Little Traumas and No...,entrepreneurism lot little trauma big one
58552,Tesla Just Unveiled The Quickest Car You Can A...,tesla unveil quick car actually buy
155102,90 Percent Of Employers Tie Workers' Pay To Co...,90 percent employer tie worker pay company per...
71725,One Of Ben Carson's Craziest Ideas Is Coming True,ben carson crazy idea come true
80455,CEO Who Price Gouged HIV Drug Arrested For Sec...,ceo price gouge hiv drug arrest security fraud...


In [12]:
from sklearn.preprocessing import LabelEncoder

# Map each category name to an integer id. The fitted encoder is kept so the
# ids can be translated back to category names later.
label_encoder = LabelEncoder()
label_encoder.fit(df_balanced['category'])
df_balanced['category_num'] = label_encoder.transform(df_balanced['category'])

In [13]:
# Check the cleaned_headline and category_num columns on the first rows.
df_balanced.head()

Unnamed: 0,headline,category,cleaned_headline,category_num
181516,Entrepreneurism: Lots of Little Traumas and No...,BUSINESS,entrepreneurism lot little trauma big one,0
58552,Tesla Just Unveiled The Quickest Car You Can A...,BUSINESS,tesla unveil quick car actually buy,0
155102,90 Percent Of Employers Tie Workers' Pay To Co...,BUSINESS,90 percent employer tie worker pay company per...,0
71725,One Of Ben Carson's Craziest Ideas Is Coming True,BUSINESS,ben carson crazy idea come true,0
80455,CEO Who Price Gouged HIV Drug Arrested For Sec...,BUSINESS,ceo price gouge hiv drug arrest security fraud...,0


In [14]:
# Check the last rows as well; they come from the last concatenated sample (TRAVEL).
df_balanced.tail()

Unnamed: 0,headline,category,cleaned_headline,category_num
8879,14 Trips To Take From New York City On A Long ...,TRAVEL,14 trip new york city long weekend,3
157123,In Love With Sio Bak (Roast Pork),TRAVEL,love sio bak roast pork,3
176285,How To Spend A Weekend In San Diego,TRAVEL,spend weekend san diego,3
148363,Whither Ye Olde Stomping Grounds?,TRAVEL,ye olde stomp ground,3
155157,Escape From Notting Hill -- A London Staycatio...,TRAVEL,escape notte hill london staycation adventure,3


In [15]:
# Bare-expression display of the whole frame.
# NOTE(review): largely repeats the head()/tail() cells; consider dropping it.
df_balanced

Unnamed: 0,headline,category,cleaned_headline,category_num
181516,Entrepreneurism: Lots of Little Traumas and No...,BUSINESS,entrepreneurism lot little trauma big one,0
58552,Tesla Just Unveiled The Quickest Car You Can A...,BUSINESS,tesla unveil quick car actually buy,0
155102,90 Percent Of Employers Tie Workers' Pay To Co...,BUSINESS,90 percent employer tie worker pay company per...,0
71725,One Of Ben Carson's Craziest Ideas Is Coming True,BUSINESS,ben carson crazy idea come true,0
80455,CEO Who Price Gouged HIV Drug Arrested For Sec...,BUSINESS,ceo price gouge hiv drug arrest security fraud...,0
...,...,...,...,...
8879,14 Trips To Take From New York City On A Long ...,TRAVEL,14 trip new york city long weekend,3
157123,In Love With Sio Bak (Roast Pork),TRAVEL,love sio bak roast pork,3
176285,How To Spend A Weekend In San Diego,TRAVEL,spend weekend san diego,3
148363,Whither Ye Olde Stomping Grounds?,TRAVEL,ye olde stomp ground,3


In [16]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Hold out a test split of the cleaned headlines, stratified on the encoded
# category so the class proportions are preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['cleaned_headline'],
    df_balanced['category_num'],
    test_size=0.2,  # 20% of the samples go to the test set
    stratify=df_balanced['category_num'],
    random_state=2023,
)

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# TF-IDF features feeding a multinomial Naive Bayes classifier, bundled in one
# pipeline so vectorizing and predicting always happen together.
clf = Pipeline(
    steps=[
        ('vectorizer_tf', TfidfVectorizer()),
        ('multinb', MultinomialNB()),
    ]
)
clf.fit(X_train, y_train)

In [20]:
# Predict on the held-out split. The `_rf` suffix is only a variable name: the
# predictions come from the TF-IDF + MultinomialNB pipeline `clf`.
y_pred_rf = clf.predict(X_test)

# Header and report are emitted one per line, as two consecutive prints would.
print(
    "\nMultinomialNB - Classification Report:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)


MultinomialNB - Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       712
           1       0.86      0.93      0.89       713
           2       0.88      0.86      0.87       712
           3       0.90      0.88      0.89       713

    accuracy                           0.88      2850
   macro avg       0.88      0.88      0.88      2850
weighted avg       0.88      0.88      0.88      2850



In [22]:
import pickle
# Persist the fitted pipeline. It was trained on the cleaned headlines, so
# anything that loads it must apply the same preprocessing before predict().
with open('model_pipeline.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [23]:
# Persist the fitted label encoder alongside the model so predicted ids can be
# mapped back to category names.
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)