In [2]:
import pandas as pd
# Read the news dataset from its JSON file and preview the first rows.
df = pd.read_json(path_or_buf='./data/news_dataset.json')
df.head(5)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(12695, 2)

In [4]:
# Number of rows per category, to see how imbalanced the classes are.
df['category'].value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [14]:
# Integer class id assigned to each news category name.
target = dict(BUSINESS=0, SPORTS=1, CRIME=2, SCIENCE=3)

In [16]:
# Add a numeric label column by looking each row's category up in `target`.
df['target'] = df['category'].map(target)
df.head(5)

Unnamed: 0,text,category,target
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2


In [17]:
# Undersample the larger classes so that every category has the same number of rows

In [18]:
# Size of the rarest category. Derived from the data instead of being
# hard-coded, so it stays correct if the dataset changes (for the counts
# shown above this is 1381, the SCIENCE class).
min_sample = int(df.category.value_counts().min())


def _undersample_category(frame, label, n, seed=2):
    """Return `n` randomly drawn rows of `frame` whose category is `label`.

    Parameters
    ----------
    frame : DataFrame with a `category` column.
    label : category value to keep.
    n     : number of rows to draw (without replacement).
    seed  : random_state passed to `DataFrame.sample` for reproducibility.

    Raises
    ------
    ValueError (from pandas) if the category has fewer than `n` rows.
    """
    return frame[frame.category == label].sample(n, random_state=seed)


# One equally sized random subset per category.
df_BUSINESS = _undersample_category(df, 'BUSINESS', min_sample)
df_SPORTS = _undersample_category(df, 'SPORTS', min_sample)
df_CRIME = _undersample_category(df, 'CRIME', min_sample)
df_SCIENCE = _undersample_category(df, 'SCIENCE', min_sample)

In [19]:
# Stack the per-category samples row-wise into one balanced frame.
df_under = pd.concat([
    df_BUSINESS,
    df_CRIME,
    df_SCIENCE,
    df_SPORTS,
])

In [20]:
# Confirm every category now has the same number of rows.
df_under['category'].value_counts()

BUSINESS    1381
CRIME       1381
SCIENCE     1381
SPORTS      1381
Name: category, dtype: int64

In [21]:
# Preview the first rows of the balanced frame.
df_under.head(5)

Unnamed: 0,text,category,target
5257,Building Brand Advocacy From the Inside Out To...,BUSINESS,0
11894,3 Numbers That Say A Lot About Donald Trump Th...,BUSINESS,0
11420,How to Inquire Productively (6.3) Productive i...,BUSINESS,0
8910,Immigration Crackdown Expected To Increase Cro...,BUSINESS,0
11442,A Guide to Getting the Most of Going to Confer...,BUSINESS,0


In [26]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced rows for evaluation. Stratifying on the label
# keeps the class proportions the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    df_under['text'],
    df_under['target'],
    test_size=0.2,
    random_state=2024,
    stratify=df_under['target'],
)

In [27]:
# Rows per class id in the training labels, to check the split stayed balanced.
y_train.value_counts()

3    1105
1    1105
2    1105
0    1104
Name: target, dtype: int64

In [28]:
# Using Bag of words -------------------------------------------------------------------------------------

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
# Two-step text classifier: unigram + bigram counts fed into Multinomial Naive Bayes.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('model', MultinomialNB()),
    ]
)
# Fit the vectorizer and the model on the training texts and labels.
clf.fit(x_train, y_train)

In [38]:
# Predict a class id for each held-out text using the pipeline `clf`.
y_pred = clf.predict(x_test)

In [39]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.71      0.90      0.79       277
           1       0.94      0.83      0.88       276
           2       0.91      0.89      0.90       276
           3       0.91      0.79      0.85       276

    accuracy                           0.85      1105
   macro avg       0.87      0.85      0.85      1105
weighted avg       0.87      0.85      0.85      1105

