In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Read the news dataset from its JSON file into a DataFrame.
df = pd.read_json('news_dataset.json')

# Report the (rows, columns) shape, then preview the first five records.
print(df.shape)
df.head(n=5)


(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [16]:
# Count how many rows fall into each category to check class balance.
df['category'].value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [21]:
# Balance the classes by downsampling every category to the size of the
# rarest one. The size is derived from the data rather than hard-coded, so the
# cell stays correct if the dataset (and therefore the smallest class) changes.
min_sample = int(df.category.value_counts().min())


def sample_category(frame, category, n, seed=2022):
    """Return ``n`` randomly sampled rows of ``frame`` for a single category.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a ``category`` column to filter on.
    category : str
        Category label whose rows are sampled.
    n : int
        Number of rows to draw (without replacement).
    seed : int, default 2022
        Passed to ``DataFrame.sample`` as ``random_state`` for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``n`` exceeds the number of rows
        available for ``category``.
    """
    return frame[frame.category == category].sample(n, random_state=seed)


# One equally sized sample per category; the names are used by later cells.
df_business = sample_category(df, 'BUSINESS', min_sample)
df_sports = sample_category(df, 'SPORTS', min_sample)
df_crime = sample_category(df, 'CRIME', min_sample)
df_science = sample_category(df, 'SCIENCE', min_sample)


In [26]:
# Stack the four per-category samples row-wise into a single balanced frame.
df_balance = pd.concat(
    objs=[df_business, df_sports, df_crime, df_science],
    axis='index',
)

# Show the per-category row counts of the combined frame.
df_balance['category'].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [27]:
# Preview the first five rows of the original (unbalanced) frame.
df.head(5)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [29]:
# Preview the first five rows of the balanced frame.
df_balance.head(5)

Unnamed: 0,text,category
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS
2912,From the Other Side; an Honest Review from Emp...,BUSINESS
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS
502,How to Market Your Business While Traveling th...,BUSINESS
5279,How to Leverage Intuition in Decision-making I...,BUSINESS


In [30]:
# Encode each category label as an integer class id in a new column.
df_balance['category_num'] = df_balance['category'].map(
    {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}
)

In [31]:
# Preview the balanced frame, now including the category_num column.
df_balance.head(n=5)

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [32]:
# Count rows per integer class id.
df_balance['category_num'].value_counts()

category_num
0    1381
1    1381
2    1381
3    1381
Name: count, dtype: int64

In [34]:
# List the distinct class ids present in the category_num column.
df_balance['category_num'].unique()

array([0, 1, 2, 3], dtype=int64)

In [36]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split: 22% of the balanced rows go to the test set, and
# stratifying on category_num keeps class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df_balance['text'],
    df_balance['category_num'],
    test_size=0.22,
    stratify=df_balance['category_num'],
    random_state=42,
)

In [38]:
# Print the shapes of the training texts and training labels, one per line.
print(X_train.shape, y_train.shape, sep='\n')

(4308,)
(4308,)


In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Build the pipeline: bag-of-words counts followed by multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),  # ngram_range (1, 1): unigrams only
    ('Multi NB', MultinomialNB()),
])

# 2-3. Fit on the training split (fit returns the pipeline itself), then
#      predict labels for the test split and keep them in y_pred.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report for the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       304
           1       0.92      0.84      0.88       304
           2       0.86      0.89      0.88       304
           3       0.93      0.83      0.88       304

    accuracy                           0.87      1216
   macro avg       0.88      0.87      0.87      1216
weighted avg       0.88      0.87      0.87      1216

