In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary that contains both unigrams and bigrams (1-gram + 2-gram BOW).
cv = CountVectorizer(ngram_range=(1, 2)).fit(
    ["Thor Hathodewala is looking for a job"]
)
cv.vocabulary_

{'thor': 9,
 'hathodewala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodewala': 10,
 'hathodewala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [4]:
# Toy three-sentence corpus used to demonstrate preprocessing and n-gram vocabularies.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza",
]

In [5]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline once; the resulting `nlp` callable is
# what preprocess() uses to turn raw text into tokens.
nlp = spacy.load("en_core_web_sm")

In [13]:
# Text preprocessing applied before model training: lemmatization plus removal
# of stop words, punctuation and whitespace-only tokens.
def preprocess(text):
    """Reduce `text` to its informative lemmas, joined by single spaces.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words, punctuation or pure whitespace are dropped, and
    the lemma of every remaining token is kept in its original order.

    Args:
        text: The raw string to preprocess.

    Returns:
        A single string with the surviving lemmas separated by one space
        (an empty string when no token survives the filter).
    """
    doc = nlp(text)
    # Whitespace tokens (e.g. from double spaces or newlines) are neither stop
    # words nor punctuation, so they must be excluded explicitly; otherwise
    # they end up as stray blanks in the joined result.
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(filtered_tokens)

In [16]:
preprocess(corpus[0])

'thor eat pizza'

In [19]:
# Run every sentence of the toy corpus through preprocess() and keep the results in a new list.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [21]:
# Build a unigram + bigram count vocabulary from the preprocessed sentences.
cv = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
cv.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [23]:
# Encode a new sentence as bag-of-words counts using the already fitted CountVectorizer.
cv.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [25]:
# Out-of-vocabulary (OOV) example: "hulk" and "hulk eat" are not in the fitted
# vocabulary, so only three features are set to 1 here instead of the five set
# for "Thor eat pizza".
cv.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [26]:
import pandas as pd

# Load the news dataset from JSON and preview its first rows
# (the preview shows a `text` and a `category` column).
df = pd.read_json("news_dataset.json")

df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [27]:
df["text"].iloc[0]

"Watching Schrödinger's Cat Die University of California, Berkeley, physicists have, for the first time, showed that, in fact, it's possible to follow the metaphorical cat through the whole process, whether he lives or dies in the end."

In [28]:
df.shape

(12695, 2)

In [30]:
#Determining Counts Of Each Category in order to identify imbalance in Dataset:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [31]:
# Under-sampling: remove the class imbalance by randomly drawing, from every
# category, only as many rows as the smallest category contains.

# Size of the rarest category, derived from the data rather than hard-coded
# (for this dataset it evaluates to 1381, the SCIENCE count).
min_samples = int(df.category.value_counts().min())

# A fixed random_state keeps the sampled rows identical across re-runs.
df_business = df[df.category=="BUSINESS"].sample(min_samples, random_state=2022)
df_sports = df[df.category=="SPORTS"].sample(min_samples, random_state=2022)
df_crime = df[df.category=="CRIME"].sample(min_samples, random_state=2022)
df_science = df[df.category=="SCIENCE"].sample(min_samples, random_state=2022)

In [35]:
# Stack the four equally sized category samples row-wise (axis=0) into one balanced frame.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis=0,
)

In [36]:
df_balanced.category.value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [37]:
# Encode each category label as an integer, because the classifier needs numeric targets.
target = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}

df_balanced['category_num'] = df_balanced['category'].map(target)

In [38]:
df_balanced.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [43]:
#Splitting Dataset Into Training and Testing Subsets:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw text for testing; stratifying on the label keeps the
# class proportions the same in the training and the testing subset.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category_num"],
)

In [40]:
X_train.shape

(4419,)

In [42]:
# NOTE(review): a cell only displays its last expression, so the y_train counts
# on the next line are computed but never shown; the output of this cell
# belongs to y_test only.
y_train.value_counts()
y_test.value_counts()

category_num
1    277
0    276
3    276
2    276
Name: count, dtype: int64

In [49]:
#Training the model to predict category based on text:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Pipeline: unigram (1-gram) bag-of-words counts -> multinomial Naive Bayes.
nb_classifier = Pipeline(steps=[
    ("vectorizer_bow", CountVectorizer()),
    ("multi_nb_model", MultinomialNB()),
])

# Fit on the training split, then predict labels for the held-out test split.
y_pred = nb_classifier.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.87      0.81       276
           1       0.93      0.80      0.86       277
           2       0.83      0.90      0.86       276
           3       0.90      0.80      0.85       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [50]:
#Training the model to predict category based on text:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Pipeline: combined 1-gram and 2-gram bag-of-words counts -> multinomial Naive Bayes.
nb_classifier = Pipeline(steps=[
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("multi_nb_model", MultinomialNB()),
])

# Fit on the training split, then predict labels for the held-out test split.
y_pred = nb_classifier.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.90      0.78       276
           1       0.95      0.74      0.83       277
           2       0.82      0.88      0.85       276
           3       0.92      0.78      0.84       276

    accuracy                           0.82      1105
   macro avg       0.85      0.82      0.83      1105
weighted avg       0.85      0.82      0.83      1105



In [51]:
X_test[:5]

3716     African Nation Slaps Exxon With Fine Nearly 7 ...
608      These Cringe-Worthy Stories Show It Can Be Har...
11172    LISTEN: The Accidental Discovery That Proved T...
1346     Build Loyalty -- The Cost -- $00.00 Remember y...
1356     Man Killed By Michigan Police Wasn't Targeting...
Name: text, dtype: object

In [52]:
y_test[:5]

3716     0
608      3
11172    3
1346     0
1356     2
Name: category_num, dtype: int64

In [53]:
y_pred[:5]

array([0, 0, 3, 0, 2], dtype=int64)

### Category encoding used in the outputs above: {'BUSINESS':0,'SPORTS':1,'CRIME':2,'SCIENCE':3}

In [54]:
# Store the preprocess() output of every article in a new column so the model can be trained on it.
df_balanced["preprocessed_text"] = df_balanced["text"].apply(preprocess)

In [55]:
df_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_text
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0,GCC Business Leaders remain Confident Face Reg...
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0,Honest Review employee wake morning love impor...
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0,Mike McDerment ceo FreshBooks Talks give build...
502,How to Market Your Business While Traveling th...,BUSINESS,0,market business travel World recently amazing ...
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0,Leverage intuition decision making feel safe r...


In [56]:
#Splitting Dataset Into Training and Testing Subsets:
from sklearn.model_selection import train_test_split

# Hold out 20% of the preprocessed text for testing; stratifying on the label
# keeps the class proportions the same in the training and the testing subset.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_text"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category_num"],
)

In [60]:
#Training the model to predict category based on preprocessed_text:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Pipeline: combined 1-gram and 2-gram bag-of-words counts -> multinomial Naive Bayes,
# this time fed with the preprocessed text.
nb_classifier = Pipeline(steps=[
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("multi_nb_model", MultinomialNB()),
])

# Fit on the training split, then predict labels for the held-out test split.
y_pred = nb_classifier.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       276
           1       0.92      0.82      0.87       277
           2       0.83      0.92      0.87       276
           3       0.90      0.81      0.85       276

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105

