<a href="https://colab.research.google.com/github/bifinbabu/nlp-jupyter-notebook/blob/main/bag_of_n_grams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn every unigram and bigram from one sentence, then show the resulting
# token -> column-index mapping.
v = CountVectorizer(ngram_range=(1, 2)).fit(
    ['Thor Hathodawala is looking for a job']
)
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [11]:
# Three short example sentences used to demonstrate preprocessing and n-gram vectorization.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [12]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
  """Strip stop words and punctuation from `text` and lemmatize what remains.

  The text is parsed with the notebook-level spaCy pipeline `nlp`. The lemmas
  of the surviving tokens are returned as one space-separated string.
  """
  kept_lemmas = [
    tok.lemma_
    for tok in nlp(text)
    if not (tok.is_stop or tok.is_punct)
  ]
  return " ".join(kept_lemmas)

In [13]:
# Quick check of preprocess on a single sentence.
preprocess("Thor ate pizza")

'thor eat pizza'

In [14]:
# Run every sentence of the toy corpus through preprocess.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [15]:
# Refit a unigram + bigram vectorizer, this time on the preprocessed corpus,
# and show its vocabulary.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [16]:
# Encode a sentence against the fitted vocabulary: each column is the count of
# one known unigram or bigram (column order follows the indices in v.vocabulary_).
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [17]:
# "hulk" is not in the fitted vocabulary, so neither it nor "hulk eat" is
# counted; only "eat", "pizza" and "eat pizza" are.
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

In [18]:
import pandas as pd

# Load the news dataset from a JSON file located in the working directory.
df = pd.read_json("news_dataset.json")

# (rows, columns) of the loaded frame.
print(df.shape)

# Preview the first rows.
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [19]:
# Rows per category, to check how balanced the classes are.
df.category.value_counts()


Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,4254
SPORTS,4167
CRIME,2893
SCIENCE,1381


In [20]:
# Undersample every category down to the size of the rarest one so that all
# classes are equally represented. The target size is read from the data rather
# than hard-coded, so this cell stays correct if the dataset changes (a stale
# constant would either raise in .sample() or leave the classes unbalanced).
min_samples = int(df.category.value_counts().min())

# A fixed random_state keeps the drawn subsets reproducible across runs.
df_business = df[df.category == "BUSINESS"].sample(min_samples, random_state=2022)
df_sports = df[df.category == "SPORTS"].sample(min_samples, random_state=2022)
df_crime = df[df.category == "CRIME"].sample(min_samples, random_state=2022)
df_science = df[df.category == "SCIENCE"].sample(min_samples, random_state=2022)

In [21]:
# Stack the four equally sized per-category samples row-wise into one frame,
# then confirm each category has the same number of rows.
per_category_frames = [df_business, df_sports, df_crime, df_science]
df_balanced = pd.concat(per_category_frames)
df_balanced["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,1381
SPORTS,1381
CRIME,1381
SCIENCE,1381


In [22]:
# Integer id assigned to each category label.
target = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}

# Add the numeric label as a new column alongside the original category.
df_balanced["category_num"] = df_balanced["category"].map(target)
df_balanced.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [23]:
# NOTE(review): `X` is not referenced anywhere in the visible notebook; this
# import looks like an editor auto-import artifact — confirm and remove.
from re import X
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for testing. Stratifying on the numeric
# label keeps the class proportions the same in both splits, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.text,
    df_balanced.category_num,
    test_size=0.2,
    stratify=df_balanced.category_num,
    random_state=2022
)

In [24]:
# Size of the training split, followed by a preview of its raw texts.
print(X_train.shape)
X_train.head()

(4419,)


Unnamed: 0,text
7589,Ovulating Women Prefer Images of Penetration O...
10442,Scientists Discover Spooky Influence On Baby N...
8792,Olympic Race Walker Steps Up To Propose To His...
1733,Beloved Bipedal Bear Named Pedals Believed Kil...
2526,"Elizabeth Smart Gave Birth To Baby Girl, Fathe..."


In [25]:
# Class distribution of the training labels (should be near-equal thanks to stratification).
y_train.value_counts()

Unnamed: 0_level_0,count
category_num,Unnamed: 1_level_1
3,1105
2,1105
0,1105
1,1104


In [26]:
# Class distribution of the test labels (should be near-equal thanks to stratification).
y_test.value_counts()

Unnamed: 0_level_0,count
category_num,Unnamed: 1_level_1
1,277
0,276
3,276
2,276


In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Baseline model: unigram bag-of-words counts fed into multinomial Naive Bayes.
# fit() returns the fitted pipeline, so construction and training are chained.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer()),
    ('Multi NB', MultinomialNB()),
]).fit(X_train, y_train)

# Predict on the held-out split and report per-class precision/recall/F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.87      0.81       276
           1       0.93      0.80      0.86       277
           2       0.83      0.90      0.86       276
           3       0.90      0.80      0.85       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Same classifier as the baseline, but the vectorizer now emits both unigrams
# and bigrams (ngram_range=(1, 2)); still trained on the raw text.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
]).fit(X_train, y_train)

# Predict on the held-out split and report per-class precision/recall/F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.90      0.78       276
           1       0.95      0.74      0.83       277
           2       0.82      0.88      0.85       276
           3       0.92      0.78      0.84       276

    accuracy                           0.82      1105
   macro avg       0.85      0.82      0.83      1105
weighted avg       0.85      0.82      0.83      1105



In [29]:
# First five test texts, for a side-by-side look at true labels vs. predictions.
X_test[:5]

Unnamed: 0,text
3716,African Nation Slaps Exxon With Fine Nearly 7 ...
608,These Cringe-Worthy Stories Show It Can Be Har...
11172,LISTEN: The Accidental Discovery That Proved T...
1346,Build Loyalty -- The Cost -- $00.00 Remember y...
1356,Man Killed By Michigan Police Wasn't Targeting...


In [30]:
# True labels of the first five test samples.
y_test[:5]

Unnamed: 0,category_num
3716,0
608,3
11172,3
1346,0
1356,2


In [31]:
# Predicted labels for the first five test samples (compare against y_test[:5]).
y_pred[:5]

array([0, 0, 3, 0, 2])

In [32]:
# Run every article through preprocess (stop-word/punctuation removal and
# lemmatization) and store the result in a new column.
df_balanced["preprocessed_text"] = df_balanced["text"].map(preprocess)

In [33]:
# Compare the raw text with its preprocessed version for the first rows.
df_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_text
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0,GCC Business Leaders remain Confident Face Reg...
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0,Honest Review employee wake morning love impor...
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0,Mike McDerment ceo FreshBooks Talks give build...
502,How to Market Your Business While Traveling th...,BUSINESS,0,market business travel World recently amazing ...
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0,Leverage intuition decision making feel safe r...


In [34]:
# NOTE(review): `X` is not referenced anywhere in the visible notebook; this
# import looks like an editor auto-import artifact — confirm and remove.
from re import X
from sklearn.model_selection import train_test_split

# Redo the 80/20 stratified split, this time on the preprocessed text. The
# same random_state and stratification column are used as for the raw-text split.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.preprocessed_text,
    df_balanced.category_num,
    test_size=0.2,
    stratify=df_balanced.category_num,
    random_state=2022
)

In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Unigram + bigram Naive Bayes pipeline again; X_train / X_test now hold the
# preprocessed text, so this measures the effect of preprocessing.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
]).fit(X_train, y_train)

# Predict on the held-out split and report per-class precision/recall/F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       276
           1       0.92      0.82      0.87       277
           2       0.83      0.92      0.87       276
           3       0.90      0.81      0.85       276

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105

