In [1]:
from sklearn.feature_extraction.text import CountVectorizer 

In [7]:
# Fit a vectorizer that extracts every 1- to 4-word n-gram from a sample
# sentence, then show the learned vocabulary (n-gram -> column index).
v = CountVectorizer(ngram_range=(1, 4)).fit(
    ["I love to play with data and its forms"]
)
v.vocabulary_

{'love': 10,
 'to': 18,
 'play': 14,
 'with': 22,
 'data': 3,
 'and': 0,
 'its': 8,
 'forms': 7,
 'love to': 11,
 'to play': 19,
 'play with': 15,
 'with data': 23,
 'data and': 4,
 'and its': 1,
 'its forms': 9,
 'love to play': 12,
 'to play with': 20,
 'play with data': 16,
 'with data and': 24,
 'data and its': 5,
 'and its forms': 2,
 'love to play with': 13,
 'to play with data': 21,
 'play with data and': 17,
 'with data and its': 25,
 'data and its forms': 6}

In [8]:
# Tiny toy corpus used to demonstrate preprocessing and n-gram vectorization.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [12]:
# Preprocessing: load the spaCy `en_core_web_sm` pipeline once so that the
# same `nlp` object can be reused for every document.
import spacy
nlp = spacy.load("en_core_web_sm")
def preprocessing(text):
    """Return `text` reduced to its lemmas, joined by single spaces.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; the lemma of every
    remaining token is kept in its original order.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)


In [13]:
# Apply the preprocessing step to every document of the toy corpus.
corpus_processed = list(map(preprocessing, corpus))

In [14]:
# Show the corpus after stop-word/punctuation removal and lemmatization.
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [15]:
# Learn the 1- to 4-gram vocabulary of the preprocessed corpus and show it.
v = CountVectorizer(ngram_range=(1, 4)).fit(corpus_processed)
v.vocabulary_

{'thor': 8,
 'eat': 0,
 'pizza': 6,
 'thor eat': 9,
 'eat pizza': 1,
 'thor eat pizza': 10,
 'loki': 2,
 'tall': 7,
 'loki tall': 5,
 'loki eat': 3,
 'loki eat pizza': 4}

In [None]:
# Vectorization: turn text into n-gram count vectors using the fitted vocabulary.

In [16]:
# Encode one document as a dense row of n-gram counts; each column position
# corresponds to the index stored for that n-gram in v.vocabulary_.
v.transform(['thor eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1]], dtype=int64)

In [18]:
# Load the news dataset from a JSON file (relative path), then print its
# dimensions and preview the first rows.
import pandas as pd
df = pd.read_json("news_dataset.json")
print(df.shape)
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [25]:
# Count the rows per category to see how balanced the classes are.
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [26]:
# The dataset is imbalanced, so we undersample: every category is reduced to the size of the smallest one (SCIENCE, 1381 rows).

In [28]:
# Undersample every category down to the size of the rarest one so the
# classes end up balanced. The target size is computed from the data rather
# than hard-coded; for the current dataset it evaluates to 1381 (SCIENCE).
min_sample = int(df.category.value_counts().min())


def _undersample(category):
    """Return `min_sample` randomly chosen rows of `df` for one category."""
    # A fixed random_state keeps the draw reproducible across runs.
    return df[df.category == category].sample(n=min_sample, random_state=1000)


df_business = _undersample("BUSINESS")
df_sports = _undersample("SPORTS")
df_crime = _undersample("CRIME")
df_science = _undersample("SCIENCE")

In [34]:
# Stack the four per-category samples into a single balanced frame
# (pd.concat keeps each row's original index label).
df_balanced = pd.concat([df_business,df_sports,df_crime,df_science])

In [35]:
# Verify that every category now has the same number of rows.
df_balanced['category'].value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [37]:
# Check the type of the text column (a pandas Series).
type(df_balanced.text)

pandas.core.series.Series

In [36]:
from sklearn.model_selection import train_test_split

In [38]:
# Encode each category name as an integer class label in a new column.
df_balanced["category_num"] = df_balanced["category"].map(
    {"BUSINESS": 0, "SPORTS": 1, "CRIME": 2, "SCIENCE": 3}
)

In [39]:
# Preview the frame to confirm the new category_num column.
df_balanced.head()

Unnamed: 0,text,category,category_num
5318,The Job Market Is Still Years Away From A Full...,BUSINESS,0
6286,Establishing a Solid Legal Foundation for Your...,BUSINESS,0
3320,"Gender Diversity on Boards: Good, Bad or Indif...",BUSINESS,0
2844,Volunteering Surprisingly Makes You Feel Like ...,BUSINESS,0
9903,How Managers Can Hire Employees More Effectively,BUSINESS,0


In [48]:
# Hold out 20% of the balanced data for testing; stratifying on the label
# keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=1000,
    stratify=df_balanced["category_num"],
)

In [49]:
# Number of training documents (80% of the balanced data).
X_train.shape


(4419,)

In [50]:
# Peek at a few raw training texts.
X_train.head()

8669     Here's How Long It Would Take To Fall Through ...
10962    Tony Hawk Does A '900' At Age 48 Because He's ...
9543     10 Things Entrepreneurs Do Differently Than Me...
6728     In A Win For Waymo, Judge Rules Uber Lawsuit W...
588      Matt Barnes Is Probably Going To Hear More ‘De...
Name: text, dtype: object

In [51]:
# Thanks to stratify, every class contributes (almost) the same number of
# training rows.
y_train.value_counts()

1    1105
0    1105
2    1105
3    1104
Name: category_num, dtype: int64

In [52]:
# The held-out split is balanced across the classes as well.
y_test.value_counts()

3    277
2    276
1    276
0    276
Name: category_num, dtype: int64