## Bag of n -grams

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
vector = CountVectorizer(ngram_range=(1,2)) # ngram_range(min,max grams)
vector.fit(['Steve ate pizza'])
vector.vocabulary_

{'steve': 3, 'ate': 0, 'pizza': 2, 'steve ate': 4, 'ate pizza': 1}

In [5]:
corpus = [
    'Alex ate pizza',
    'Steve is tall',
    'Villager is eating pizza'
]

In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [16]:
def remove_stop_words(text):
  doc = nlp(text)
  return ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_stop])


In [17]:
corpus_processed = [remove_stop_words(text) for text in corpus]
print(corpus_processed)

['Alex eat pizza', 'Steve tall', 'villager eat pizza']


In [19]:
vector = CountVectorizer(ngram_range=(1,2))
vector.fit(corpus_processed)
vector.vocabulary_

{'alex': 0,
 'eat': 2,
 'pizza': 4,
 'alex eat': 1,
 'eat pizza': 3,
 'steve': 5,
 'tall': 7,
 'steve tall': 6,
 'villager': 8,
 'villager eat': 9}

In [21]:
vector.transform(['Alex eat pizza']).toarray() # text to vector and converting it to an array

array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])

In [22]:
#limitation of bag of words oov problem out of vocabulary problem
# limitation 2: not suitable for large number of words

## News Category Classifier

In [25]:
import pandas as pd

In [30]:
df = pd.read_json('news_data.json',lines=True)

In [31]:
df.shape

(124989, 6)

In [32]:
df.head(5)

Unnamed: 0,short_description,headline,date,link,authors,category
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,2018-05-26,https://www.huffingtonpost.com/entry/will-smit...,Andy McDonald,ENTERTAINMENT
2,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 57,2018-05-26,https://www.huffingtonpost.com/entry/hugh-gran...,Ron Dicker,ENTERTAINMENT
3,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,2018-05-26,https://www.huffingtonpost.com/entry/jim-carre...,Ron Dicker,ENTERTAINMENT
4,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,2018-05-26,https://www.huffingtonpost.com/entry/julianna-...,Ron Dicker,ENTERTAINMENT


In [34]:
df['category'].value_counts() # imbalanced data

POLITICS          32739
ENTERTAINMENT     14257
HEALTHY LIVING     6694
QUEER VOICES       4995
BUSINESS           4254
SPORTS             4167
COMEDY             3971
PARENTS            3955
BLACK VOICES       3858
THE WORLDPOST      3664
WOMEN              3490
CRIME              2893
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
IMPACT             2602
WORLDPOST          2579
RELIGION           2556
STYLE              2254
WORLD NEWS         2177
TRAVEL             2145
TASTE              2096
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
SCIENCE            1381
ARTS & CULTURE     1339
TECH               1231
COLLEGE            1144
LATINO VOICES      1129
EDUCATION          1004
Name: category, dtype: int64

In [38]:
# using under sampling to remove class imbalance
min_samples = 1004
df[df['category']=='BUSINESS'].sample(min_samples,random_state=0) # to randomly select min_samples rows

Unnamed: 0,short_description,headline,date,link,authors,category
87307,Do you work for a boss or with a leader? Are y...,The Need for More Leaders and Less Bosses,2015-06-26,https://www.huffingtonpost.com/entry/the-need-...,"Eric Sheninger, ContributorSenior Fellow, Inte...",BUSINESS
89168,"Like with anything, there's good and bad. Not ...",What Murphy Showed Me on Memorial Day that Eve...,2015-06-04,https://www.huffingtonpost.com/entry/what-murp...,"Michael Goldberg, ContributorAuthor, speaker, ...",BUSINESS
58216,,New Apple Store Makeover: Looks Aren't Everything,2016-05-21,https://www.huffingtonpost.com/entry/new-apple...,"Ira Kalb, ContributorAssistant Professor of Cl...",BUSINESS
105594,This week campaigners against cluster munition...,Banks Behaving Badly: Are You Funding Cluster ...,2014-11-27,https://www.huffingtonpost.com/entry/banks-beh...,"Amy Little, ContributorExecutive Lead, Global ...",BUSINESS
64275,Our biases nearly always get the best of us.,The Financial Crisis Film 'Boom Bust Boom' Fal...,2016-03-12,https://www.huffingtonpost.com/entry/boom-bust...,Shane Ferro,BUSINESS
...,...,...,...,...,...,...
51504,This marks the second straight month of strong...,"America Added 255,000 Jobs In July",2016-08-05,https://www.huffingtonpost.com/entry/july-jobs...,"Lucia Mutikani, Reuters",BUSINESS
104810,"While any one month's results from these ""high...",First Impressions: Strong Payroll Gains Mark A...,2014-12-06,https://www.huffingtonpost.com/entry/first-imp...,"Jared Bernstein, ContributorFmr. Obama adminis...",BUSINESS
46508,,Remote Work Can Work For Tech,2016-10-01,https://www.huffingtonpost.com/entry/remote-wo...,"Alaina Percival, ContributorCEO at Women Who C...",BUSINESS
37182,"All 110 stores, as well as the company's Los A...",American Apparel Stores To Close After Canadia...,2017-01-15,https://www.huffingtonpost.com/entry/american-...,Nina Golgowski,BUSINESS


In [43]:
new_df = None
for category in df.columns:
  if category in ['BUSINESS','SPORTS','CRIME','SCIENCE']:
    new_df = pd.concat(new_df,df[category])
    df = new_df


In [47]:
df.shape

(124989, 6)