In [108]:
# https://www.kaggle.com/kinguistics/classifying-news-headlines-with-scikit-learn/notebook
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

In [109]:
# read to dataframe
news = pd.read_json("news_v02.json")

print(news.groupby(["topic"]).count())
news.sample(10)

                 title
topic                 
business        115967
entertainment   152469
health           45639
politics         32739
science & tech  108344
travel            9887


Unnamed: 0,title,topic
186779,Tesla to hand over its technology to other aut...,science & tech
171109,"Forget about Dre, Will.i.Am is going to make a...",science & tech
94155,Nikkei gains after shrugging off BOJ tankan; C...,business
438164,Donald Trump Jr. Says Leaked Boasts Of Assault...,politics
204415,"Samsung to launch Galaxy Core II, Galaxy Ace 4...",science & tech
460201,Help for Japan Travelers WiFi Woes,travel
112312,Scientific Games Acquisition Conference Call A...,business
37583,US Hot Stocks: Hot Stocks to Watch,business
449854,Florida Voter Purge Fiasco May Complicate Jeb ...,politics
54770,"China hits out at US, Japan over SCS stand off",business


In [110]:
def normalize_text(s):
    s = s.lower()
    
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W\s',' ',s)
    
    # make sure we didn't introduce any double spaces
    s = re.sub('\s+',' ',s)
    
    return s

news['text'] = [normalize_text(s) for s in news['title']]
print("done normalizing text")
news.sample(10)

done normalizing text


Unnamed: 0,title,topic,text
167052,Oracle wins key reversal in Java copyright cas...,science & tech,oracle wins key reversal in java copyright cas...
404878,San Diego ranks 8th in American Fitness Index,health,san diego ranks 8th in american fitness index
343243,Kim combines nude strapless dress with ... har...,entertainment,kim combines nude strapless dress with . hard ...
901,China Stocks Seen Rallying 24% at Goldman on V...,business,china stocks seen rallying 24 at goldman on va...
416277,2-year-old amputee takes first steps with walk...,health,2-year-old amputee takes first steps with walk...
404151,Find Out How Michelle Obama Just Made This Fiv...,health,find out how michelle obama just made this fiv...
191332,Amazon Expected To Announce Smartphone June 18,science & tech,amazon expected to announce smartphone june 18
88290,"Stocks edge lower in midday trade, bringing in...",business,stocks edge lower in midday trade bringing ind...
9160,Nearly all major US banks pass new round of 's...,business,nearly all major us banks pass new round of st...
257309,"Boot up: Fire TV review, Android first? , fast...",entertainment,boot up fire tv review android first fast phon...


In [111]:
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(news['text'])

encoder = LabelEncoder()
y = encoder.fit_transform(news['topic'])

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print("done vectorizing text")

done vectorizing text


In [112]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

print(nb.score(x_test, y_test))

print("\nDONE!")

0.9032459224376136

DONE!
