In [58]:
# https://www.kaggle.com/kinguistics/classifying-news-headlines-with-scikit-learn/notebook
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#from joblib import dump, load
try:
    # joblib is a standalone package; scikit-learn >= 0.23 no longer bundles it
    import joblib
except ImportError:
    # fallback for old environments that only ship the bundled copy
    from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from wordcloud import ImageColorGenerator, STOPWORDS, WordCloud

In [59]:
# Load the headline dataset from disk into a DataFrame.
news = pd.read_json("news_v02.json")

# Report the total number of rows, then the number of titles per topic.
print("title count:", news.shape[0])
print(news.groupby(["topic"]).count())

# Preview a random handful of rows (a different sample on every run).
news.sample(n=10)

title count: 59322
                title
topic                
business         9887
entertainment    9887
health           9887
politics         9887
science & tech   9887
travel           9887


Unnamed: 0,title,topic
26717,Manresa scores big with three 2014 James Beard...,entertainment
25712,Traditional Irish food in time for St. Patrick...,entertainment
8084,"UPDATE 1-Kocherlakota, Fed's lone dissenter, b...",business
22102,Noah has a super-awkward family meeting in thi...,entertainment
31595,"Dozens Of Groups Meet in Washington, D.C. to C...",health
20876,Lena Dunham's “Saturday Night Live” Posts Weak...,entertainment
53680,Volunteering Abroad With Kids: Is a Volunteer ...,travel
1718,"Fannie, Freddie profits surprise",business
48759,Just What The Middle East Needs -- $110 Billio...,politics
45867,Thursday's Morning Email: Government Shutdown ...,politics


In [60]:
def normalize_text(s):
    """Lower-case a headline and strip punctuation that is not word-internal.

    Runs of non-word characters are removed when they touch whitespace or
    either end of the string; punctuation enclosed by word characters on both
    sides (e.g. the apostrophe in "fed's" or the hyphens in
    "state-of-the-art") is kept.  Whitespace is then collapsed to single
    spaces and trimmed from both ends.

    Parameters
    ----------
    s : str
        Raw headline text.

    Returns
    -------
    str
        The normalized text; an empty string if nothing but punctuation and
        whitespace was supplied.
    """
    s = s.lower()

    # Raw strings keep \s and \W from being parsed as (invalid) string escapes.
    # "\W+" consumes whole runs of punctuation, and the ^ / $ alternatives
    # treat the string boundaries like whitespace so leading and trailing
    # punctuation is removed as well.
    s = re.sub(r'(?:^|\s)\W+', ' ', s)
    s = re.sub(r'\W+(?:\s|$)', ' ', s)

    # Collapse any whitespace runs introduced above and drop edge padding.
    s = re.sub(r'\s+', ' ', s)

    return s.strip()

# Store a normalized copy of every headline in a new 'text' column.
news['text'] = news['title'].map(normalize_text)
print("done normalizing text")

# Preview a random handful of rows to eyeball title vs. normalized text.
news.sample(n=10)

done normalizing text


Unnamed: 0,title,topic,text
40888,Trump Loses Bid To Dismiss Accuser's Defamatio...,politics,trump loses bid to dismiss accuser's defamatio...
8162,Wall St ends lower as biotechs fall,business,wall st ends lower as biotechs fall
47188,Net Neutrality Is A Class Issue,politics,net neutrality is a class issue
45624,The Racism Heard Round The World,politics,the racism heard round the world
220,Hackers accuse Mt. Gox of pocketing users' Bit...,business,hackers accuse mt gox of pocketing users bitcoins
35482,"Measles outbreak in Orange County, Calif. lead...",health,measles outbreak in orange county calif leads ...
13282,"Video Game Sales Up in Feb, Software Sales Dismal",science & tech,video game sales up in feb software sales dismal
44166,John Kelly And The History That Never Was,politics,john kelly and the history that never was
18207,How to see your first tweet,science & tech,how to see your first tweet
36403,"Benton, Franklin counties ranked 13th and 18th...",health,benton franklin counties ranked 13th and 18th ...


In [61]:
# Turn each normalized headline into a bag-of-words count vector.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(news['text'])
# To persist the fitted vectorizer for later reuse, uncomment:
#joblib.dump(vectorizer, "vectorizer.joblib")

# Vocabulary learned by the vectorizer; its keys feed the word-cloud cell.
vocab = vectorizer.vocabulary_

# Encode the topic names as integer class labels.
encoder = LabelEncoder()
y = encoder.fit_transform(news['topic'])

# Hold out 20% of the rows for testing.  A fixed seed keeps the split (and
# therefore every downstream score) identical across Restart & Run All.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)
print(x_test.shape)
print("DONE vectorizing text")

(11865, 25325)
DONE vectorizing text


In [None]:
# Fit a multinomial naive Bayes classifier on the training split.
nb = MultinomialNB()
nb.fit(x_train, y_train)

# Report mean accuracy on the held-out split so the fit is actually evaluated.
print(nb.score(x_test, y_test))

# Optional persistence round-trip, left disabled so the cell writes no files.
# Uncomment to save the model, reload it, and score the reloaded copy:
# joblib.dump(nb, "mnb-news_gCloud.joblib")
# nb_load = joblib.load("mnb-news_gCloud.joblib")
# print(nb_load.score(x_test, y_test))

print("\nDONE!")

In [None]:
# Word cloud built from the vectorizer's vocabulary.
# NOTE(review): every vocabulary term appears exactly once in the joined text,
# so word sizes presumably do not reflect how often a term occurs in the
# headlines -- confirm this is intended.
text = " ".join(vocab)

# Default stop words plus "will".
stop = set(STOPWORDS) | {"will"}
wordcloud = WordCloud(background_color="white", stopwords=stop).generate(text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()