In [1]:
import pandas as pd

In [2]:
# set the path to our file
path = 'NewsCategorizer.csv'
# convert our csv file to a pandas DataFrame
# converters allows pandas to know that abilities is a list, not a string
news = pd.read_csv(path)

In [3]:
news

Unnamed: 0,category,headline,links,short_description,keywords
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods
...,...,...,...,...,...
49995,SPORTS,This Baseball Team Learned There's A Wrong Way...,https://www.huffingtonpost.com/entry/san-jose-...,Many fans were pissed after seeing the minor l...,san-jose-giants-japanese-heritage-night
49996,SPORTS,Some Young Spurs Fan Dabbed 38 Times In A Sing...,https://www.huffingtonpost.com/entry/dab-kid-s...,"Never change, young man. Never change.",dab-kid-san-antonio-spurs
49997,SPORTS,Rasheed Wallace Ejected From Knicks-Suns Game ...,https://www.huffingtonpost.com/entry/rasheed-w...,Wallace was hit with a first technical for a h...,rasheed-wallace-ejected-knicks-suns-ball-dont-lie
49998,SPORTS,Why Jake Plummer And Other NFL Players Are Pus...,https://www.huffingtonpost.comhttp://extras.de...,They believe CBD could be an alternative to po...,


In [4]:
len(news.category.unique())

10

In [7]:
news.category.unique()

array(['WELLNESS', 'POLITICS', 'ENTERTAINMENT', 'TRAVEL',
       'STYLE & BEAUTY', 'PARENTING', 'FOOD & DRINK', 'WORLD NEWS',
       'BUSINESS', 'SPORTS'], dtype=object)

In [5]:
from stop_words import get_stop_words
# if "No module named 'stop_words'" error; yuo haven't imported a library, shut down everyhting and import in terminal

# {'first_keyword': shdfshdf, 'second_keyword': uagdfgafh, 'third_keyword': ousgeaofuqge}
def create_features(headline):
    # make all the words lowercase
    headline = headline.lower()
    # split the headline into a list of words
    words = headline.split()
    # using list comprehension to create a list of keywords
    keywords = [ x for x in words if x not in get_stop_words('english') ]
    # make sure all keyword lists have three elements
    while len(keywords) < 3:
        keywords.append('None')
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2]}

In [8]:
zipped_feature_and_labels = zip(news.headline, news.category)

In [10]:
# bring together the catergory(label), and the headline(features)

# we are creating a list of tuples
# tuples have parenthesies around them -> (create_features(headline), category)
feature_sets = [ (create_features(headline), category) for headline, category in zipped_feature_and_labels ]
feature_sets

[({'first_keyword': '143', 'second_keyword': 'miles', 'third_keyword': '35'},
  'WELLNESS'),
 ({'first_keyword': 'talking',
   'second_keyword': 'yourself:',
   'third_keyword': 'crazy'},
  'WELLNESS'),
 ({'first_keyword': 'crenezumab:',
   'second_keyword': 'trial',
   'third_keyword': 'will'},
  'WELLNESS'),
 ({'first_keyword': 'oh,',
   'second_keyword': 'difference',
   'third_keyword': 'made'},
  'WELLNESS'),
 ({'first_keyword': 'green',
   'second_keyword': 'superfoods',
   'third_keyword': 'None'},
  'WELLNESS'),
 ({'first_keyword': 'bad',
   'second_keyword': 'love',
   'third_keyword': 'advice:'},
  'WELLNESS'),
 ({'first_keyword': 'happiest',
   'second_keyword': '(and',
   'third_keyword': 'unhappiest)'},
  'WELLNESS'),
 ({'first_keyword': 'seaweed:',
   'second_keyword': 'green',
   'third_keyword': 'superfood'},
  'WELLNESS'),
 ({'first_keyword': 'addicted',
   'second_keyword': 'food?',
   'third_keyword': 'None'},
  'WELLNESS'),
 ({'first_keyword': 'high',
   'second_key

In [11]:
import random
random.shuffle(feature_sets)

In [12]:
import math

# calculating what 80% of our data set is
split_num = math.floor(len(feature_sets) * .8)

# 80% of my data
training_set = feature_sets[:split_num]
# 20% of my data
testing_set = feature_sets[split_num:]

In [13]:
import nltk

classifier = nltk.NaiveBayesClassifier.train(training_set)

In [14]:
classifier.classify(create_features("Resting is part of training. I've confirmed what I sort of already knew: I'm not built for running streaks. I'm built for hard workouts three to five days a week with lots of cross training, physical therapy and foam rolling. But I've also confirmed that I'm stubborn with myself."))

'ENTERTAINMENT'

In [18]:
print(nltk.classify.accuracy(classifier, testing_set))

0.5986


In [19]:
classifier.show_most_informative_features(12)

Most Informative Features
           third_keyword = 'recipes'      FOOD & : WELLNE =     86.2 : 1.0
          second_keyword = 'day:'         FOOD & : POLITI =     80.3 : 1.0
           first_keyword = 'recipe'       FOOD & : BUSINE =     79.6 : 1.0
           third_keyword = 'style'        STYLE  : ENTERT =     77.7 : 1.0
           first_keyword = 'women'        BUSINE : FOOD & =     73.0 : 1.0
          second_keyword = 'recipes'      FOOD & : TRAVEL =     71.3 : 1.0
           third_keyword = 'fashion'      STYLE  : PARENT =     64.0 : 1.0
           first_keyword = 'nfl'          SPORTS : POLITI =     60.9 : 1.0
          second_keyword = 'travel'       TRAVEL : POLITI =     60.1 : 1.0
           first_keyword = 'best'         FOOD & : WORLD  =     59.7 : 1.0
          second_keyword = 'business'     BUSINE : PARENT =     57.0 : 1.0
          second_keyword = 'best'         TRAVEL : WORLD  =     56.8 : 1.0
