## The notebook which will make the model and export it in a pickle which will be later loaded by the web app

## Downloading popular nltk packages

In [1]:
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

True

## Importing needed libraries and loading the dataset

In [2]:
import pandas as pd
import numpy as np
import pickle
df = pd.read_json('data/News_Category_Dataset.json', lines=True)

## Filtering the dataframe to get the data needed

In [3]:
df = df.filter(['category', 'headline'])
df

Unnamed: 0,category,headline
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...
...,...,...
200848,TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...
200849,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...
200850,SPORTS,"Giants Over Patriots, Jets Over Colts Among M..."
200851,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...


## Mapping the topics to its relevance

In [4]:
relevance = {
    'ARTS': 1,
    'ARTS & CULTURE': 0,
    'BLACK VOICES': 0,
    'BUSINESS': 1,
    'COLLEGE': 0,
    'COMEDY': 0,
    'CRIME': 0,
    'CULTURE & ARTS': 0,
    'DIVORCE': 0,
    'EDUCATION': 1,
    'ENTERTAINMENT': 0,
    'ENVIRONMENT': 1,
    'FIFTY': 0,
    'FOOD & DRINK': 1,
    'GOOD NEWS': 1,
    'GREEN': 1,
    'HEALTHY LIVING': 1,
    'HOME & LIVING': 1,
    'IMPACT': 0,
    'LATINO VOICES': 0,
    'MEDIA': 0,
    'MONEY': 1,
    'PARENTING': 0,
    'PARENTS': 0,
    'POLITICS': 1,
    'QUEER VOICES': 0,
    'RELIGION': 0,
    'SCIENCE': 1,
    'SPORTS': 1,
    'STYLE': 0,
    'STYLE & BEAUTY': 1,
    'TASTE': 1,
    'TECH': 1,
    'THE WORLDPOST': 0,
    'TRAVEL': 1,
    'WEDDINGS': 0,
    'WEIRD NEWS': 0,
    'WELLNESS': 1,
    'WOMEN': 0,
    'WORLD NEWS': 0,
    'WORLDPOST': 0,
}
df['relevance'] = df.category.map(relevance)
df

Unnamed: 0,category,headline,relevance
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,0
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,0
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,0
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,0
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,0
...,...,...,...
200848,TECH,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,1
200849,SPORTS,Maria Sharapova Stunned By Victoria Azarenka I...,1
200850,SPORTS,"Giants Over Patriots, Jets Over Colts Among M...",1
200851,SPORTS,Aldon Smith Arrested: 49ers Linebacker Busted ...,1


## Function which tokenizes, stems and lemmatizes given string

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

def extract_words(text):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(text.lower()) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            stemmed = ps.stem(w)
            lemmed = lemmatizer.lemmatize(stemmed)
            filtered_sentence.append(lemmed) 
    return filtered_sentence

## Functions which will build the bagofwords

In [6]:
def map_book(hash_map, tokens):
    if tokens is not None:
        for word in tokens:
            # Word Exist?
            if word in hash_map:
                hash_map[word] += 1
            else:
                hash_map[word] = 1

        return hash_map
    else:
        return None
    
def make_hash_map(df):
    hash_map = {}
    for index, row in df.iterrows():
        hash_map = map_book(hash_map, extract_words(row['headline']))
    return hash_map

def frequent_vocab(word_freq, max_features): 
    counter = 0  
    vocab = [] 
    for key, value in sorted(word_freq.items(), key=lambda item: (item[1], item[0]), reverse=True): 
        if counter<max_features: 
            vocab.append(key)
            counter+=1
        else: break
    return vocab

def bagofwords(sentence, words):
    sentence_words = extract_words(sentence)
    bag = np.zeros(len(words))
    for sw in sentence_words:
        for i,word in enumerate(words):
            if word == sw: 
                bag[i] += 1
                
    return np.array(bag)

## Make a variable which will hold the frequent vocabs in the hashmap
And dump it in a pickle file to be later loaded by the web app

In [7]:
hash_map = make_hash_map(df) 

vocab=frequent_vocab(hash_map, 500)

print(vocab)
pickle.dump(vocab, open('web/vocab.pickle', 'wb'))

[',', ':', "'s", "'", '(', ')', 'trump', 'photo', '?', 'new', "n't", 'video', 'say', '’', 'day', 'make', 'get', 'woman', '.', 'donald', 'way', 'best', 'year', '5', 'show', 'world', '!', 'kid', 'time', 'one', '10', 'like', 'thing', 'love', 'peopl', 'look', 'want', 'need', 'week', 'life', 'take', 'help', 'first', 'obama', 'could', 'live', 'child', '--', 'u.s.', 'health', 'know', '&', 'wed', 'chang', 'man', 'parent', 'find', 'hous', 'american', 'home', 'call', 'clinton', 'mom', 'america', 'watch', 'back', 'food', 'may', 'go', 'state', 'black', 'u', 'report', 'gop', 'right', 'white', 'talk', 'work', 'famili', 'use', 'give', 'school', '$', 'celebr', 'top', 'studi', 'bill', 'babi', 'star', 'travel', 'presid', 'gay', 'plan', 'divorc', 'tip', 'kill', '7', 'hillari', 'good', 'polic', 'recip', '‘', 'girl', "'the", 'republican', 'come', 'realli', '3', 'fight', 'attack', "'re", 'stop', 'democrat', '...', 'reason', 'citi', 'fashion', 'big', 'still', 'death', 'think', 'care', '6', 'war', 'beauti', '

In [8]:
text = 'FBI found gold hidden in a old house'
bagofwords(text, vocab)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

## Building the bag of words

In [9]:
n_words = len(vocab)
n_docs = len(df)
bag_o = np.zeros([n_docs,n_words])
for ii in range(n_docs): 
    bag_o[ii,:] = bagofwords(df['headline'].iloc[ii], vocab) 

In [10]:
bag_o.shape

(200853, 500)

In [11]:
popular_words = sorted(hash_map, key = hash_map.get, reverse = True)
print(popular_words[:20])

[',', ':', "'s", "'", '(', ')', 'trump', 'photo', '?', 'new', "n't", 'video', 'say', '’', 'day', 'get', 'make', 'woman', '.', 'donald']


## Finding the idf and tfidf

In [12]:
numdocs, numwords = np.shape(bag_o)
N = numdocs
word_frequency = np.empty(numwords)

for word in range(numwords):
    word_frequency[word]=np.sum((bag_o[:,word]>0)) 

idf = np.log(N/word_frequency)
idf.shape
pickle.dump(idf, open('web/idf.pickle', 'wb'))

In [13]:
tfidf = np.empty([numdocs, numwords])
for doc in range(numdocs):
    tfidf[doc, :]=bag_o[doc, :]*idf

In [14]:
print (tfidf)

[[1.8677155  0.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.8071269  ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [1.8677155  0.         0.         ... 0.         0.         0.        ]
 [0.         1.70737285 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


## Code to build the model and export it in a pickle

In [15]:
from sklearn.linear_model import LogisticRegression #to import logistic regression model
from sklearn.model_selection import train_test_split #to split data into training and testing set
from sklearn.model_selection import GridSearchCV #to find out the best parameter for our model

In [16]:
X_train,X_test,y_train,y_test = train_test_split(tfidf,df['relevance'].values,shuffle=True)

In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(150639, 500)
(50214, 500)
(150639,)
(50214,)


In [18]:
logreg = LogisticRegression(solver = 'lbfgs')
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)
print (y_pred)
score = logreg.score(X_train, y_train)
print(score)
print('Accuracy of logistic regression classifier on training set: {:.3f}'.format(score))

[1 1 0 ... 1 1 1]
0.7277464667184461
Accuracy of logistic regression classifier on training set: 0.728


In [23]:
def classify(rf, X_all, y_all): 
    X_train,X_test,y_train,y_test = train_test_split(X_all,y_all,shuffle=True)
    logreg.fit(X_train,y_train) 
    print(rf.score(X_train,y_train)) 
    return logreg

In [24]:
logreg = LogisticRegression()
X_all = tfidf
y_all = df['relevance'].values
logreg = classify(logreg, X_all, y_all)
pickle.dump(logreg, open('web/logreg.pickle', 'wb'))

0.727846042525508
