# Model Testing 

In [8]:
import pandas as pd
import numpy as np
import regex as re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from afinn import Afinn

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm

from sklearn.linear_model import Lasso, LassoCV

from project4_function import *
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

from sklearn.pipeline import Pipeline

import time
import warnings
from datetime import date
import requests
now = time.time()

warnings.filterwarnings('ignore')


import matplotlib.pyplot as plt
%matplotlib inline

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Model-Testing" data-toc-modified-id="Model-Testing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Model Testing</a></span></li><li><span><a href="#Acquire-News-From-News-API-With-Keyword-:-&quot;flood&quot;" data-toc-modified-id="Acquire-News-From-News-API-With-Keyword-:-&quot;flood&quot;-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Acquire News From News API With Keyword : "flood"</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Importing-and-Cleaning-dataframe" data-toc-modified-id="Importing-and-Cleaning-dataframe-2.0.1"><span class="toc-item-num">2.0.1&nbsp;&nbsp;</span>Importing and Cleaning dataframe</a></span></li><li><span><a href="#second-sets-of-testing-data" data-toc-modified-id="second-sets-of-testing-data-2.0.2"><span class="toc-item-num">2.0.2&nbsp;&nbsp;</span>second sets of testing data</a></span></li><li><span><a href="#Preparing-Test-Data" data-toc-modified-id="Preparing-Test-Data-2.0.3"><span class="toc-item-num">2.0.3&nbsp;&nbsp;</span>Preparing Test Data</a></span></li></ul></li></ul></li><li><span><a href="#Training-Models" data-toc-modified-id="Training-Models-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training Models</a></span><ul class="toc-item"><li><span><a href="#Term-Frequency–Inverse-Document-Frequency-(TFIDF)" data-toc-modified-id="Term-Frequency–Inverse-Document-Frequency-(TFIDF)-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Term Frequency–Inverse Document Frequency (TFIDF)</a></span></li></ul></li><li><span><a href="#LogisticRegression:-Feature-Importance" data-toc-modified-id="LogisticRegression:-Feature-Importance-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>LogisticRegression: Feature Importance</a></span></li><li><span><a href="#MultinomialNB:-Feature-Importance" data-toc-modified-id="MultinomialNB:-Feature-Importance-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>MultinomialNB: Feature Importance</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Prediction-Dataframe" data-toc-modified-id="Prediction-Dataframe-5.0.1"><span class="toc-item-num">5.0.1&nbsp;&nbsp;</span>Prediction Dataframe</a></span></li></ul></li></ul></li></ul></div>

# Acquire News From News API With Keyword : "flood" 

In [9]:
def save_news (search_terms, file_name, n_pagesize, start_page, end_pages, save_to_csv): 
    '''
    term_request = which is the key word for search.
    save_to_csv = True indicates csv will be saved
    '''
    
    # API requests
    #for term in search_terms: 
    url = 'https://newsapi.org/v2/everything?'
        
    param = {
    #'country' : 'us',
    'q': search_terms,  #search term 
    'apiKey' : 'e685d6e1420f4882b86d029ed3c1a11d',
    'pageSize': n_pagesize, #max page
    'language': 'en'}
    print (search_terms)
        
    every_term = requests.get(url, params = param)

    articles = every_term.json()['articles'] 
    
    for page in range(start_page, end_pages): #go throught 10 times, and get more pages, 10 more pages
        param['page'] = page
        
        more_term = requests.get(url, params = param)
        more_term = more_term.json()['articles']
        
        articles.extend(more_term)
    arts = pd.DataFrame(articles)
    
    # Drop null and duplicate 
    arts.dropna(inplace=True)
    arts.drop_duplicates(subset=['content','description'],inplace = True)
    
    # Creahttp://localhost:8888/notebooks/dsi/Project4_Disaster_Test_Classification/code/NewAPI_exploration.ipynb#te columns
    arts['source_id'] = arts['source'].map(lambda x: x['id'])
    arts['source_name'] = arts['source'].map(lambda x: x['name']) #break up the source, source id, and name colums seperate
    arts.drop (columns = ['source'], axis=1)
    arts['types'] = str(search_terms)
    arts['yes_disaster'] = 1

    # Save df to csv
    if save_to_csv == True: 
        arts.to_csv('../data/'+str(file_name)+'_'+str(search_terms)+'_'+str(now) +'.csv' ,index = False, sep = ",") #index = False for no extra columns
        print (f'{len(articles)} unique news haved been saved')

In [10]:
# save_news ('flood', file_name ='e', n_pagesize=10, start_page=2, end_pages=25, save_to_csv=True)

In [11]:
save_news ('today', file_name ='e', n_pagesize=10, start_page=2, end_pages=25, save_to_csv=True)

today
240 unique news haved been saved


### Importing and Cleaning dataframe 

In [15]:
df = pd.read_csv('../data/e_flood_1556224208.220971.csv')
df.head()

Unnamed: 0,author,content,description,publishedAt,source,title,url,urlToImage,source_id,source_name,types,yes_disaster
0,Eric Holthaus,This story originally appeared on Grist and is...,Rapid collapse of Antarctic glaciers could flo...,2017-11-30T14:00:00Z,"{'id': 'wired', 'name': 'Wired'}",Two Melting Antarctic Glaciers Could Decide th...,https://www.wired.com/story/two-melting-glacie...,https://media.wired.com/photos/5a1f4e7f0cb1f52...,wired,Wired,flood,1
1,Patrick Allan,Flash floods can strike with almost no warning...,Flash floods can strike with almost no warning...,2018-06-11T19:00:00Z,"{'id': None, 'name': 'Lifehacker.com'}",Here's Everything You Need to Know to Survive ...,https://lifehacker.com/heres-everything-you-ne...,https://i.kinja-img.com/gawker-media/image/upl...,,Lifehacker.com,flood,1
2,Mike Butcher,The flood underinsurance problem is arguably t...,The flood underinsurance problem is arguably t...,2018-08-06T15:47:30Z,"{'id': 'techcrunch', 'name': 'TechCrunch'}",FloodFlash insurance startup raises £1.9M via ...,http://techcrunch.com/2018/08/06/floodflash-in...,https://techcrunch.com/wp-content/uploads/2017...,techcrunch,TechCrunch,flood,1
3,Marcello Rossi,This story originally appeared on CityLab and ...,"Finally, construction is finishing on the dela...",2018-04-05T12:00:00Z,"{'id': 'wired', 'name': 'Wired'}",Will a Huge New Flood Barrier Save Venice?,https://www.wired.com/story/will-a-huge-new-fl...,https://media.wired.com/photos/5ac569774738fe0...,wired,Wired,flood,1
4,Martha Pskowski,"“SCANDAL!!,” read the Facebook post of Mexican...",As Mexico gears up for the largest election in...,2018-06-27T14:46:30Z,"{'id': 'the-verge', 'name': 'The Verge'}",Mexico struggles to weed out fake news ahead o...,https://www.theverge.com/2018/6/27/17503444/me...,https://cdn.vox-cdn.com/thumbor/ZWrJDEF_PfP768...,the-verge,The Verge,flood,1


In [16]:
flood = df[['content', 'yes_disaster']]
flood.head()

Unnamed: 0,content,yes_disaster
0,This story originally appeared on Grist and is...,1
1,Flash floods can strike with almost no warning...,1
2,The flood underinsurance problem is arguably t...,1
3,This story originally appeared on CityLab and ...,1
4,"“SCANDAL!!,” read the Facebook post of Mexican...",1


### second sets of testing data

In [17]:
df2 = pd.read_csv('../data/e_today_1556233834.9498.csv')
df2.head()

Unnamed: 0,author,content,description,publishedAt,source,title,url,urlToImage,source_id,source_name,types,yes_disaster
0,Mallory Locklear,"If you want to avoid the fee, you'll have to s...","Last month, MoviePass CEO Mitch Lowe announced...",2018-07-05T17:39:00Z,"{'id': 'engadget', 'name': 'Engadget'}",MoviePass’ surge pricing starts today,https://www.engadget.com/2018/07/05/moviepass-...,https://o.aolcdn.com/images/dims?thumbnail=120...,engadget,Engadget,today,1
1,Jessica Conditt,"The Watchlist is now bundled into the ""My Stuf...","Hulu.com has a fresh face today, following a d...",2018-09-20T19:06:00Z,"{'id': 'engadget', 'name': 'Engadget'}",Hulu’s website looks different today,https://www.engadget.com/2018/09/20/hulu-redes...,https://o.aolcdn.com/images/dims?thumbnail=120...,engadget,Engadget,today,1
2,Shannon Liao,Tumblr’s ban on adult content is now in effect...,Tumblr’s ban on adult content is now in effect...,2018-12-17T15:00:04Z,"{'id': 'the-verge', 'name': 'The Verge'}",Tumblr porn vanishes today,https://www.theverge.com/2018/12/17/18141106/t...,https://cdn.vox-cdn.com/thumbor/O7zIGrXVdra3E5...,the-verge,The Verge,today,1
3,Josh Ocampo,Headed to the airport this morning? You might ...,Headed to the airport this morning? You might ...,2019-04-01T14:31:00Z,"{'id': None, 'name': 'Lifehacker.com'}",Check For Flight Delays Before Flying Today,https://lifehacker.com/check-for-flight-delays...,https://i.kinja-img.com/gawker-media/image/upl...,,Lifehacker.com,today,1
4,Alicia Adamczyk,It might be the most Monday (Mondayiest?) Mond...,It might be the most “Monday” (Mondayiest?) Mo...,2018-11-26T15:00:00Z,"{'id': None, 'name': 'Lifehacker.com'}",Focus on Accomplishing One Task Today,https://lifehacker.com/focus-on-accomplishing-...,https://i.kinja-img.com/gawker-media/image/upl...,,Lifehacker.com,today,1


In [18]:
today = df2[['content', 'yes_disaster']]
today.head()

Unnamed: 0,content,yes_disaster
0,"If you want to avoid the fee, you'll have to s...",1
1,"The Watchlist is now bundled into the ""My Stuf...",1
2,Tumblr’s ban on adult content is now in effect...,1
3,Headed to the airport this morning? You might ...,1
4,It might be the most Monday (Mondayiest?) Mond...,1


### Preparing Test Data 

In [19]:
tokenizer_lemmatizer(flood, 'content')

tokenizer processed: 213
lemmatizer processed: 213


In [41]:
flood.head()

Unnamed: 0,content,yes_disaster
0,this story originally appeared on grist and is...,1
1,flash flood can strike with almost no warning ...,1
2,the flood underinsurance problem is arguably t...,1
3,this story originally appeared on citylab and ...,1
4,scandal read the facebook post of mexican come...,1


In [20]:
tokenizer_lemmatizer(today, 'content')

tokenizer processed: 235
lemmatizer processed: 235


In [21]:
today.head()

Unnamed: 0,content,yes_disaster
0,if you want to avoid the fee you ll have to se...,1
1,the watchlist is now bundled into the my stuff...,1
2,tumblr s ban on adult content is now in effect...,1
3,headed to the airport this morning you might w...,1
4,it might be the most monday mondayiest monday ...,1


# Training Models

## Term Frequency–Inverse Document Frequency (TFIDF)

In [22]:
df = pd.read_csv('../data/balanced_df_tokenized_lemmatized.csv')
df['content'].fillna('', inplace=True)

In [23]:
X = df['content']
y = df['yes_disaster']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    random_state=42,
                                                    stratify=y)

**Models Metrics**

We decided to adapt NB for for model because of the interpretability and overall performance, and then applied customized stop words for the final models. 

In [25]:
knn = KNeighborsClassifier()
lr  = LogisticRegression(random_state=0, solver='lbfgs')
nb  = MultinomialNB()

In [26]:
from project4_function import model_scores

**Using the custom_stop_words_final**

In [30]:
# append the cuustomized words to the english stopwords
import pickle
custom_stop_words_final = pickle.load( open( "save.p", "rb" ) )
len(custom_stop_words_final)

2490

In [31]:
words_not_use = stopwords.words('english')
words_not_use.extend(custom_stop_words_final)

In [32]:
len(words_not_use) 

2669

In [33]:
tvec = TfidfVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = words_not_use) 

X_train_tvec_sw = tvec.fit_transform(X_train)
X_test_tvec_sw = tvec.transform(X_test)
X_evaluate_sw = tvec.transform(flood['content'])
X_evaluate2_sw = tvec.transform(today['content'])

In [34]:
models = [knn, lr, nb]

score_matrix = pd.DataFrame() 

for i in models:
    score_matrix = score_matrix.append(model_scores(i, X_train_tvec_sw, y_train, X_test_tvec_sw , y_test))
score_matrix

Unnamed: 0,model,accuracy score,cv train score,cv test score,train score,test score,train-test gap,model status,bias vs variance,fit time
0,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.753963,0.770596,0.746768,0.842181,0.753963,0.088219,overfit,high variance,0.0
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.795029,0.793056,0.789287,0.852991,0.795029,0.057962,overfit,high variance,1e-06
0,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.785663,0.786933,0.776668,0.822844,0.785663,0.037181,overfit,high variance,1e-06


# LogisticRegression: Feature Importance

For a binary classification problems this is basically the log of the estimated probability of a feature given the positive class. It means that higher values mean more important features for the positive class.


In [35]:
lr = LogisticRegression()

my_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.5, 1.0, 25],
}
gs = GridSearchCV(lr, param_grid=my_params, cv=5)
gs = gs.fit(X_train_tvec_sw, y_train)
print(gs.best_score_)

0.7937785251020898


In [39]:
#make prediction after training the model 
prediction = gs.predict(X_evaluate_sw)

In [40]:
flood['prediction_lr'] = prediction

In [41]:
today_prediction = gs.predict(X_evaluate2_sw )

In [42]:
today['prediction_lr'] = today_prediction

# MultinomialNB: Feature Importance

In [43]:
tvec_nb = TfidfVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = words_not_use) 

X_train_tvec_sw_nb = tvec_nb.fit_transform(X_train)
X_test_tvec_sw_nb = tvec_nb.transform(X_test)
X_evaluate_sw_nb = tvec_nb.transform(flood['content'])
X_evaluate2_sw_nb = tvec_nb.transform(today['content'])

In [44]:
nb = MultinomialNB()
gs_nb = nb.fit(X_train_tvec_sw_nb, y_train)

In [45]:
prediction_nb = gs_nb.predict(X_evaluate_sw_nb)

In [46]:
flood['prediction_nb'] = prediction_nb

In [49]:
today_nb_prediction = gs_nb.predict(X_evaluate2_sw_nb)

In [50]:
today['prediction_nb'] = today_nb_prediction

### Prediction Dataframe 

In [51]:
flood['title'] = df['title']
today['title'] = df2['title']

In [48]:
flood.head()

Unnamed: 0,content,yes_disaster,prediction_lr,prediction_nb,title
0,this story originally appeared on grist and is...,1,0,1,people hospitalized after missouri tourist boa...
1,flash flood can strike with almost no warning ...,1,1,1,passenger stranded after iceland s wow air col...
2,the flood underinsurance problem is arguably t...,1,0,0,mgm resort sue victim of la vega shooting s...
3,this story originally appeared on citylab and ...,1,0,1,santa fe high school multiple fatality repor...
4,scandal read the facebook post of mexican come...,1,0,0,ied blast in kashmir km from pulwama terror a...


In [52]:
today.head()

Unnamed: 0,content,yes_disaster,prediction_lr,prediction_nb,title
0,if you want to avoid the fee you ll have to se...,1,0,0,MoviePass’ surge pricing starts today
1,the watchlist is now bundled into the my stuff...,1,0,0,Hulu’s website looks different today
2,tumblr s ban on adult content is now in effect...,1,0,0,Tumblr porn vanishes today
3,headed to the airport this morning you might w...,1,0,1,Check For Flight Delays Before Flying Today
4,it might be the most monday mondayiest monday ...,1,0,0,Focus on Accomplishing One Task Today


In [53]:
today.to_csv('../data/today_prediction.csv')

In [36]:
flood.to_csv('../data/flood_prediction.csv')