In [1]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from time import time
from feature_generators.prediction_features_generator import *
from feature_generators.helpers import *
import feature_generators.ngram as ngram
import warnings
warnings.filterwarnings("ignore")

GloVe model loaded!


In [2]:
# Restore the trained booster that was pickled to disk.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
with open('saved_data/xgb_model.pkl', 'rb') as model_file:
    xgb_mod = pickle.load(model_file)

In [3]:
def process(data):
    """
    Runs preprocess_data over the "Headline" and "articleBody" columns to
    obtain unigrams, then derives bigrams and trigrams from those unigrams.

    Each result is stored in its own column ("<field>_unigram",
    "<field>_bigram", "<field>_trigram"). The columns are written onto the
    DataFrame that was passed in, so the caller's frame is modified in place.

    Input: Dataframe with "Headline" and "articleBody" columns

    Returns the same Dataframe with the six n-gram columns added
    """

    print ('>>> Data shape: ', data.shape)

    started = time()
    print("---Generating n-grams Features!---")

    text_columns = ("Headline", "articleBody")
    separator = "_"  # glue between the tokens of a bigram / trigram

    print ("Generating unigram")
    for column in text_columns:
        data[column + "_unigram"] = data[column].map(preprocess_data)

    print ("Generating bigram")
    for column in text_columns:
        data[column + "_bigram"] = data[column + "_unigram"].map(
            lambda tokens: ngram.getBigram(tokens, separator)
        )

    print ("Generating trigram")
    for column in text_columns:
        data[column + "_trigram"] = data[column + "_unigram"].map(
            lambda tokens: ngram.getTrigram(tokens, separator)
        )

    print("---n-gram Features generated---")
    print("Time taken: {} seconds\n".format(time() - started))

    return data


In [4]:
def build_test_data(data):
    """
    Builds the feature matrix used to predict the class of the news.

    The frame is first passed through process() (which adds the n-gram
    columns), then handed to every feature generator in turn. Whatever each
    generator yields is collected and stacked horizontally.

    Input: DataFrame
    Returns Array of generated features.
    """

    prepared = process(data)

    feature_generators = (
        CountFeatureGenerator,
        TfidfFeatureGenerator,
        Word2VecFeatureGenerator,
        SentimentFeatureGenerator,
        ReadabilityFeatureGenerator,
        # Add more generators
    )

    # Call the generators one by one and gather every feature block they yield
    features = []
    for make_features in feature_generators:
        features.extend(make_features(prepared))
    print("Total number of raw features: {}".format(len(features)))

    # Stack the blocks side by side into a single matrix
    return np.hstack(features)

In [5]:
def check(news):
    """
    Predicts the most probable class of every news item together with the
    probabilities of it being reliable / unreliable.

    Side effects:
      * the generated feature matrix is pickled to 'tmp/feature_vector'
        (the directory is created if missing) for the quick-debugging cell;
      * `news` is modified in place: build_test_data()/process() add their
        columns to it and this function adds 'preds', 'Reliable' and
        'Unreliable'.

    Input: DataFrame of news: Headline and Article body only
    Returns the same DataFrame with class predictions
    """
    import os  # local import so the cell does not depend on the import cell

    # The saved booster emits 4 probabilities per row (the reshape below only
    # succeeds in that case), but only the first two classes are named here.
    N_MODEL_CLASSES = 4
    LABELS = ['reliable', 'unreliable']

    test_x = build_test_data(news)

    # Save feature_vector for quick debugging
    os.makedirs('tmp', exist_ok=True)
    with open('tmp/feature_vector', 'wb') as fv:
        pickle.dump(test_x, fv)

    dtest = xgb.DMatrix(test_x)
    # num_col() is used instead of len(feature_names) because feature_names
    # may be None for a matrix built from a plain array.
    print("Total Feature count in the test set: ", dtest.num_col())

    # Use Booster to predict class probabilities, one row per news item
    pred_prob_y = xgb_mod.predict(dtest).reshape(test_x.shape[0], N_MODEL_CLASSES)

    # Pick the winner among the labelled classes only; taking argmax over all
    # four columns could yield index 2 or 3 and crash the LABELS lookup.
    pred_y = np.argmax(pred_prob_y[:, :len(LABELS)], axis=1)
    print ('pred_y.shape: ', pred_y.shape)

    news['preds'] = [LABELS[int(a)] for a in pred_y]
    news['Reliable'] = pred_prob_y[:, 0]
    news['Unreliable'] = pred_prob_y[:, 1]

    return news

In [9]:
# Read the cleaned Kaggle set, drop the two unused columns and rename the
# remaining ones to the names process() looks up ("Headline", "articleBody").
data = pd.read_csv("datasets/kaggle_clean.csv", encoding='utf-8').drop(columns=['type', 'length'])
data.columns = ['Headline', 'articleBody', 'target']

# Force both text columns to str before they are mapped through preprocess_data
for text_column in ('Headline', 'articleBody'):
    data[text_column] = data[text_column].astype(str)

data.head()

Unnamed: 0,Headline,articleBody,target
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Why the Truth Might Get You Fired October numb...,1
3,number Civilians Killed In Single US Airstrike...,Videos number Civilians Killed In Single US Ai...,1
4,Life: Life Of Luxury: Elton John’s number Favo...,Ever wonder how Britain’s most iconic pop pian...,1


In [11]:
# check() writes its feature and prediction columns onto the frame it is given,
# so hand it a copy: `data` stays the raw 3-column frame shown above and this
# cell can be re-run without operating on an already-augmented frame.
df_output = check(data.copy())

>>> Data shape:  (261, 3)
---Generating n-grams Features!---
Generating unigram
Generating bigram
Generating trigram
---n-gram Features generated---
Time taken: 1.2594358921051025 seconds


---Generating Counting Features:---
xBasicCounts.shape:  (261, 41)
---Counting Features is complete---
Time taken 0.4412424564361572 seconds


---Generating TFIDF Features:---
xHeadlineTfidf.shape: (261, 859153)
xBodyTfidf.shape:  (261, 859153)
simTfidf.shape:  (261, 1)
---TFIDF Features is complete---
Time taken 4.514606714248657 seconds


---Generating Word2Vector Features:---
headlineVec.shape:  (261, 50)
bodyVec.shape:  (261, 50)
simVec.shape:  (261, 1)
---Word2Vector Features is complete---
Time taken 0.4970223903656006 seconds


---Generating Sentiment Features:---
headlineSenti.shape:  (261, 4)
bodySenti.shape:  (261, 4)
---Sentiment Features is complete---
Time taken 1.908996820449829 seconds


---Generating Readability Features:---
xReadable.shape:  (261, 12)
---Readability Features is comp

Unnamed: 0,Headline,articleBody,target,Headline_unigram,articleBody_unigram,Headline_bigram,articleBody_bigram,Headline_trigram,articleBody_trigram,count_of_Headline_unigram,...,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,i_me_myself,punct,lexical_diversity,preds,Reliable,Unreliable
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,"[hous, dem, aid, even, see, comey, letter, jas...","[hous, dem, aid, even, see, comey, letter, jas...","[hous_dem, dem_aid, aid_even, even_see, see_co...","[hous_dem, dem_aid, aid_even, even_see, see_co...","[hous_dem_aid, dem_aid_even, aid_even_see, eve...","[hous_dem_aid, dem_aid_even, aid_even_see, eve...",10,...,7.63,114,15.250000,18.24,1,51,61.295082,unreliable,0.001410,0.998587
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,"[flynn, hillari, clinton, big, woman, campus, ...","[ever, get, feel, life, circl, roundabout, rat...","[flynn_hillari, hillari_clinton, clinton_big, ...","[ever_get, get_feel, feel_life, life_circl, ci...","[flynn_hillari_clinton, hillari_clinton_big, c...","[ever_get_feel, get_feel_life, feel_life_circl...",7,...,8.14,100,17.000000,22.13,4,56,59.679245,reliable,0.983399,0.016591
2,Why the Truth Might Get You Fired,Why the Truth Might Get You Fired October numb...,1,"[whi, truth, might, get, fire]","[whi, truth, might, get, fire, octob, number, ...","[whi_truth, truth_might, might_get, get_fire]","[whi_truth, truth_might, might_get, get_fire, ...","[whi_truth_might, truth_might_get, might_get_f...","[whi_truth_might, truth_might_get, might_get_f...",5,...,7.89,70,10.833333,18.77,0,29,39.980769,unreliable,0.066568,0.933418
3,number Civilians Killed In Single US Airstrike...,Videos number Civilians Killed In Single US Ai...,1,"[number, civilian, kill, singl, us, airstrik, ...","[video, number, civilian, kill, singl, us, air...","[number_civilian, civilian_kill, kill_singl, s...","[video_number, number_civilian, civilian_kill,...","[number_civilian_kill, civilian_kill_singl, ki...","[video_number_civilian, number_civilian_kill, ...",7,...,7.05,64,18.666667,17.16,1,31,51.725490,unreliable,0.000082,0.999918
4,Life: Life Of Luxury: Elton John’s number Favo...,Ever wonder how Britain’s most iconic pop pian...,1,"[life, life, luxuri, elton, john, number, favo...","[ever, wonder, britain, icon, pop, pianist, ge...","[life_life, life_luxuri, luxuri_elton, elton_j...","[ever_wonder, wonder_britain, britain_icon, ic...","[life_life_luxuri, life_luxuri_elton, luxuri_e...","[ever_wonder_britain, wonder_britain_icon, bri...",14,...,7.14,69,13.500000,17.92,0,36,51.388889,unreliable,0.052874,0.947103
5,Benoît Hamon Wins French Socialist Party’s Pre...,"PARIS — France chose an idealistic, traditiona...",0,"[benoît, hamon, win, french, socialist, parti,...","[pari, franc, chose, idealist, tradit, candid,...","[benoît_hamon, hamon_win, win_french, french_s...","[pari_franc, franc_chose, chose_idealist, idea...","[benoît_hamon_win, hamon_win_french, win_frenc...","[pari_franc_chose, franc_chose_idealist, chose...",11,...,7.95,165,15.500000,19.18,1,87,86.578947,reliable,0.966803,0.033180
6,Obama’s Organizing for Action Partners with So...,"Organizing for Action, the activist group that...",0,"[obama, organ, action, partner, soro, link, in...","[organ, action, activist, group, morph, barack...","[obama_organ, organ_action, action_partner, pa...","[organ_action, action_activist, activist_group...","[obama_organ_action, organ_action_partner, act...","[organ_action_activist, action_activist_group,...",10,...,8.43,120,60.000000,21.16,0,53,63.690909,unreliable,0.359273,0.640692
7,Russian Researchers Discover Secret Nazi Milit...,The mystery surrounding The Third Reich and Na...,1,"[russian, research, discov, secret, nazi, mili...","[mysteri, surround, third, reich, nazi, german...","[russian_research, research_discov, discov_sec...","[mysteri_surround, surround_third, third_reich...","[russian_research_discov, research_discov_secr...","[mysteri_surround_third, surround_third_reich,...",10,...,8.70,112,10.333333,20.98,0,53,51.035714,unreliable,0.084424,0.915548
8,US Officials See No Link Between Trump and Russia,Clinton Campaign Demands FBI Affirm Trump's Ru...,1,"[us, offici, see, link, trump, russia]","[clinton, campaign, demand, fbi, affirm, trump...","[us_offici, offici_see, see_link, link_trump, ...","[clinton_campaign, campaign_demand, demand_fbi...","[us_offici_see, offici_see_link, see_link_trum...","[clinton_campaign_demand, campaign_demand_fbi,...",6,...,8.84,58,22.666667,23.80,0,17,34.847826,unreliable,0.037225,0.962745
9,"Re: Yes, There Are Paid Government Trolls On S...","Yes, There Are Paid Government Trolls On Socia...",1,"[yes, paid, govern, troll, social, media, blog...","[yes, paid, govern, troll, social, media, blog...","[yes_paid, paid_govern, govern_troll, troll_so...","[yes_paid, paid_govern, govern_troll, troll_so...","[yes_paid_govern, paid_govern_troll, govern_tr...","[yes_paid_govern, paid_govern_troll, govern_tr...",9,...,9.13,76,16.750000,24.13,1,32,34.145455,unreliable,0.000162,0.999836


In [12]:
# Keep only the ground-truth column ('target') and the predicted label ('preds').
df_needed = df_output[['target', 'preds']]

In [13]:
# Write the target/prediction pairs to disk without the DataFrame index.
df_needed.to_csv('datasets/kaggle_clean_preds.csv', index=False)

In [34]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Draw the importance of each feature in the model on a 20x30 inch canvas.
fig, ax = plt.subplots(figsize=(20, 30))
important_feats = xgb.plot_importance(xgb_mod, height=0.2, ax=ax)

In [None]:
# Save the figure behind the object returned by xgb.plot_importance as a 300 dpi PNG.
important_feats.figure.savefig('results/important_features.png', format='png', dpi=300)

In [42]:
# Make a decision-tree graph from the booster. No tree index is passed, so
# xgboost's default tree is used. Note that this rebinds `fig`, which the
# feature-importance cell used for the matplotlib figure.
fig = xgb.to_graphviz(xgb_mod)

In [43]:
# graphviz appends the format extension to `filename`, so pass the path without
# '.png'; otherwise the image lands at 'results/decision_tree.png.png' and the
# DOT source is written to 'results/decision_tree.png'. cleanup=True removes the
# intermediate DOT source once the PNG has been rendered.
fig.render('results/decision_tree', format='png', cleanup=True)

'graph_.png'

## For quick debugging.

In [18]:
# Self-contained debugging cell: it re-imports everything it needs so it can be
# run on a fresh kernel, using the feature matrix that check() pickled earlier.
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('saved_data/xgb_model.pkl', 'rb') as mod:
    xgb_mod = pickle.load(mod)

with open('tmp/feature_vector', 'rb') as fv:
    test_x = pickle.load(fv)

dtest = xgb.DMatrix(test_x)
# num_col() is used instead of len(feature_names) because feature_names may be
# None for a matrix built from a plain array.
print("Total Feature count in the test set: ", dtest.num_col())

# The saved booster emits 4 probabilities per row (the reshape below only
# succeeds in that case), but only the first two classes are named here.
N_MODEL_CLASSES = 4
LABELS = ['reliable', 'unreliable']

# Use Booster to predict class probabilities, one row per sample
pred_prob_y = xgb_mod.predict(dtest).reshape(test_x.shape[0], N_MODEL_CLASSES)

# Pick the winner among the labelled classes only; taking argmax over all four
# columns could yield index 2 or 3 and crash the LABELS lookup.
pred_y = np.argmax(pred_prob_y[:, :len(LABELS)], axis=1)
print ('pred_y.shape: ', pred_y.shape)

# No `news` frame exists at notebook level (it is only a parameter of check()),
# so collect the predictions in a fresh frame instead of relying on a variable
# left over in the kernel.
news = pd.DataFrame({
    'preds': [LABELS[int(a)] for a in pred_y],
    'Reliable': pred_prob_y[:, 0],
    'Unreliable': pred_prob_y[:, 1],
})


Total Feature count in the test set:  163
Total Feature count in the test set:  163
pred_y.shape:  (100,)
