In [3]:
from pymongo import MongoClient
import gensim
from gensim import corpora, models, similarities, matutils
from gensim.test.utils import datapath
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import dbscan, KMeans, mean_shift
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import random
import pickle
import datetime
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [4]:
df = pd.read_pickle('data/test_df.pkl')

wordnet_lemmatizer = WordNetLemmatizer()
text_lemmas = {}

db = MongoClient('localhost', 27017)['politician_db']
collection_names = df.collectionname

specific_stop_words = ['advertisement', 'sept','this','is','a','digitized','version',
                       'of','an','article','from','the','times','print','archive','before',
                       'the','start','online','publication','in','to','preserve','these',
                       'a','they','originally','appeared','time','doe','does','not','alter',
                       'edit','or','update','them','occasionally','digitization','process',
                       'introduces','transcription','error','other','problem','please','send',
                       'report', 'such','problem','archivefeedbacknytimescom']

chunks = [[],[],[],[],[],[]]
for i, name in enumerate(collection_names):
    for j in range(6):
        if i%6 == j:
            chunks[j].append(name)

for chunk in chunks:
    #time.sleep(0.3)
    for name in tqdm(chunk):
        #time.sleep(0.1)
        text_lemmas[name] = []
        for doc in db[name].find():
            lemmalist = []
            try:
                for word in doc['full_text'].split():
                    if word not in specific_stop_words:
                        lemmalist.append(wordnet_lemmatizer.lemmatize(word))
                text_lemmas[name].append(" ".join(lemmalist))
            except:
                pass

100%|██████████| 13/13 [00:10<00:00,  1.22it/s]
100%|██████████| 13/13 [00:03<00:00,  4.15it/s]
100%|██████████| 13/13 [00:04<00:00,  3.19it/s]
100%|██████████| 13/13 [00:09<00:00,  1.43it/s]
100%|██████████| 13/13 [00:00<00:00, 12.13it/s]
100%|██████████| 12/12 [00:04<00:00,  2.64it/s]


In [5]:
pos_lda = gensim.models.LdaModel.load('models/pos_lda.pickle')
neg_lda = gensim.models.LdaModel.load('models/neg_lda.pickle')

In [6]:
w2v = gensim.models.Word2Vec.load('models/word2vec.model')
km = pickle.load(open('models/kmeans.pkl', 'rb'))
vs = SentimentIntensityAnalyzer()

In [7]:
def get_w2v_cluster(doc):
    vec = np.zeros(150)
    length = 0
    for word in doc.split():
        try:
            vec += w2v[word]
            length += 1
        except:
            pass
    doc_vec = vec/length
    return km.predict(doc_vec.reshape(1,-1))[0]

In [8]:
def get_topics(doc, lda):
    count_vectorizer = CountVectorizer(ngram_range=(1, 2),  
                                       stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")
    count_vectorizer.fit([doc])
    counts = count_vectorizer.transform([doc]).transpose()
    corpus = matutils.Sparse2Corpus(counts)
    topic_spread = list(lda.get_document_topics(corpus))
    topics = np.zeros(30)
    for i,doc in enumerate(topic_spread[0]):
        topics[doc[0]] = doc[1]
    return topics

In [9]:
def pub_date_to_datetime(s):
    s = s[:10]
    l = s.split('-')
    d = datetime.date(year=int(l[0]),month=int(l[1]),day=int(l[2]))
    return d

In [10]:
def build_vector(politician, doc):
    df_row = df[df.collectionname == politician].drop(['Exited Office', 'collectionname', 'label'],axis=1)
    df_row = df_row.reset_index()
    vec = []
    try:
        vec.append(df_row['Entered Office'][0])
        vec.append(df_row.Name[0])
        vec.append(df_row.Party[0])
        vec.append(df_row.President[0])
        vec.append(df_row.Role[0])
    except:
        print('error in df extraction')
    try:
        for pos_topic in get_topics(doc, pos_lda):
            vec.append(pos_topic)
        for neg_topic in get_topics(doc, neg_lda):
            vec.append(neg_topic)
    except:
        print('error in topic modeling')
    vec.append(get_w2v_cluster(doc))
    vec.append(vs.polarity_scores(doc)['compound'])
    return vec

In [11]:
container = []
for politician in collection_names:
    print(politician)
    dates = []
    for doc in db[politician].find():
        dates.append(pub_date_to_datetime(doc['pub_date']))
    for doc in db[politician].find():
        try:
            vec = build_vector(politician, doc['full_text'])
            current_date = pub_date_to_datetime(doc['pub_date'])
            vec.append(current_date)
            freq_count = 0
            for date in dates:
                if current_date - date < datetime.timedelta(30) and current_date - date > datetime.timedelta(0):
                    freq_count += 1
            vec.append(freq_count)
            container.append(vec)
        except:
            pass

albertogonzales
kathleensebelius
scottpruitt
tomprice
hannibalhamlin
fdavidmathews
francisbiddle
brentscowcroft
harrystruman
jamesfbyrnes
waltermondale
jimmycarter
cyrusvance
ebenezerrhoar
joebiden
janetnapolitano
error in topic modeling
jamesvforrestal
josephacalifanojr
griffinbell
benjaminhbristow
billdaleyborn
patriciarobertsharris
johnsherman
susanrice
malcolmbaldrige
ovetaculphobby
williamfrenchsmith
raymondjdonovan
edhugler
jimmattis
error in topic modeling
error in topic modeling
error in topic modeling
williamprogers
thomasfbayard
jeffsessions
benjaminharrison
charleswfoster
elaineduke
lyndonbjohnson
donjwright
anndoremclaughlin
alexazar
josephmckenna
deanrusk
johnrbolton
samuelkskinner
alansboyd
robertbacon
franklinmacveagh
lloydbentsen
anthonylake
jamescmcreynolds
williambwilson
williamgmcadoo
federicopea
roberthfinch
robertreich
melvinrlaird
leonpanettaborn
calvincoolidge
warrengharding
andrewmcuomo
rodneyeslater
jamesjdavis
charlesgdawes
dickcheney
williamesimon
casparwwein

In [12]:
X_raw = pd.DataFrame(container)

X_raw

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59,60,61,62,63,64,65,66,67,68
0,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.030361,0.035776,0.020756,0.032054,0.083589,...,0.0,0.0,0.0,0.0,0.0,0.0,12,0.9798,2007-03-14,7
1,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.035611,0.032437,0.014364,0.036164,0.101557,...,0.0,0.0,0.0,0.0,0.0,0.0,4,0.9989,2007-03-18,20
2,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.020474,0.022648,0.023567,0.027367,0.093895,...,0.0,0.0,0.0,0.0,0.0,0.0,9,-0.9997,2007-09-09,31
3,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029846,0.036838,0.026381,0.015100,0.078399,...,0.0,0.0,0.0,0.0,0.0,0.0,9,0.9977,2007-08-27,16
4,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.027691,0.024990,0.022792,0.024741,0.075789,...,0.0,0.0,0.0,0.0,0.0,0.0,12,0.9973,2007-09-14,31
5,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.015365,0.000000,0.014762,0.057619,0.031154,...,0.0,0.0,0.0,0.0,0.0,0.0,9,-0.0258,2007-08-27,16
6,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029346,0.026211,0.020045,0.024589,0.086151,...,0.0,0.0,0.0,0.0,0.0,0.0,9,0.9991,2007-08-27,16
7,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.026485,0.031677,0.019999,0.021126,0.074035,...,0.0,0.0,0.0,0.0,0.0,0.0,7,0.9998,2007-08-28,24
8,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.024313,0.030679,0.019852,0.029046,0.067449,...,0.0,0.0,0.0,0.0,0.0,0.0,9,0.9963,2007-08-27,16
9,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.031559,0.035973,0.019029,0.031720,0.074275,...,0.0,0.0,0.0,0.0,0.0,0.0,12,0.9971,2007-08-30,28


In [13]:
party_dummies = pd.get_dummies(X_raw[2])
pres_dummies = pd.get_dummies(X_raw[3])
role_dummies = pd.get_dummies(X_raw[4])

In [14]:
merged = pd.merge(X_raw, party_dummies, left_index=True, right_index=True)
merged = pd.merge(merged,pres_dummies,left_index=True, right_index=True)
merged = pd.merge(merged,role_dummies,left_index=True, right_index=True)
merged

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,President,Secretary of Commerce,Secretary of Defense,Secretary of HHS,Secretary of HUD,Secretary of Homeland Security,Secretary of Labor,Secretary of Transportation,Secretary of the Treasury,Vice President
0,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.030361,0.035776,0.020756,0.032054,0.083589,...,0,0,0,0,0,0,0,0,0,0
1,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.035611,0.032437,0.014364,0.036164,0.101557,...,0,0,0,0,0,0,0,0,0,0
2,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.020474,0.022648,0.023567,0.027367,0.093895,...,0,0,0,0,0,0,0,0,0,0
3,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029846,0.036838,0.026381,0.015100,0.078399,...,0,0,0,0,0,0,0,0,0,0
4,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.027691,0.024990,0.022792,0.024741,0.075789,...,0,0,0,0,0,0,0,0,0,0
5,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.015365,0.000000,0.014762,0.057619,0.031154,...,0,0,0,0,0,0,0,0,0,0
6,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029346,0.026211,0.020045,0.024589,0.086151,...,0,0,0,0,0,0,0,0,0,0
7,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.026485,0.031677,0.019999,0.021126,0.074035,...,0,0,0,0,0,0,0,0,0,0
8,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.024313,0.030679,0.019852,0.029046,0.067449,...,0,0,0,0,0,0,0,0,0,0
9,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.031559,0.035973,0.019029,0.031720,0.074275,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X_test = merged.drop([1,2,3,4], axis=1)

In [16]:
###  Generating y_test  ###
resigned = list(df.Name[df.label==1])
y_train = merged[1].apply(lambda x: 1 if x in resigned else 0)
y_train

0       1
1       1
2       1
3       1
4       1
5       1
6       1
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
       ..
9311    0
9312    0
9313    0
9314    0
9315    0
9316    0
9317    0
9318    0
9319    0
9320    0
9321    0
9322    0
9323    0
9324    0
9325    0
9326    0
9327    0
9328    0
9329    0
9330    0
9331    0
9332    0
9333    0
9334    0
9335    0
9336    0
9337    0
9338    0
9339    0
9340    0
Name: 1, Length: 9341, dtype: int64

In [17]:
testing_merge = pd.merge(merged, df, left_on=[0,1,4], right_on=['Entered Office','Name','Role'])
testing_merge

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Secretary of the Treasury,Vice President,Entered Office,Exited Office,Name,Party,President_y,Role,collectionname,label
0,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.030361,0.035776,0.020756,0.032054,0.083589,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
1,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.035611,0.032437,0.014364,0.036164,0.101557,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
2,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.020474,0.022648,0.023567,0.027367,0.093895,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
3,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029846,0.036838,0.026381,0.015100,0.078399,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
4,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.027691,0.024990,0.022792,0.024741,0.075789,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
5,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.015365,0.000000,0.014762,0.057619,0.031154,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
6,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029346,0.026211,0.020045,0.024589,0.086151,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
7,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.026485,0.031677,0.019999,0.021126,0.074035,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
8,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.024313,0.030679,0.019852,0.029046,0.067449,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1
9,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.031559,0.035973,0.019029,0.031720,0.074275,...,0,0,2005-02-03,2007-09-17 00:00:00,alberto gonzales,Republican,George W. Bush,Attorney General,albertogonzales,1


In [23]:
for item in testing_merge['Exited Office']:
    print(type(item))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.tim

<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.dat

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.dat

<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.dat

In [25]:
testing_merge['Edited Exited Office'] = testing_merge['Exited Office'].apply(lambda x: x.date() if type(x) == datetime.datetime or type(x) == pd._libs.tslibs.timestamps.Timestamp else datetime.date(2020,1,1))
testing_merge['article_exit_delta'] = testing_merge['Edited Exited Office']-testing_merge[67]
testing_merge['within_month'] = testing_merge.article_exit_delta.apply(lambda x: 'Within One Month' if x<datetime.timedelta(30) else 'In Office')
testing_merge['within_week'] = testing_merge.article_exit_delta.apply(lambda x: 'Within One Week' if x<datetime.timedelta(8) else 'In Office')
testing_merge

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Party,President_y,Role,collectionname,label,Edited Exited Office,article_exit_delta,within_month,within_week,Month_Label
0,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.030361,0.035776,0.020756,0.032054,0.083589,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,187 days,In Office,In Office,0
1,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.035611,0.032437,0.014364,0.036164,0.101557,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,183 days,In Office,In Office,0
2,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.020474,0.022648,0.023567,0.027367,0.093895,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,8 days,Within One Month,In Office,0
3,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029846,0.036838,0.026381,0.015100,0.078399,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,0
4,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.027691,0.024990,0.022792,0.024741,0.075789,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,3 days,Within One Month,Within One Week,0
5,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.015365,0.000000,0.014762,0.057619,0.031154,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,0
6,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029346,0.026211,0.020045,0.024589,0.086151,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,0
7,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.026485,0.031677,0.019999,0.021126,0.074035,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,20 days,Within One Month,In Office,0
8,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.024313,0.030679,0.019852,0.029046,0.067449,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,0
9,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.031559,0.035973,0.019029,0.031720,0.074275,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,18 days,Within One Month,In Office,0


In [26]:
y_month = testing_merge.within_month
y_within_month_res = []
for i in range(len(list(testing_merge.article_exit_delta))):
    if list(testing_merge.label)[i] == 1 and list(testing_merge.within_month)[i] == 'Within One Month':
        y_within_month_res.append(1)
    else:
        y_within_month_res.append(0)

In [27]:
testing_merge['Month_Label'] = y_within_month_res

In [28]:
testing_merge['Month_Label'].mean()

0.010063162402312387

In [29]:
testing_merge

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Party,President_y,Role,collectionname,label,Edited Exited Office,article_exit_delta,within_month,within_week,Month_Label
0,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.030361,0.035776,0.020756,0.032054,0.083589,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,187 days,In Office,In Office,0
1,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.035611,0.032437,0.014364,0.036164,0.101557,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,183 days,In Office,In Office,0
2,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.020474,0.022648,0.023567,0.027367,0.093895,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,8 days,Within One Month,In Office,1
3,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029846,0.036838,0.026381,0.015100,0.078399,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,1
4,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.027691,0.024990,0.022792,0.024741,0.075789,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,3 days,Within One Month,Within One Week,1
5,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.015365,0.000000,0.014762,0.057619,0.031154,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,1
6,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.029346,0.026211,0.020045,0.024589,0.086151,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,1
7,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.026485,0.031677,0.019999,0.021126,0.074035,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,20 days,Within One Month,In Office,1
8,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.024313,0.030679,0.019852,0.029046,0.067449,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,21 days,Within One Month,In Office,1
9,2005-02-03,alberto gonzales,Republican,George W. Bush,Attorney General,0.031559,0.035973,0.019029,0.031720,0.074275,...,Republican,George W. Bush,Attorney General,albertogonzales,1,2007-09-17,18 days,Within One Month,In Office,1


In [30]:
y_test = testing_merge.Month_Label

In [31]:
X_test = merged.drop([1,2,3,4], axis=1)

In [32]:
X_test

Unnamed: 0,0,5,6,7,8,9,10,11,12,13,...,President,Secretary of Commerce,Secretary of Defense,Secretary of HHS,Secretary of HUD,Secretary of Homeland Security,Secretary of Labor,Secretary of Transportation,Secretary of the Treasury,Vice President
0,2005-02-03,0.030361,0.035776,0.020756,0.032054,0.083589,0.032914,0.000000,0.021336,0.000000,...,0,0,0,0,0,0,0,0,0,0
1,2005-02-03,0.035611,0.032437,0.014364,0.036164,0.101557,0.039266,0.010129,0.011040,0.013592,...,0,0,0,0,0,0,0,0,0,0
2,2005-02-03,0.020474,0.022648,0.023567,0.027367,0.093895,0.039139,0.012631,0.032093,0.012392,...,0,0,0,0,0,0,0,0,0,0
3,2005-02-03,0.029846,0.036838,0.026381,0.015100,0.078399,0.039081,0.000000,0.030049,0.024499,...,0,0,0,0,0,0,0,0,0,0
4,2005-02-03,0.027691,0.024990,0.022792,0.024741,0.075789,0.027616,0.014314,0.032439,0.012235,...,0,0,0,0,0,0,0,0,0,0
5,2005-02-03,0.015365,0.000000,0.014762,0.057619,0.031154,0.043359,0.029048,0.000000,0.014762,...,0,0,0,0,0,0,0,0,0,0
6,2005-02-03,0.029346,0.026211,0.020045,0.024589,0.086151,0.044669,0.012050,0.025979,0.011564,...,0,0,0,0,0,0,0,0,0,0
7,2005-02-03,0.026485,0.031677,0.019999,0.021126,0.074035,0.020849,0.011684,0.036159,0.014245,...,0,0,0,0,0,0,0,0,0,0
8,2005-02-03,0.024313,0.030679,0.019852,0.029046,0.067449,0.021711,0.011808,0.031621,0.012448,...,0,0,0,0,0,0,0,0,0,0
9,2005-02-03,0.031559,0.035973,0.019029,0.031720,0.074275,0.025826,0.011925,0.019154,0.011689,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# pd.to_pickle(X_test, 'data/X_test.pkl')
# pd.to_pickle(y_test, 'data/y_test.pkl')

In [45]:
testing_merge[testing_merge.Month_Label==1][1].unique()

array(['alberto gonzales', 'kathleen sebelius', 'scott pruitt',
       'tom price'], dtype=object)