In [1]:
import treeUtil
import pickle
import pandas as pd
import os
# import matplotlib.pyplot as plt
# plt.style.use('ggplot')

In [2]:
import pickle
with open('../data/filteredConvote.pickle', 'rb') as f:
    convote = pickle.load(f)
with open('../data/filteredIBC.pickle', 'rb') as f:
    ibc = pickle.load(f)
df = convote.append(ibc)

In [3]:
convote.shape

(27562, 2)

In [4]:
ibc.shape

(22621, 2)

In [5]:
df = convote.append(ibc)
df.shape

(50183, 2)

# Load data

In [2]:
!ls -l ~/data

total 787088
-rw-rw-r-- 1 alex alex 203539364 Aug 20  2017 articles1.csv
-rw-rw-r-- 1 alex alex 225757056 Aug 20  2017 articles2.csv
-rw-rw-r-- 1 alex alex 240344348 Aug 20  2017 articles3.csv
-rw-r--r-- 1 alex alex  90963694 Jun 30 13:34 filtered_sentences.pickle
-rw------- 1 alex alex  45357539 Apr 26  2014 ibcData.pkl


In [3]:
os.listdir('../data')

['ibcData.pkl',
 'articles2.csv',
 'conGrams.pickle',
 'conCons.pickle',
 'libGrams.pickle',
 'convote_v1.1.tar.gz',
 'articles3.csv',
 'filteredIBC.pickle',
 'libCons.pickle',
 'filtered_sentences.pickle',
 'articles1.csv',
 'convote_v1.1',
 'filteredConvote.pickle']

In [6]:
with open(r'../data/ibcData.pkl', 'rb') as f:
    [lib, con, neutral] = pickle.load(f)

In [5]:
print(lib[0].get_words())

Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .


In [6]:
print(con[0].get_words())

Gore is getting rich from environmentalism , not just by being paid a whopping $ 175,000 per speech but by using political pressure to force government policy in a direction that benefits his business interests .


In [7]:
print(neutral[0].get_words())

In this country , the beneficiaries of Apple 's success are , first , the designers , who have done wonders working with Steve Jobs to produce products that are beautiful and effective .


In [8]:
type(lib[0]) == treeUtil.nodeObj

True

# Figure out how to work with custom data types

In [9]:
tree = lib[0]
for node in tree:
    if isinstance(node, treeUtil.nodeObj):
        if hasattr(node, 'label'):
            print(node.pos,':', node.label, ':', node.get_words())
        else:
            print(node.pos,':', 'No Label', ':', node.get_words())

ROOT+S : Liberal : Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
S+VP : No Label : Forcing middle-class workers to bear a greater share of the cost of government
VP|<NP-S> : No Label : middle-class workers to bear a greater share of the cost of government
NP : No Label : middle-class workers
S+VP : No Label : to bear a greater share of the cost of government
VP : No Label : bear a greater share of the cost of government
NP : No Label : a greater share of the cost of government
NP : No Label : a greater share
NP|<JJR-NN> : No Label : greater share
PP : No Label : of the cost of government
NP : No Label : the cost of government
NP : No Label : the cost
PP : No Label : of government
S|<VP-.> : No Label : weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
VP : Liberal : w

In [10]:
for leaf in tree.get_leaves():
    leaf.print_leaf()

Forcing :  VBG
middle-class :  NN
workers :  NNS
to :  TO
bear :  VB
a :  DT
greater :  JJR
share :  NN
of :  IN
the :  DT
cost :  NN
of :  IN
government :  NP+NN
weakens :  VBZ
their :  PRP$
support :  NN
for :  IN
needed :  JJ
investments :  NNS
and :  CC
stirs :  VBZ
resentment :  NP+NN
toward :  IN
those :  NP+DT
who :  WHNP+WP
depend :  VBP
on :  IN
public :  JJ
services :  NNS
the :  DT
most :  JJS
. :  .


In [11]:
t = lib[0]
for node in t:
    if isinstance(node, treeUtil.nodeObj):
        if hasattr(node, 'label'):
            label = node.label
        else:
            label = None
        print(label, ':', node.get_words())

Liberal : Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
None : Forcing middle-class workers to bear a greater share of the cost of government
None : middle-class workers to bear a greater share of the cost of government
None : middle-class workers
None : to bear a greater share of the cost of government
None : bear a greater share of the cost of government
None : a greater share of the cost of government
None : a greater share
None : greater share
None : of the cost of government
None : the cost of government
None : the cost
None : of government
None : weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
Liberal : weakens their support for needed investments and stirs resentment toward those who depend on public services the most
None : weakens their support for needed 

# Pull data into individual DataFrames

In [7]:
def readNodes(tree, root_id):
    treeList = []
    for node in tree:
        if isinstance(node, treeUtil.nodeObj):
            if hasattr(node, 'label'):
                label = node.label.lower()
            else:
                label = None
            is_root = True if 'root' in node.pos.lower() else False
            text = node.get_words()
            treeList.append([text, label, node.pos, is_root, root_id])
    return treeList

In [8]:
def trees_to_df(trees):
    cols = [
        'text','label','position','is_root','root_id',
    ]
    treeList = []
    for root_id, tree in enumerate(trees):
        t = readNodes(tree, root_id)
        treeList.extend(t)
    df = pd.DataFrame(treeList, columns=cols)
    
    return df

## original individual DFs

In [14]:
libDf = trees_to_df(lib)
libDf.head()

Unnamed: 0,text,label,position,is_root,root_id
0,Forcing middle-class workers to bear a greater...,liberal,ROOT+S,True,0
1,Forcing middle-class workers to bear a greater...,,S+VP,False,0
2,middle-class workers to bear a greater share o...,,VP|<NP-S>,False,0
3,middle-class workers,,NP,False,0
4,to bear a greater share of the cost of government,,S+VP,False,0


In [15]:
libDf.loc[libDf.label != None].shape

(84345, 5)

In [16]:
conDf = trees_to_df(con)
conDf.head()

Unnamed: 0,text,label,position,is_root,root_id
0,"Gore is getting rich from environmentalism , n...",conservative,ROOT+S,True,0
1,"is getting rich from environmentalism , not ju...",,S|<VP-.>,False,0
2,"is getting rich from environmentalism , not ju...",conservative,VP,False,0
3,"getting rich from environmentalism , not just ...",conservative,VP,False,0
4,"getting rich from environmentalism , not just ...",conservative,VP,False,0


In [17]:
conDf.loc[conDf.label != None].shape

(69252, 5)

In [18]:
neutDf = trees_to_df(neutral)
neutDf.head()

Unnamed: 0,text,label,position,is_root,root_id
0,"In this country , the beneficiaries of Apple '...",neutral,ROOT+S,True,0
1,In this country,,PP,False,0
2,this country,,NP,False,0
3,", the beneficiaries of Apple 's success are , ...",,"S|<,-NP-VP-.>",False,0
4,"the beneficiaries of Apple 's success are , fi...",,S|<NP-VP-.>,False,0


In [19]:
neutDf.loc[neutDf.label != None].shape

(22554, 5)

# Single df with all data

In [9]:
def singleDf(lib, con, neutral):
    cols = [
        'text','label','position','is_root','root_id',
    ]
    treeList = []
    root_id = 0
    for trees in [lib, con, neutral]:
        for tree in trees:
            t = readNodes(tree, root_id)
            treeList.extend(t)
            root_id += 1
    df = pd.DataFrame(treeList, columns=cols)
    
    return df

In [10]:
df = singleDf(lib, con, neutral)
df.shape

(176151, 5)

In [22]:
df.sample(5)

Unnamed: 0,text,label,position,is_root,root_id
35658,the Hebrew Bible evident in the New Testament ...,,NP,False,853
19932,not hesitate to voice his opinion about his Re...,,VP|<RB-VP>,False,478
23265,", Rendell has overseen the first state-level i...",,"S|<,-NP-VP>",False,554
107091,can benevolently pick the right winners and pu...,,VP,False,2589
141939,ecological damage,,NP|<JJ-NN>,False,3440


In [11]:
df.groupby('label').count()

Unnamed: 0_level_0,text,position,is_root,root_id
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
conservative,5969,5969,5969,5969
liberal,7809,7809,7809,7809
neutral,8843,8843,8843,8843


In [24]:
df.shape

(176151, 5)

In [25]:
df[df.is_root].shape

(4326, 5)

In [26]:
len(df[df.is_root].root_id.unique())

4326

In [27]:
df.loc[df.label.notna()].shape

(22621, 5)

In [28]:
df = df.dropna(subset=['label'])
df.sample(5)

Unnamed: 0,text,label,position,is_root,root_id
110173,faced by future taxpayers,neutral,VP,False,2666
41772,a far smaller effect on interstate commerce,neutral,NP,False,1002
74886,Fung and his colleagues have catalogued fiftee...,liberal,ROOT+S,True,1798
11502,were to be `` a global agreement on a new ( st...,liberal,VP,False,278
127629,"In the case of the retirement age , we should ...",conservative,ROOT+S,True,3096


In [12]:
savepath = r'../data'
# with open(os.path.join(savepath, 'filteredIBC.pickle'), 'wb') as f:
#     pickle.dump(df.loc[:,['label','text']], f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'filteredIBC_rootOnly.pickle'), 'wb') as f:
    pickle.dump(df[df.is_root].loc[:,['label','text']], f, pickle.HIGHEST_PROTOCOL)

In [13]:
# with open(os.path.join(savepath, 'filteredIBC.pickle'), 'rb') as f:
#     df2 = pickle.load(f)
with open(os.path.join(savepath, 'filteredIBC_rootOnly.pickle'), 'rb') as f:
    test = pickle.load(f)

In [14]:
test.shape

(4326, 2)

# tokenizing (just messing around, not needed)

In [99]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [100]:
df = libDf.append(conDf).append(neutDf)
df.shape, libDf.shape

((176151, 5), (84345, 5))

In [67]:
filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n'
t = Tokenizer(filters=filters)

In [68]:
t.fit_on_texts(set(libWords))

In [69]:
seq = t.texts_to_sequences(libWords)
len(seq)

992619

In [51]:
list(libWords)[:5]

['angered', 'mere', 'credit-constrained', 'Gregory', 'dollars']

In [52]:
seq[:5]

[[871], [872], [873], [874], [875]]

In [53]:
longWords = [w for w,s in zip(libWords, seq) if len(s) > 1]
longWords

['8.46',
 '7.9',
 '352,000',
 'AT&T',
 '2003.4',
 'U.S.-India',
 'oligarchy\\/plutocracy',
 '1.8',
 '30,000',
 '12\\/23\\/09',
 '25,000',
 '33.8',
 '2,000',
 '1\\/3\\/11',
 '1,200',
 '9\\/25\\/08',
 '3.3',
 'creatures\\/who',
 'IMF\\/World',
 'created\\/humans',
 'U.S.-led',
 'them.As',
 'U.S.',
 '1.5',
 'F\\/A',
 '250,000',
 'U.S.A.',
 'O.sub',
 '1941.3',
 '57,000',
 '9\\/12',
 '0.6',
 '100,000',
 '20,000',
 'OpenSecrets.org',
 '81.5',
 '3.2',
 'ethics\\/social',
 '381,000',
 '3.6',
 'D.C.',
 '9\\/11',
 '36,500',
 'November\\/December',
 '5.7',
 '630,000',
 'rich\\/poor',
 '150,000',
 'food\\/energy',
 '6.5',
 '5,600',
 '0.1',
 'and\\/or',
 '12.5',
 'U.S.-based',
 '40,000',
 'U.S.-bound',
 '109,858',
 'CFR\\/SPLC',
 '4,000',
 '45.3',
 'News\\/Washington',
 '0.01',
 'R&D',
 '1.9',
 '1.28',
 'work\\/family',
 '30.22',
 '200,000']

In [54]:
len(longWords)

69

In [None]:
def plotWordDist(df):
    lab = df[df.label != None]
    

In [75]:
labs = pd.get_dummies(libDf.loc[libDf.label != None].label)

In [76]:
labs

Unnamed: 0,Conservative,Liberal,Neutral
0,0,1,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
5,0,0,0
6,0,0,0
7,0,0,0
8,0,0,0
9,0,0,0
