In [1]:
import treeUtil
import pickle
import pandas as pd

# Load raw data
The data was provided to us in a pre-processed state, stored as custom data objects defined by a custom package called treeUtil. This notebook first works out how to read information from the data in that custom format, and then extracts the data into a DataFrame so it is easy to use with our model later on.

In [2]:
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
# The file holds three collections of trees, unpacked in this order: lib, con, neutral.
with open(r'../data/ibcData.pkl', 'rb') as f:
    lib, con, neutral = pickle.load(f)

# Print example sentences from custom objects

In [3]:
# words covered by the first tree in the liberal collection (the whole sentence)
print(lib[0].get_words())

Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .


In [4]:
# words covered by the first tree in the conservative collection (the whole sentence)
print(con[0].get_words())

Gore is getting rich from environmentalism , not just by being paid a whopping $ 175,000 per speech but by using political pressure to force government policy in a direction that benefits his business interests .


In [5]:
# words covered by the first tree in the neutral collection (the whole sentence)
print(neutral[0].get_words())

In this country , the beneficiaries of Apple 's success are , first , the designers , who have done wonders working with Steve Jobs to produce products that are beautiful and effective .


In [6]:
# confirm that the loaded trees are treeUtil.nodeObj instances
# (isinstance is the idiomatic type check and also accepts subclasses)
isinstance(lib[0], treeUtil.nodeObj)

True

# Figure out how to work with custom data types

In [7]:
# exploring the custom data type: walk every node of the first liberal tree
# and print "pos : label : words" for each one
tree = lib[0]
for node in tree:
    # iteration may yield items that are not nodeObj; ignore those
    if not isinstance(node, treeUtil.nodeObj):
        continue
    # nodes without a `label` attribute are shown with a placeholder
    print(node.pos, ':', getattr(node, 'label', 'No Label'), ':', node.get_words())

ROOT+S : Liberal : Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
S+VP : No Label : Forcing middle-class workers to bear a greater share of the cost of government
VP|<NP-S> : No Label : middle-class workers to bear a greater share of the cost of government
NP : No Label : middle-class workers
S+VP : No Label : to bear a greater share of the cost of government
VP : No Label : bear a greater share of the cost of government
NP : No Label : a greater share of the cost of government
NP : No Label : a greater share
NP|<JJR-NN> : No Label : greater share
PP : No Label : of the cost of government
NP : No Label : the cost of government
NP : No Label : the cost
PP : No Label : of government
S|<VP-.> : No Label : weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
VP : Liberal : w

In [8]:
# print "label : words" for every node (phrase) in the sentence tree structure;
# nodes that have no `label` attribute are shown as None
t = lib[0]
for node in t:
    if isinstance(node, treeUtil.nodeObj):
        print(getattr(node, 'label', None), ':', node.get_words())

Liberal : Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
None : Forcing middle-class workers to bear a greater share of the cost of government
None : middle-class workers to bear a greater share of the cost of government
None : middle-class workers
None : to bear a greater share of the cost of government
None : bear a greater share of the cost of government
None : a greater share of the cost of government
None : a greater share
None : greater share
None : of the cost of government
None : the cost of government
None : the cost
None : of government
None : weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
Liberal : weakens their support for needed investments and stirs resentment toward those who depend on public services the most
None : weakens their support for needed 

# Pull data into individual DataFrames

In [9]:
# define function to read data from tree objects into dataframes
def readNodes(tree, root_id):
    """Flatten one sentence tree into a list of row records.

    Parameters
    ----------
    tree : iterable
        A sentence tree; iterating it yields its nodes. Items that are not
        ``treeUtil.nodeObj`` instances are skipped.
    root_id : int
        Identifier of the sentence this tree represents; copied unchanged
        onto every row so sub-phrases can be traced back to their sentence.

    Returns
    -------
    list of list
        One ``[text, label, pos, is_root, root_id]`` record per node, in
        iteration order. ``label`` is lower-cased, or ``None`` when the node
        has no label. ``is_root`` is a ``bool``.
    """
    rows = []
    for node in tree:
        if not isinstance(node, treeUtil.nodeObj):
            continue

        # Not every node carries a label. A missing attribute and an explicit
        # None are both treated as "unlabeled" instead of crashing on .lower().
        label = getattr(node, 'label', None)
        if label is not None:
            label = label.lower()

        # The sentence-level node is recognised by 'root' in its pos tag
        # (e.g. 'ROOT+S'); the membership test already yields a bool.
        is_root = 'root' in node.pos.lower()

        rows.append([node.get_words(), label, node.pos, is_root, root_id])

    return rows

In [10]:
# function to pull data from each tree in the dataset and enter it into a dataframe
def trees_to_df(trees):
    """Build a single DataFrame from a collection of sentence trees.

    Each tree is numbered by its position in ``trees`` (starting at 0). That
    number is handed to ``readNodes`` and ends up in the ``root_id`` column,
    so all rows that came from the same tree share one id.

    Returns a DataFrame with columns
    ``text``, ``label``, ``pos``, ``is_root``, ``root_id``.
    """
    rows = [
        row
        for sentence_id, tree in enumerate(trees)
        for row in readNodes(tree, sentence_id)
    ]
    return pd.DataFrame(rows, columns=['text', 'label', 'pos', 'is_root', 'root_id'])

# Generate individual DFs for each ideology label

In [11]:
# Trees from the liberal collection, flattened to one row per node.
# `label` is each node's own (lower-cased) label, or None when the node is unlabeled;
# `root_id` is the position of the node's tree within `lib`.
libDf = trees_to_df(lib)
libDf.head()

Unnamed: 0,text,label,pos,is_root,root_id
0,Forcing middle-class workers to bear a greater...,liberal,ROOT+S,True,0
1,Forcing middle-class workers to bear a greater...,,S+VP,False,0
2,middle-class workers to bear a greater share o...,,VP|<NP-S>,False,0
3,middle-class workers,,NP,False,0
4,to bear a greater share of the cost of government,,S+VP,False,0


In [12]:
# total number of subphrases in data, total number of labeled phrases
# (drop on `label` only, so the second shape counts exactly the labeled rows)
libDf.shape, libDf.dropna(subset=['label']).shape

((84345, 5), (10920, 5))

In [13]:
# Trees from the conservative collection, flattened to one row per node.
# `label` is each node's own (lower-cased) label, or None when the node is unlabeled;
# `root_id` is the position of the node's tree within `con`.
conDf = trees_to_df(con)
conDf.head()

Unnamed: 0,text,label,pos,is_root,root_id
0,"Gore is getting rich from environmentalism , n...",conservative,ROOT+S,True,0
1,"is getting rich from environmentalism , not ju...",,S|<VP-.>,False,0
2,"is getting rich from environmentalism , not ju...",conservative,VP,False,0
3,"getting rich from environmentalism , not just ...",conservative,VP,False,0
4,"getting rich from environmentalism , not just ...",conservative,VP,False,0


In [14]:
# total number of subphrases in data, total number of labeled phrases
# (drop on `label` only, so the second shape counts exactly the labeled rows)
conDf.shape, conDf.dropna(subset=['label']).shape

((69252, 5), (8192, 5))

In [15]:
# Trees from the neutral collection, flattened to one row per node.
# `label` is each node's own (lower-cased) label, or None when the node is unlabeled;
# `root_id` is the position of the node's tree within `neutral`.
neutDf = trees_to_df(neutral)
neutDf.head()

Unnamed: 0,text,label,pos,is_root,root_id
0,"In this country , the beneficiaries of Apple '...",neutral,ROOT+S,True,0
1,In this country,,PP,False,0
2,this country,,NP,False,0
3,", the beneficiaries of Apple 's success are , ...",,"S|<,-NP-VP-.>",False,0
4,"the beneficiaries of Apple 's success are , fi...",,S|<NP-VP-.>,False,0


In [16]:
# total number of subphrases in data, total number of labeled phrases
# (drop on `label` only, so the second shape counts exactly the labeled rows)
neutDf.shape, neutDf.dropna(subset=['label']).shape

((22554, 5), (3509, 5))

# Single df with all data

In [17]:
# function to pull in all data into single dataframe
def singleDf(lib, con, neutral):
    """Combine the three tree collections into one DataFrame.

    Trees are processed in the order ``lib``, ``con``, ``neutral`` and are
    numbered consecutively across all three collections, so ``root_id`` is
    unique per sentence in the combined frame.

    Returns a DataFrame with columns
    ``text``, ``label``, ``pos``, ``is_root``, ``root_id``.
    """
    # one flat sequence of trees keeps the numbering continuous across collections
    all_trees = [tree for group in (lib, con, neutral) for tree in group]

    rows = []
    for sentence_id, tree in enumerate(all_trees):
        rows.extend(readNodes(tree, sentence_id))

    return pd.DataFrame(rows, columns=['text', 'label', 'pos', 'is_root', 'root_id'])

In [18]:
# total number of subphrases in dataset, total number of labeled rows in dataset,
# total number of root sentences in dataset
df = singleDf(lib, con, neutral)
# `is_root` is already boolean, so it can be used directly as a mask;
# dropping on `label` only counts exactly the labeled rows
df.shape, df.dropna(subset=['label']).shape, df.loc[df.is_root].shape

((176151, 5), (22621, 5), (4326, 5))

In [19]:
# examples of root sentences; random_state pins the draw so a fresh
# Restart & Run All shows the same 20 sentences every time
for label, text in df.loc[df.is_root, ['label','text']].sample(20, random_state=42).values:
    print(label.upper(),'-', text,'\n')

LIBERAL - The gay-marriage effort has been a cause as well as an effect in this change : while same-sex marriage is disturbing to many Americans , it is reassuring to others , suggesting as it does loyalty to a middle-class ideal . 

LIBERAL - Interestingly , on the basis of this argument , the advocates should support our first strategy for economic recovery : higher public investment . 

CONSERVATIVE - Egypt is a brutal dictatorship , strongly supported by President Obama who has said straight out that he 's not going to criticize them because Egypt helps us maintain stability in the Middle East . 

LIBERAL - But leading students of central banking today , such as Charles Goodhart , argue strongly that , with the collapse of effective regulation over the past two decades , thin equity layers at many leading banks ( in combination with limited liability of shareholders ) are completely inappropriate for maintaining a stable financial system . 

LIBERAL - Nor does giving him money or g

In [20]:
# Check counts of each label for balance. count() tallies non-null values per column,
# which is why every column shows the same number; rows whose label is None form no group.
# In the saved output these totals differ from the per-collection labeled counts above
# (e.g. 7809 'liberal' rows vs 10920 labeled rows from the liberal trees), so a sub-phrase
# can carry a different label than the sentence it came from.
df.groupby('label').count()

Unnamed: 0_level_0,text,pos,is_root,root_id
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
conservative,5969,5969,5969,5969
liberal,7809,7809,7809,7809
neutral,8843,8843,8843,8843


In [21]:
# Remove rows that do not have a label (unlabeled sub-phrases have label None).
# NOTE: this rebinds `df`; cells above that read `df` will show different results
# if they are re-run after this cell.
df = df.dropna(subset=['label'])

In [12]:
# save the full dataset (sentences and sub-phrases), keeping only label and text
# NOTE(review): this writes whatever `df` currently holds -- run the cell that drops
# unlabeled rows first so they are excluded from the file
with open('../data/filteredIBC.pickle', 'wb') as f:
    pickle.dump(df[['label','text']], f, pickle.HIGHEST_PROTOCOL)

In [12]:
# save only the root (whole-sentence) rows, keeping only label and text
with open('../data/filteredIBC_rootOnly.pickle', 'wb') as f:
    # select rows and columns in a single .loc call
    pickle.dump(df.loc[df.is_root, ['label','text']], f, pickle.HIGHEST_PROTOCOL)