# Imports & Data Loading

In [None]:
#!pip install tensorflow-hub

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

import tensorflow.keras as keras 
from keras.layers import Input, Lambda, Dense
from keras.models import Model
import keras.backend as K

Using TensorFlow backend.


In [2]:
import spacy
from tqdm import tqdm
import re
import time
import pickle

In [3]:
raw_df = pd.read_csv('data/Political-media-DFE.csv',encoding='latin')
raw_df.head().T

Unnamed: 0,0,1,2,3,4
_unit_id,766192484,766192485,766192486,766192487,766192488
_golden,False,False,False,False,False
_unit_state,finalized,finalized,finalized,finalized,finalized
_trusted_judgments,1,1,1,1,1
_last_judgment_at,8/4/15 21:17,8/4/15 21:20,8/4/15 21:14,8/4/15 21:08,8/4/15 21:26
audience,national,national,national,national,national
audience:confidence,1,1,1,1,1
bias,partisan,partisan,neutral,neutral,partisan
bias:confidence,1,1,1,1,1
message,policy,attack,support,policy,policy


# Data Cleaning & Initial EDA

In [4]:
df = raw_df[['bias','message','embed','label','source','text']]
df.head().T

Unnamed: 0,0,1,2,3,4
bias,partisan,partisan,neutral,neutral,partisan
message,policy,attack,support,policy,policy
embed,"<blockquote class=""twitter-tweet"" width=""450"">...","<blockquote class=""twitter-tweet"" width=""450"">...","<blockquote class=""twitter-tweet"" width=""450"">...","<blockquote class=""twitter-tweet"" width=""450"">...","<blockquote class=""twitter-tweet"" width=""450"">..."
label,From: Trey Radel (Representative from Florida),From: Mitch McConnell (Senator from Kentucky),From: Kurt Schrader (Representative from Oregon),From: Michael Crapo (Senator from Idaho),From: Mark Udall (Senator from Colorado)
source,twitter,twitter,twitter,twitter,twitter
text,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...,VIDEO - #Obamacare: Full of Higher Costs and ...,Please join me today in remembering our fallen...,RT @SenatorLeahy: 1st step toward Senate debat...,.@amazon delivery #drones show need to update ...


In [5]:
df['bias'].value_counts()

neutral     3689
partisan    1311
Name: bias, dtype: int64

In [6]:
df['message'].value_counts()

policy          1411
personal        1170
support          921
information      647
media            277
attack           172
constituency     152
mobilization     129
other            121
Name: message, dtype: int64

In [7]:
#congressmen_2015 = pd.DataFrame(df['label'].unique())

In [8]:
#type(congressmen_2015)

In [9]:
#congressmen_2015.to_csv('congressmen_2015.csv')

In [10]:
def remove_punctuations(text):
    '''Removes punctuation from strings'''
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [11]:
df['text'] = df.loc[:,'text'].apply(remove_punctuations)
df['label'] = df['label'].str.replace('From: ','')
df['purpose_and_bias'] = df['message'] + '_' + df['bias']
df['text'] = df['text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [12]:
df.head()

Unnamed: 0,bias,message,embed,label,source,text,purpose_and_bias
0,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Trey Radel (Representative from Florida),twitter,rt nowthisnews rep trey radel r fl slams obama...,policy_partisan
1,partisan,attack,"<blockquote class=""twitter-tweet"" width=""450"">...",Mitch McConnell (Senator from Kentucky),twitter,video obamacare full of higher costs and bro...,attack_partisan
2,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Kurt Schrader (Representative from Oregon),twitter,please join me today in remembering our fallen...,support_neutral
3,neutral,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Michael Crapo (Senator from Idaho),twitter,rt senatorleahy 1st step toward senate debate ...,policy_neutral
4,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Mark Udall (Senator from Colorado),twitter,amazon delivery drones show need to update law...,policy_partisan


In [13]:
df['purpose_and_bias'].value_counts()

personal_neutral         1073
support_neutral           751
policy_partisan           706
policy_neutral            705
information_neutral       565
media_neutral             247
support_partisan          170
attack_partisan           160
constituency_neutral      129
other_neutral             117
personal_partisan          97
mobilization_neutral       90
information_partisan       82
mobilization_partisan      39
media_partisan             30
constituency_partisan      23
attack_neutral             12
other_partisan              4
Name: purpose_and_bias, dtype: int64

In [14]:
congressmen_df = pd.read_csv('congressmen_2015.csv')
congressmen_df.head()

Unnamed: 0,First,Last,congressman,affiliation
0,Gregorio,Sablan,Gregorio Sablan (Representative from NA),d
1,Robert,Aderholt,Robert Aderholt (Representative from Alabama),r
2,Lamar,Alexander,Lamar Alexander (Senator from Tennessee),r
3,Justin,Amash,Justin Amash (Representative from Michigan),r
4,Mark,Amodei,Mark Amodei (Representative from Nevada),r


In [15]:
df = df.merge(congressmen_df, how='left',left_on='label',right_on='congressman')

In [16]:
df.loc[df.bias == 'partisan', 'target'] = df['affiliation']
df.loc[df.bias == 'neutral', 'target'] = df['bias']

In [17]:
df.head()

Unnamed: 0,bias,message,embed,label,source,text,purpose_and_bias,First,Last,congressman,affiliation,target
0,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Trey Radel (Representative from Florida),twitter,rt nowthisnews rep trey radel r fl slams obama...,policy_partisan,Trey,Radel,Trey Radel (Representative from Florida),r,r
1,partisan,attack,"<blockquote class=""twitter-tweet"" width=""450"">...",Mitch McConnell (Senator from Kentucky),twitter,video obamacare full of higher costs and bro...,attack_partisan,Mitch,McConnell,Mitch McConnell (Senator from Kentucky),r,r
2,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Kurt Schrader (Representative from Oregon),twitter,please join me today in remembering our fallen...,support_neutral,Kurt,Schrader,Kurt Schrader (Representative from Oregon),d,neutral
3,neutral,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Michael Crapo (Senator from Idaho),twitter,rt senatorleahy 1st step toward senate debate ...,policy_neutral,Michael,Crapo,Michael Crapo (Senator from Idaho),r,neutral
4,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Mark Udall (Senator from Colorado),twitter,amazon delivery drones show need to update law...,policy_partisan,Mark,Udall,Mark Udall (Senator from Colorado),d,d


In [18]:
df['target'].value_counts()

neutral    3689
r           791
d           490
i            17
Name: target, dtype: int64

In [19]:
len(df[df['target'].isnull()])

13

In [20]:
df.dropna(axis=0,inplace=True)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4929 entries, 0 to 4999
Data columns (total 12 columns):
bias                4929 non-null object
message             4929 non-null object
embed               4929 non-null object
label               4929 non-null object
source              4929 non-null object
text                4929 non-null object
purpose_and_bias    4929 non-null object
First               4929 non-null object
Last                4929 non-null object
congressman         4929 non-null object
affiliation         4929 non-null object
target              4929 non-null object
dtypes: object(12)
memory usage: 500.6+ KB


## Inputs (X): Tweets
## Target (Y): Bias

In [22]:
import re
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = replace_contraction(text)
    text = replace_links(text, "link")
    text = remove_numbers(text)
    text = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]',"",text)
    text = text.lower()
    return text

In [23]:
X = np.array(df['text'])
y = np.array(df['target'])

In [24]:
df_copy = df.copy()

In [25]:
train_set = df_copy.sample(frac=0.75, random_state=0)
test_set = df_copy.drop(train_set.index)

In [26]:
train_set.shape, test_set.shape

((3697, 12), (1232, 12))

In [27]:
train_set['clean_text'] = train_set['text'].apply(lambda x: re.sub(r'http\S+', '', x))

test_set['clean_text'] = test_set['text'].apply(lambda x: re.sub(r'http\S+', '', x))


In [28]:
#!python -m spacy download en

In [29]:
# import spaCy's language model
nlp = spacy.load('en', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    output = []
    for i in texts:
        s = [token.lemma_ for token in nlp(i)]
        output.append(' '.join(s))
    return output

In [30]:
train_set['clean_text'] = lemmatization(train_set['clean_text'])
test_set['clean_text'] = lemmatization(test_set['clean_text'])




In [31]:
train_set.sample(10)

Unnamed: 0,bias,message,embed,label,source,text,purpose_and_bias,First,Last,congressman,affiliation,target,clean_text
4833,partisan,personal,"<div id=""fb-root""></div> <script>(function(d, ...",Marc Veasey (Representative from Texas),facebook,tonight iûªll be in miami to speak about comp...,personal_partisan,Marc,Veasey,Marc Veasey (Representative from Texas),d,d,tonight iûªll be in miami to speak about comp...
3042,neutral,policy,"<div id=""fb-root""></div> <script>(function(d, ...",Susan Davis (Representative from California),facebook,our students deserve a chance to achieve the a...,policy_neutral,Susan,Davis,Susan Davis (Representative from California),d,neutral,-PRON- student deserve a chance to achieve the...
4337,neutral,personal,"<div id=""fb-root""></div> <script>(function(d, ...",Ileana Ros-Lehtinen (Representative from Florida),facebook,high schooler marcel bozasûªs ûïiconic board...,personal_neutral,Ileana,Ros-Lehtinen,Ileana Ros-Lehtinen (Representative from Florida),r,neutral,high schooler marcel bozasûªs ûïiconic board...
2158,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Brad Schneider (Representative from Illinois),twitter,hope today illinois joins the growing list of ...,support_neutral,Brad,Schneider,Brad Schneider (Representative from Illinois),d,neutral,hope today illinois join the grow list of stat...
838,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Ileana Ros-Lehtinen (Representative from Florida),twitter,congrats to new miamiherald publisher alex vil...,support_neutral,Ileana,Ros-Lehtinen,Ileana Ros-Lehtinen (Representative from Florida),r,neutral,congrat to new miamiherald publisher alex vill...
4974,neutral,media,"<div id=""fb-root""></div> <script>(function(d, ...",John Yarmuth (Representative from Kentucky),facebook,ill be on foxbusiness shortly after noon to di...,media_neutral,John,Yarmuth,John Yarmuth (Representative from Kentucky),d,neutral,ill be on foxbusiness shortly after noon to di...
1049,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Jerrold Nadler (Representative from New York),twitter,proud to standwithpeggy pregnant workers shoul...,support_neutral,Jerrold,Nadler,Jerrold Nadler (Representative from New York),d,neutral,proud to standwithpeggy pregnant worker should...
2283,neutral,support,"<blockquote class=""twitter-tweet"" width=""450"">...",Lucille Roybal-Allard (Representative from Cal...,twitter,join me and other healthequity supporters for ...,support_neutral,Lucille,Roybal-Allard,Lucille Roybal-Allard (Representative from Cal...,d,neutral,join -PRON- and other healthequity supporter f...
4813,neutral,policy,"<div id=""fb-root""></div> <script>(function(d, ...",Tom Udall (Senator from New Mexico),facebook,join me today in supporting safer internet day...,policy_neutral,Tom,Udall,Tom Udall (Senator from New Mexico),d,neutral,join -PRON- today in support safe internet day...
986,partisan,policy,"<blockquote class=""twitter-tweet"" width=""450"">...",Bill Cassidy (Representative from Louisiana),twitter,ocares medical device tax will cost our nation...,policy_partisan,Bill,Cassidy,Bill Cassidy (Representative from Louisiana),r,r,ocare medical device tax will cost -PRON- nati...


In [32]:
# Download the module, and uncompress it to the destination folder.
#!curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC module/module_elmo2

In [33]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

W0620 09:24:09.455492 140735514080128 deprecation.py:323] From /anaconda3/envs/learn-env/lib/python3.6/site-packages/tensorflow/python/compat/v2_compat.py:65: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term


In [34]:
elmo = hub.Module("module/module_elmo2/", trainable=False)

In [35]:
def elmo_vectors(x):
    embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
    # return average of ELMo features
    return sess.run(tf.reduce_mean(embeddings,1))

In [36]:
list_train = [train_set[i:i+100] for i in range(0,train_set.shape[0],100)]
list_test = [test_set[i:i+100] for i in range(0,test_set.shape[0],100)]



In [None]:
#elmo_train = [elmo_vectors(x['clean_text']) for x in list_train]
#elmo_test = [elmo_vectors(x['clean_text']) for x in list_test]

In [None]:
from keras.engine.input_layer import InputLayer
from keras.models import Sequential

model = Sequential()
model.add(InputLayer(input_shape=(1,), dtype="string"))
model.add(Lambda(ELMoEmbedding, output_shape=(1024, ))) #
model.add(Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)))
model.add(Dense(4, activation='softmax'))

Output: TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

The output is a 3 dimensional tensor of shape (1, 8, 1024):

The first dimension of this tensor represents the number of training samples. This is 1 in our case
The second dimension represents the maximum length of the longest string in the input list of strings. Since we have only 1 string in our input list, the size of the 2nd dimension is equal to the length of the string – 8
The third dimension is equal to the length of the ELMo vector

# ELMO Embedding in a Simple Neural Network Classifier

In [None]:
import re
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = replace_contraction(text)
    text = replace_links(text, "link")
    text = remove_numbers(text)
    text = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]',"",text)
    text = text.lower()
    return text


In [None]:
embed = hub.Module("module/module_elmo2")
def ELMoEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]





In [None]:
def build_model(): 
    input_text = Input(shape=(1,), dtype="string")
    embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
    dense = Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001))(embedding)
    pred = Dense(1, activation='sigmoid')(dense)
    model = Model(inputs=[input_text], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [None]:
model_elmo = build_model()
model_elmo.summary()

In [None]:
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    history = model_elmo.fit(X, y, epochs=5, batch_size=256, validation_split = 0.2)
    model_elmo.save_weights('./model_elmo_weights.h5')