In [128]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LinearRegression
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn_pandas.pipeline import Pipeline
# from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, label_binarize, LabelBinarizer
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, auc, precision_score,f1_score,recall_score
import string
import re
import matplotlib.pyplot as plt

In [95]:
raw_df = pd.read_csv('data/Political-media-DFE.csv',encoding='latin')
raw_df.head().T

Unnamed: 0,0,1,2,3,4
_unit_id,766192484,766192485,766192486,766192487,766192488
_golden,False,False,False,False,False
_unit_state,finalized,finalized,finalized,finalized,finalized
_trusted_judgments,1,1,1,1,1
_last_judgment_at,8/4/15 21:17,8/4/15 21:20,8/4/15 21:14,8/4/15 21:08,8/4/15 21:26
audience,national,national,national,national,national
audience:confidence,1,1,1,1,1
bias,partisan,partisan,neutral,neutral,partisan
bias:confidence,1,1,1,1,1
message,policy,attack,support,policy,policy


In [96]:
# Data Cleaning & Initial EDA
df = raw_df[['bias','message','embed','label','source','text']]
df['bias'].value_counts()

neutral     3689
partisan    1311
Name: bias, dtype: int64

In [97]:
#congressmen_2015 = pd.DataFrame(df['label'].unique())

#type(congressmen_2015)

#congressmen_2015.to_csv('congressmen_2015.csv')

def remove_punctuations(text):
    '''Removes punctuation from strings'''
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [98]:
df['text'] = df.loc[:,'text'].apply(remove_punctuations)
df['label'] = df['label'].str.replace('From: ','')
df['purpose_and_bias'] = df['message'] + '_' + df['bias']
df['text'] = df['text'].str.lower()
congressmen_df = pd.read_csv('congressmen_2015.csv')
congressmen_df.head()
df = df.merge(congressmen_df, how='left',left_on='label',right_on='congressman')
df.loc[df.bias == 'partisan', 'target'] = df['affiliation']
df.loc[df.bias == 'neutral', 'target'] = df['bias']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [99]:
df.dropna(axis=0,inplace=True)
df = df[df['target'] != 'i']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4912 entries, 0 to 4999
Data columns (total 12 columns):
bias                4912 non-null object
message             4912 non-null object
embed               4912 non-null object
label               4912 non-null object
source              4912 non-null object
text                4912 non-null object
purpose_and_bias    4912 non-null object
First               4912 non-null object
Last                4912 non-null object
congressman         4912 non-null object
affiliation         4912 non-null object
target              4912 non-null object
dtypes: object(12)
memory usage: 498.9+ KB


In [100]:
df['target'].value_counts()

neutral    3631
r           791
d           490
Name: target, dtype: int64

In [101]:
def replace_contraction(text):
    '''Replaces contractions in words'''
    contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'can not'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    return text

def replace_links(text, filler=' '):
    '''Replaces Links with spaces'''
    text = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
                  filler, text).strip()
    return text

def remove_numbers(text):
    '''Removes digits from text'''
    text = ''.join([i for i in text if not i.isdigit()])
    return text

def cleanText(text):
    '''Executes all cleaning steps - line breaks, lowercase, punctuation, etc.'''
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = replace_contraction(text)
    text = replace_links(text, "link")
    text = remove_numbers(text)
    text = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]',"",text)
    text = text.lower()
    return text

In [109]:
X = df['text'].apply(cleanText)
y = df['target']
#y = label_binarize(y, classes=['neutral','r','d'])
y[0:5]

0          r
1          r
2    neutral
3    neutral
4          d
Name: target, dtype: object

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0,test_size=0.20)
X_train.head()

4126    great news for seniors today ûò thanks to slo...
1016    mt goulaglenn looking forward to hearing congp...
4536    renewing femas safer program will help local s...
1390    today the house voted to ensure that our heroe...
2562    the beautiful grimes county courthouse in ande...
Name: text, dtype: object

In [111]:
y_train[0:5]

4126    neutral
1016    neutral
4536    neutral
1390    neutral
2562    neutral
Name: target, dtype: object

In [112]:
tfidf = TfidfVectorizer()

In [113]:
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [114]:
pipe = Pipeline(steps=[
    ('tfidf', tfidf),
    ('logreg', logreg)
])

In [115]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [116]:
pipe.classes_

array(['d', 'neutral', 'r'], dtype=object)

In [121]:
preds_prob = pipe.predict_proba(X_test)
preds_prob[0:5]

array([[0.04237326, 0.87558554, 0.0820412 ],
       [0.17292255, 0.43084048, 0.39623697],
       [0.03111769, 0.13844658, 0.83043574],
       [0.11400523, 0.75825668, 0.12773809],
       [0.15749353, 0.78894841, 0.05355806]])

In [120]:
preds_label = pipe.predict(X_test)
preds_label[0:5]

array(['neutral', 'neutral', 'r', 'neutral', 'neutral'], dtype=object)

In [122]:
pipe.score(X=X_test,y=y_test)

0.7558494404883012

In [124]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    '''Compute multiclass roc_auc_score'''
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

In [132]:
multiclass_roc_auc_score(y_test,preds_label,average='micro')

0.8168870803662258

In [138]:
print(f"Accuracy: {accuracy_score(y_test,preds_label)}")
print(f"Recall: {recall_score(y_test,preds_label,average='micro')}")
print(f"Precision: {precision_score(y_test,preds_label,average='micro')}")
print(f"F1 Score: {f1_score(y_test,preds_label,average='micro')}")

Accuracy: 0.7558494404883012
Recall: 0.7558494404883012
Precision: 0.7558494404883012
F1 Score: 0.7558494404883012


- Micro- and macro-averages (for whatever metric) will compute slightly different things, and thus their interpretation differs. A macro-average will compute the metric independently for each class and then take the average (hence treating all classes equally), whereas a micro-average will aggregate the contributions of all classes to compute the average metric (better with class imbalance, which we have here).