In [1]:
# import libraries
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [2]:
# read csv into dataframe
articles = pandas.read_csv('articles_copy.csv',encoding = 'cp1252')
articles.head(3)

Unnamed: 0,political leaning,article
0,liberal,Carrying a lethal weapon onto church property ...
1,liberal,There is a tiny bit of good news at the tail e...
2,liberal,McConnell Says Republicans Have Votes to Set T...


In [3]:
# download nltk stuff
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\XJY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\XJY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\XJY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Functions to Clean Text

In [4]:
# methods to clean up articles
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import re

stop_words_ = set(stopwords.words('english'))
wn = WordNetLemmatizer()
def accept_words(token):
    return token not in stop_words_ and token not in list(string.punctuation)
def clean_txt(text):
    clean_text = []
    text = re.sub("'", "",text)
    text=re.sub("[\d\W]+"," ",text)  
    clean_text = [ wn.lemmatize(word) for word in word_tokenize(text.lower()) if accept_words(word)]
    return " ".join(clean_text)

In [5]:
# example of how articles get cleaned
articles['article'][0]

'Carrying a lethal weapon onto church property was considered "inappropriate" under the policies of the Church of Jesus Christ of Latter-day Saints. Now, it\'s prohibited.\nThe tweaked language can be seen in the church\'s "Handbook 2," which says: "Churches are dedicated for the worship of God and as havens from the cares and concerns of the world. With the exception of current law enforcement officers, the carrying of lethal weapons on church property, concealed or otherwise, is prohibited."\nThe previous rule said the carrying of lethal weapons was inappropriate.\nLethal weapons include a number of possible items including guns, said Daniel Woodruff,a spokesman for the Church of Jesus Christ of Latter-day Saints.\nThe handbook update took effect in the first week of August, Woodruff said, but the change will be formally communicated to local Church leaders as new meetinghouse safety guidelines in the near future.\nThose leaders will then be responsible for sharing the guidelines wit

In [6]:
clean_txt(articles['article'][0])

'carrying lethal weapon onto church property considered inappropriate policy church jesus christ latter day saint prohibited tweaked language seen church handbook say church dedicated worship god haven care concern world exception current law enforcement officer carrying lethal weapon church property concealed otherwise prohibited previous rule said carrying lethal weapon inappropriate lethal weapon include number possible item including gun said daniel woodruff spokesman church jesus christ latter day saint handbook update took effect first week august woodruff said change formally communicated local church leader new meetinghouse safety guideline near future leader responsible sharing guideline member added change applies entire church due texas law change regarding firearm place worship letter referencing prohibition recently sent church leader texas shared member woodruff said next month new law go effect texas allow licensed handgun owner legally carry weapon place worship one str

## Functions to Generate Additional Features

In [7]:
# additional features (subjectivity and polarity)
# however, have not used polarity since naive bayes can't take negative values
from textblob import TextBlob
from sklearn.feature_extraction import DictVectorizer
import textstat

def hard_words(text):
    total_words = len(text.split())
    return textstat.difficult_words(text)/total_words
def subj_txt(text):
    return TextBlob(text).sentiment[1]
def polarity_txt(text):
    return (TextBlob(text).sentiment[0] + 1)/2
def readability(text):
    return textstat.automated_readability_index(text)
def unique_words(text):
    return len(set(clean_txt(text).split()))/ len(text.split())

In [8]:
articles['subj'] = articles['article'].apply(subj_txt)
articles['pol'] = articles['article'].apply(polarity_txt)
articles['difficult_words'] = articles['article'].apply(hard_words)
articles['readability'] = articles['article'].apply(readability)
articles['unique_words'] = articles['article'].apply(unique_words)

## Custom Pipeline For Feature Union

In [9]:
# Custom class for feature union
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
class item_select(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class text_data(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return [{'sub' : row['subj'],'difficult_words' : row['difficult_words'],
                 'pol' : row['pol'],'readability' : row['readability'],
                 'unique_words' : row['unique_words']} \
                for _, row in data.iterrows()]


In [10]:
# custom pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

#             Pipeline for pulling features from the text
            ('article', Pipeline([
                ('selector', item_select(key='article')),
                ('tfidf', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                          ngram_range=(1,3), max_features=100,
                                         preprocessor=clean_txt)),
            ])),

#             Pipeline for pulling metadata features
            ('stats', Pipeline([
                ('selector', item_select(key=['subj','difficult_words','pol','readability','unique_words'])),
                ('stats', text_data()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],
    ))
])

## Train Models

In [11]:
# features and labels
X = articles[['article','subj','difficult_words','pol','readability','unique_words']]
y = articles['political leaning']
print(y)

0           liberal
1           liberal
2           liberal
3           liberal
4           liberal
5           liberal
6           liberal
7           liberal
8           liberal
9           liberal
10          liberal
11          liberal
12          liberal
13          liberal
14          liberal
15          liberal
16          liberal
17          liberal
18          liberal
19          liberal
20          liberal
21          liberal
22          liberal
23          liberal
24          liberal
25          liberal
26          liberal
27          liberal
28          liberal
29          liberal
           ...     
103    conservative
104    conservative
105    conservative
106    conservative
107    conservative
108    conservative
109    conservative
110    conservative
111    conservative
112    conservative
113    conservative
114    conservative
115    conservative
116    conservative
117    conservative
118    conservative
119    conservative
120    conservative
121    conservative


In [12]:
# split training and testing samples
seed = 12
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)
print(y)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X,y,test_size = 0.2, random_state = seed)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [13]:
# fit pipeline
pipeline.fit(x_train)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('article',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  item_select(key='article')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
                 

In [14]:
train_vec = pipeline.transform(x_train)
test_vec = pipeline.transform(x_test)

In [15]:
# models to test
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

clf_sv = LinearSVC(C=1, class_weight='balanced', multi_class='ovr', random_state=40, max_iter=10000)
clf_nb = naive_bayes.MultinomialNB()
clf_lr = LogisticRegression()
clf_rf = RandomForestClassifier(n_estimators = 800)

## Test Models for Accuracy

In [16]:
# cross val score of models
from sklearn.model_selection import cross_val_score
clfs = {'SVC' : clf_sv, 'NB' : clf_nb, 'LR' : clf_lr, 'RF' : clf_rf}
cv = 3
for name, clf in clfs.items():
    scores = cross_val_score(clf,test_vec,y_test,cv = cv, scoring = "accuracy")
    print(name)
    print(scores)
    print(("Mean score: {0:.3f} (+/-{1:.3f})").format(
            numpy.mean(scores),numpy.std(scores)))
    print('---------------------------')

SVC
[0.8        0.77777778 0.75      ]
Mean score: 0.776 (+/-0.020)
---------------------------
NB
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)
---------------------------
LR
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)
---------------------------




RF
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)
---------------------------


In [17]:
articles.groupby('political leaning')['unique_words'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
political leaning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
conservative,65.0,0.383572,0.05663,0.229767,0.349231,0.384884,0.409615,0.525292
liberal,68.0,0.379702,0.091994,0.271176,0.336601,0.362013,0.399954,1.0


In [18]:
articles.groupby('political leaning')['difficult_words'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
political leaning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
conservative,65.0,0.19697,0.033931,0.068493,0.183976,0.204118,0.211628,0.280769
liberal,68.0,0.203127,0.101307,0.136855,0.17484,0.192993,0.209151,1.0


In [19]:
articles.groupby('political leaning')['pol'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
political leaning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
conservative,65.0,0.537479,0.026123,0.470833,0.520218,0.54102,0.554459,0.618867
liberal,68.0,0.537054,0.028515,0.470833,0.516725,0.540621,0.554742,0.618867


In [20]:
articles.groupby('political leaning')['readability'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
political leaning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
conservative,65.0,16.158462,4.35233,6.8,13.7,15.7,17.4,28.3
liberal,68.0,16.3,3.722341,11.3,14.1,15.6,17.225,28.3


In [21]:
articles.groupby('political leaning')['subj'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
political leaning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
conservative,65.0,0.411786,0.072663,0.174653,0.36949,0.416766,0.456375,0.583675
liberal,68.0,0.398969,0.083444,0.0,0.35828,0.411783,0.445913,0.531565


## Test Model Without Additional Features

In [22]:
# Testing model with just tfidf
X = articles['article']
y = articles['political leaning']
y = encoder.fit_transform(y)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X,y,test_size = 0.2, random_state = seed)
tfidf_vec = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                          ngram_range=(1,3), max_features=100,
                                         preprocessor=clean_txt)
tfidf_vec.fit(x_train)
train_x = tfidf_vec.transform(x_train)
test_x = tfidf_vec.transform(x_test)

In [23]:
for name, clf in clfs.items():
    scores = cross_val_score(clf,test_x,y_test,cv = cv, scoring = "accuracy")
    print(name)
    print(scores)
    print(("Mean score: {0:.3f} (+/-{1:.3f})").format(
            numpy.mean(scores),numpy.std(scores)))

SVC
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)
NB
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)
LR
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)




RF
[0.7        0.77777778 0.875     ]
Mean score: 0.784 (+/-0.072)


## Test Models on a Character Basis (n-gram)

In [24]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=100)
tfidf_vect_ngram_chars.fit(x_train)
train_x = tfidf_vect_ngram_chars.transform(x_train)
test_x = tfidf_vect_ngram_chars.transform(x_test)

In [25]:
for name, clf in clfs.items():
    scores = cross_val_score(clf,test_x,y_test,cv = cv, scoring = "accuracy")
    print(name)
    print(scores)
    print(("Mean score: {0:.3f} (+/-{1:.3f})").format(
            numpy.mean(scores),numpy.std(scores)))

SVC
[0.6  1.   0.75]
Mean score: 0.783 (+/-0.165)
NB
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)
LR
[0.7        0.66666667 0.75      ]
Mean score: 0.706 (+/-0.034)




RF
[0.7        0.66666667 0.875     ]
Mean score: 0.747 (+/-0.091)


## Test Model With Count Vectorizer

In [26]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100)
count_vect.fit(x_train)
train_x = count_vect.transform(x_train)
test_x = count_vect.transform(x_test)

In [27]:
for name, clf in clfs.items():
    scores = cross_val_score(clf,test_x,y_test,cv = cv, scoring = "accuracy")
    print(name)
    print(scores)
    print(("Mean score: {0:.3f} (+/-{1:.3f})").format(
            numpy.mean(scores),numpy.std(scores)))

SVC
[0.6        0.77777778 0.5       ]
Mean score: 0.626 (+/-0.115)
NB
[0.5        0.88888889 0.875     ]
Mean score: 0.755 (+/-0.180)
LR
[0.7        0.77777778 0.5       ]
Mean score: 0.659 (+/-0.117)




RF
[0.7        0.77777778 0.75      ]
Mean score: 0.743 (+/-0.032)


## Function to Predict New Text Based on Just TF-IDF

In [28]:
tfidf_vec = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                          ngram_range=(1,3), max_features=100,
                                         preprocessor=clean_txt)
encoder = preprocessing.LabelEncoder()
X = articles['article']
y = articles['political leaning']
tfidf_vec.fit(X)
X = tfidf_vec.transform(X)

y = encoder.fit_transform(y)
clf_nb = naive_bayes.MultinomialNB()
clf_nb.fit(X,y)
def predictor(text):
    x = tfidf_vec.transform(text)
    result = clf_nb.predict(x)
    return encoder.inverse_transform([result])

In [29]:
text = ['I am a liberal.']
print(predictor(text))

['liberal']


  y = column_or_1d(y, warn=True)
