In [1]:
# import libraries
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [2]:
# Load the labelled articles into a DataFrame; the file is decoded as cp1252.
# NOTE(review): the "Unnamed: 0" column in the preview looks like an index that
# was saved into the CSV — confirm, and consider index_col=0 if so.
articles = pandas.read_csv('articles_copy.csv',encoding = 'cp1252')
# Preview the first three rows.
articles.head(3)

Unnamed: 0,political leaning,article
0,liberal,Carrying a lethal weapon onto church property ...
1,liberal,There is a tiny bit of good news at the tail e...
2,liberal,McConnell Says Republicans Have Votes to Set T...


In [3]:
# Fetch the NLTK resources used further down: the stop-word list, the 'punkt'
# tokenizer data (presumably what word_tokenize relies on) and WordNet
# (presumably what WordNetLemmatizer relies on).
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# methods to clean up articles
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import re

# English stop words, stored as a set for membership tests in accept_words().
stop_words_ = set(stopwords.words('english'))
# Single lemmatizer instance reused by clean_txt() for every token.
wn = WordNetLemmatizer()
# Built once; the previous version rebuilt list(string.punctuation) for every token.
_PUNCTUATION = frozenset(string.punctuation)


def accept_words(token):
    """Return True when `token` should be kept during cleaning.

    A token is rejected if it is in the module-level English stop-word set
    (`stop_words_`) or if it is exactly one of the characters in
    `string.punctuation`.
    """
    return token not in stop_words_ and token not in _PUNCTUATION
def clean_txt(text):
    """Normalize one article into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. remove apostrophes (so "it's" becomes "its" rather than two tokens);
      2. replace every run of digits / non-word characters with one space;
      3. lower-case and tokenize with `word_tokenize`;
      4. drop tokens rejected by `accept_words`;
      5. lemmatize the remaining tokens with the shared `wn` lemmatizer.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The kept, lemmatized tokens joined by single spaces.
    """
    text = re.sub("'", "", text)
    # Raw string: in a plain literal "\d" and "\W" are invalid escape sequences,
    # which Python reports with a warning. The pattern itself is unchanged.
    text = re.sub(r"[\d\W]+", " ", text)
    tokens = word_tokenize(text.lower())
    return " ".join(wn.lemmatize(token) for token in tokens if accept_words(token))

In [5]:
# Raw text of the article at row label 0, shown before cleaning so it can be
# compared with the clean_txt() result.
articles['article'][0]

'Carrying a lethal weapon onto church property was considered "inappropriate" under the policies of the Church of Jesus Christ of Latter-day Saints. Now, it\'s prohibited.\nThe tweaked language can be seen in the church\'s "Handbook 2," which says: "Churches are dedicated for the worship of God and as havens from the cares and concerns of the world. With the exception of current law enforcement officers, the carrying of lethal weapons on church property, concealed or otherwise, is prohibited."\nThe previous rule said the carrying of lethal weapons was inappropriate.\nLethal weapons include a number of possible items including guns, said Daniel Woodruff,a spokesman for the Church of Jesus Christ of Latter-day Saints.\nThe handbook update took effect in the first week of August, Woodruff said, but the change will be formally communicated to local Church leaders as new meetinghouse safety guidelines in the near future.\nThose leaders will then be responsible for sharing the guidelines wit

In [6]:
# The same article after clean_txt(): lower-cased, digits and punctuation
# stripped, stop words removed, remaining tokens lemmatized.
clean_txt(articles['article'][0])

'carrying lethal weapon onto church property considered inappropriate policy church jesus christ latter day saint prohibited tweaked language seen church handbook say church dedicated worship god haven care concern world exception current law enforcement officer carrying lethal weapon church property concealed otherwise prohibited previous rule said carrying lethal weapon inappropriate lethal weapon include number possible item including gun said daniel woodruff spokesman church jesus christ latter day saint handbook update took effect first week august woodruff said change formally communicated local church leader new meetinghouse safety guideline near future leader responsible sharing guideline member added change applies entire church due texas law change regarding firearm place worship letter referencing prohibition recently sent church leader texas shared member woodruff said next month new law go effect texas allow licensed handgun owner legally carry weapon place worship one str

In [7]:
# Additional features from TextBlob: subjectivity and polarity.
# Polarity is computed but not used as a model feature, because multinomial naive Bayes cannot take negative values.
from textblob import TextBlob
from sklearn.feature_extraction import DictVectorizer

def subj_txt(text):
    """Return the TextBlob subjectivity score of `text`."""
    sentiment = TextBlob(text).sentiment
    # Second field of the (polarity, subjectivity) sentiment tuple.
    return sentiment.subjectivity
def polarity_txt(text):
    """Return the TextBlob polarity score of `text` (may be negative)."""
    sentiment = TextBlob(text).sentiment
    # First field of the (polarity, subjectivity) sentiment tuple.
    return sentiment.polarity

In [8]:
# Score every raw (uncleaned) article with TextBlob and store the results as
# two new columns on `articles`. Each call builds its own TextBlob, so every
# article is analysed twice.
articles['subj'] = articles['article'].apply(subj_txt)
articles['pol'] = articles['article'].apply(polarity_txt)

In [9]:
# Inspect the spread of polarity scores, lowest first.
# NOTE(review): several scores appear in identical pairs (e.g. rows 24/90 and
# 93/27), which may indicate duplicated articles — confirm before modelling.
articles['pol'].sort_values(ascending = True)

24    -0.058333
90    -0.058333
76    -0.045475
10    -0.045475
45    -0.023294
         ...   
34     0.170934
100    0.170934
118    0.175443
93     0.237734
27     0.237734
Name: pol, Length: 133, dtype: float64

In [10]:
# Custom class for feature union
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns `data[key]` from its input.

    Used as the first step of each FeatureUnion branch so that every branch
    works on its own column(s) of the same input.

    Parameters
    ----------
    key : hashable or list
        Passed unchanged to `data_dict[...]` in transform(). In this notebook
        it is a column name ('article') or a list of column names (['subj']).
    """
    def __init__(self, key):
        # Stored as-is; only read in transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; return self so the object can be chained."""
        return self

    def transform(self, data_dict):
        """Return the entry of `data_dict` selected by `self.key`."""
        return data_dict[self.key]


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer.

    Produces one single-key dict per row, mapping 'sub' to that row's value
    in the 'subj' column.
    """

    def fit(self, x, y=None):
        """Nothing is learned; return self so the object can be chained."""
        return self

    def transform(self, data):
        """Return a list of `{'sub': value}` dicts, one per entry of data['subj'].

        Parameters
        ----------
        data : mapping of column name -> iterable
            Must provide a 'subj' entry (e.g. a DataFrame with a 'subj' column).
        """
        # Iterate the column directly: iterrows() materialises a full Series
        # for every row only to read one value from it.
        return [{'sub': subjectivity} for subjectivity in data['subj']]

In [11]:
# custom pipeline
from sklearn.pipeline import Pipeline
# Text branch: pick the 'article' column, clean each document with clean_txt,
# then build TF-IDF features over 1- to 3-word n-grams (at most 300 features).
article_features = Pipeline([
    ('selector', ItemSelector(key='article')),
    ('tfidf', TfidfVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1,3),
        max_features=300,
        preprocessor=clean_txt,
    )),
])

# Metadata branch: pick the 'subj' column, wrap each value in a dict,
# and let DictVectorizer turn the list of dicts into a feature matrix.
stats_features = Pipeline([
    ('selector', ItemSelector(key=['subj'])),
    ('stats', TextStats()),
    ('vect', DictVectorizer()),
])

# Final feature extractor: both branches side by side in one FeatureUnion.
pipeline = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('article', article_features),
        ('stats', stats_features),
    ])),
])

In [12]:
# features and labels
# X keeps the raw article text plus the subjectivity score; 'pol' is left out.
X = articles[['article','subj']]
# Target: the 'political leaning' string labels (still un-encoded here).
y = articles['political leaning']
print(y)

0           liberal
1           liberal
2           liberal
3           liberal
4           liberal
           ...     
128    conservative
129    conservative
130    conservative
131    conservative
132    conservative
Name: political leaning, Length: 133, dtype: object


In [13]:
# split training and testing samples
seed = 42
encoder = preprocessing.LabelEncoder()
# Encode from the source column instead of from `y` itself. With
# `y = encoder.fit_transform(y)`, re-running this cell refits the encoder on the
# integer codes, so encoder.classes_ no longer holds the label names.
y = encoder.fit_transform(articles['political leaning'])
print(y)
# 80/20 split with a fixed random_state so the partition is reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(X,y,test_size = 0.2, random_state = seed)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [14]:
# Fit the feature-extraction pipeline on the training split only. No labels are
# passed: the pipeline contains only transformers, no classifier.
pipeline.fit(x_train)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('article',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  ItemSelector(key='article')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
                

In [15]:
# Turn both splits into feature matrices using the pipeline fitted on x_train.
train_vec = pipeline.transform(x_train)
test_vec = pipeline.transform(x_test)

In [16]:
# a method to train model
def train_model(classifier, feature_vector_train, label, feature_vector_valid, label_valid=None):
    """Fit `classifier` and return its accuracy on a validation set.

    Parameters
    ----------
    classifier : estimator
        Object exposing `fit(X, y)` and `predict(X)`.
    feature_vector_train : array-like or sparse matrix
        Training features.
    label : array-like
        Training labels aligned with `feature_vector_train`.
    feature_vector_valid : array-like or sparse matrix
        Validation features to predict on.
    label_valid : array-like, optional
        True labels aligned with `feature_vector_valid`. When omitted, the
        notebook-level `y_test` is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    float
        Accuracy of the predictions on the validation set.
    """
    if label_valid is None:
        # Backward-compatible fallback to the notebook global.
        label_valid = y_test

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # accuracy_score takes (y_true, y_pred), in that order.
    return metrics.accuracy_score(label_valid, predictions)

In [17]:
# models to test
from sklearn.svm import LinearSVC
# Linear SVM with balanced class weights and a fixed random_state.
clf_sv = LinearSVC(C=1, class_weight='balanced', multi_class='ovr', random_state=40, max_iter=10000)
# Multinomial naive Bayes with default settings.
clf_nb = naive_bayes.MultinomialNB()

In [18]:
# cross val score of models
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy for each candidate classifier.
# NOTE(review): train_vec comes from a pipeline already fitted on all of
# x_train, so each fold's features were built with its held-out rows included —
# confirm whether that is acceptable for these scores.
cv = 5
clfs = [clf_sv, clf_nb]
summary_template = "Mean score: {0:.3f} (+/-{1:.3f})"
for clf in clfs:
    scores = cross_val_score(
        clf,
        train_vec,
        y_train,
        cv=cv,
        scoring="accuracy",
    )
    # Per-fold scores first, then their mean and standard deviation.
    print(scores)
    fold_mean = numpy.mean(scores)
    fold_std = numpy.std(scores)
    print(summary_template.format(fold_mean, fold_std))

[0.59090909 0.66666667 0.57142857 0.66666667 0.57142857]
Mean score: 0.613 (+/-0.044)
[0.59090909 0.66666667 0.57142857 0.66666667 0.66666667]
Mean score: 0.632 (+/-0.042)
