In [178]:
import json
import sklearn
import numpy
import random
import scipy
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.metrics
import sklearn.utils
import string
import io

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger
import matplotlib.pyplot as plt

import os
java_path = "C:/Program Files/Java/jdk1.8.0_131/bin/java.exe"
os.environ['JAVAHOME'] = java_path

In [67]:
with open('articles_with_texts_final.json') as data_file:
    articles = json.load(data_file)

In [68]:
article_keys = articles[u'data'].keys()
vectorizer = \
sklearn.feature_extraction.text.TfidfVectorizer(sublinear_tf=True, 
                                                max_df=0.5, 
                                                stop_words='english')
_ = vectorizer.fit(articles[u'data'][k][u'text'] for k in article_keys)

In [69]:
T_keys = [k for k in article_keys if (articles[u'data'][k]['most_emailed'] or articles[u'data'][k]['most_shared'] or \
                articles[u'data'][k]['most_viewed'])]
F_keys = [k for k in article_keys if not (articles[u'data'][k]['most_emailed'] or articles[u'data'][k]['most_shared'] or \
                articles[u'data'][k]['most_viewed'])]

In [215]:
#def feature1(c_articles) : #c_articles is [articles[u'data'][k] for k in keys]

#author_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
#_ = author_vectorizer.fit(string.replace(string.replace(ca[u'byline'][u'original'], ' ', ''), '.', '') if ca[u'byline'] != [] else "" for ca in [articles[u'data'][k] for k in article_keys])

def author(cas):
    return author_vectorizer.transform(string.replace(string.replace(ca[u'byline'][u'original'], ' ', ''), '.', '') if ca[u'byline'] != [] else "" for ca in cas)
    
# truth with noise
def truth(cas):
    if len(cas) == 4000 :
        return numpy.reshape([(1 if random.randint(1,4) > 1 else 0) if k in T_keys else (1 if random.randint(1,4) == 1 else 0) for k in article_keys[:4000]], (-1, 1))
    else :
        return numpy.reshape([(1 if random.randint(1,4) > 1 else 0) if k in T_keys else (1 if random.randint(1,4) == 1 else 0) for k in article_keys[4000:]], (-1, 1))

st = StanfordNERTagger('C:\stanford-ner\classifiers\english.all.3class.distsim.crf.ser.gz', 'C:\stanford-ner\stanford-ner.jar', encoding='utf-8')
    
def NE_count(text):
    tokenized_text = word_tokenize(text)
    t = st.tag(tokenized_text)
    n_persons = 0.0
    n_org = 0.0
    n_loc = 0.0
    if len(t) == 0 :
        return [0.0, 0.0, 0.0]
    (a, b) = t[0]
    if b == u'PERSON' :
        n_persons += 1
    elif b == u'LOCATION' :
        n_loc += 1
    elif b == u'ORGANIZATION' :
        n_org += 1
    prev = b
    for (a, b) in t[1:] :
        if b != prev :
            prev = b
            if b == u'PERSON' :
                n_persons += 1
            elif b == u'LOCATION' :
                n_loc += 1
            elif b == u'ORGANIZATION' :
                n_org += 1
    return [n_persons/len(tokenized_text), n_loc/len(tokenized_text), n_org/len(tokenized_text)]

def named_entity(cas):
    return numpy.matrix([NE_count(ca[u'text']) for ca in cas])

features = [named_entity]



def getX(key_list):
    x = vectorizer.transform(articles[u'data'][k][u'text'] for k in key_list)
    c_articles = [articles[u'data'][k] for k in key_list]
    for f in features :
        x_new = f(c_articles)
        x = scipy.sparse.hstack([x, x_new])
    return x

In [216]:
#[string.replace(string.replace(ca[u'byline'][u'original'], ' ', ''), '.', '') if ca[u'byline'] != [] else "" for ca in [articles[u'data'][k] for k in article_keys[:4000]]]

In [217]:
X_train = getX(article_keys[:4000])
y_train = [k in T_keys for k in article_keys[:4000]]
X_dev = getX(article_keys[4000:])
y_dev = [k in T_keys for k in article_keys[4000:]]

In [218]:
classifier = \
sklearn.linear_model.SGDClassifier(loss="log",
                                   penalty="elasticnet",
                                   n_iter=5)

_ = classifier.fit(X_train, y_train)

In [219]:
pred = classifier.predict(X_dev)
print sklearn.metrics.accuracy_score(y_dev, pred)

0.760755508919


In [220]:
index_to_word = vectorizer.get_feature_names()
best_elts = numpy.argpartition(classifier.coef_, classifier.coef_.size - 20)[0][-20:]
best_words = [index_to_word[x] if x < len(index_to_word) else x-len(index_to_word) for x in best_elts]
best_words

[u'victims',
 u'dr',
 u'according',
 u'millions',
 u'netflix',
 u'twitter',
 u'far',
 u'ran',
 u'told',
 u'obama',
 u'obamacare',
 u'statement',
 u'white',
 u'interview',
 u'trump',
 u'asked',
 u'reported',
 u'poor',
 u'designed',
 u'replace']

In [221]:
worst_elts = numpy.argpartition(classifier.coef_, 19)[0][:20]
worst_words = [index_to_word[x] if x < len(index_to_word) else x-len(index_to_word) for x in worst_elts]
worst_words

[u'york',
 u'_____',
 1,
 0,
 u'march',
 u'article',
 u'play',
 u'season',
 u'prison',
 u'china',
 u'puzzle',
 u'writer',
 u'european',
 u'editor',
 u'trial',
 u'com',
 u'april',
 u'bank',
 u'film',
 u'blasio']

In [222]:
author([articles[u'data'][k] for k in T_keys])

<1687x1652 sparse matrix of type '<type 'numpy.int64'>'
	with 1783 stored elements in Compressed Sparse Row format>

In [223]:
author([articles[u'data'][k] for k in F_keys])

<3266x1652 sparse matrix of type '<type 'numpy.int64'>'
	with 2859 stored elements in Compressed Sparse Row format>

In [224]:
NE_train = X_train.tocsc()[:,-1]

In [225]:
NE_dev = X_dev.tocsc()[:,-1]

In [226]:
NE_count(articles[u'data'][T_keys[0]][u'text'])

[0.02005730659025788, 0.02148997134670487, 0.02148997134670487]

In [227]:
NE_train

<4000x1 sparse matrix of type '<type 'numpy.float64'>'
	with 3398 stored elements in Compressed Sparse Column format>

In [228]:
NE_dev

<953x1 sparse matrix of type '<type 'numpy.float64'>'
	with 802 stored elements in Compressed Sparse Column format>

In [229]:
with open("NE_count_dev.txt", 'w') as f :
    for i in NE_dev:
        f.write(str(i))
        f.write('\n')

In [230]:
tokenized_text = word_tokenize(articles[u'data'][article_keys[0]][u'text'])
t = st.tag(tokenized_text)

In [231]:
t

[(u'Although', u'O'),
 (u'members', u'O'),
 (u'and', u'O'),
 (u'supporters', u'O'),
 (u'of', u'O'),
 (u'the', u'O'),
 (u'Islamic', u'ORGANIZATION'),
 (u'State', u'ORGANIZATION'),
 (u'have', u'O'),
 (u'frequently', u'O'),
 (u'commented', u'O'),
 (u'on', u'O'),
 (u'President', u'O'),
 (u'Trump', u'PERSON'),
 (u'and', u'O'),
 (u'his', u'O'),
 (u'policies', u'O'),
 (u',', u'O'),
 (u'analysts', u'O'),
 (u'have', u'O'),
 (u'been', u'O'),
 (u'puzzled', u'O'),
 (u'by', u'O'),
 (u'the', u'O'),
 (u'terrorist', u'O'),
 (u'group\u2019s', u'O'),
 (u'official', u'O'),
 (u'silence', u'O'),
 (u'about', u'O'),
 (u'him', u'O'),
 (u'.', u'O'),
 (u'Mr.', u'O'),
 (u'Trump', u'PERSON'),
 (u'had', u'O'),
 (u'not', u'O'),
 (u'been', u'O'),
 (u'mentioned', u'O'),
 (u'in', u'O'),
 (u'any', u'O'),
 (u'of', u'O'),
 (u'the', u'O'),
 (u'group\u2019s', u'O'),
 (u'official', u'O'),
 (u'media', u'O'),
 (u'\u2014', u'O'),
 (u'all', u'O'),
 (u'frequent', u'O'),
 (u'venues', u'O'),
 (u'for', u'O'),
 (u'criticism', u'O'),