In [None]:
import json
import sklearn
import numpy
import random
import scipy
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.metrics
import sklearn.utils
import string
import io

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger
import matplotlib.pyplot as plt

import os
# NLTK's Stanford wrappers locate the Java runtime through the JAVAHOME
# environment variable, so point it at the local JDK before building the tagger.
# NOTE(review): these are machine-specific absolute paths; adjust per install.
java_path = "C:/Program Files/Java/jdk1.8.0_131/bin/java.exe"
os.environ['JAVAHOME'] = java_path

# Raw string literals keep the Windows backslashes literal. Without the `r`
# prefix, sequences such as `\s`, `\c` and `\e` are invalid escapes that only
# happen to survive unchanged (and trigger warnings on newer interpreters).
st = StanfordNERTagger(
    r'C:\stanford-ner\classifiers\english.all.3class.distsim.crf.ser.gz',
    r'C:\stanford-ner\stanford-ner.jar',
    encoding='utf-8')


In [None]:
# Read the whole article corpus into memory; later cells index it as
# articles[u'data'][<article key>].
with open('articles_with_texts_final.json') as corpus_file:
    articles = json.loads(corpus_file.read())

In [None]:
# Materialise the article keys as a real list: later cells slice it
# ([:4000] / [4000:]), which a dict key *view* does not support.
article_keys = list(articles[u'data'].keys())

# TF-IDF over the article body text, with sub-linear term-frequency scaling,
# English stop words removed and max_df=0.5 as the document-frequency cut-off.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english')

# The vocabulary is fitted on the first 4000 articles only, the same slice the
# later cells use as the training split.
_ = vectorizer.fit(articles[u'data'][k][u'text'] for k in article_keys[:4000])

In [None]:
def _is_popular(key):
    """Return True when the article is flagged most-emailed, most-shared or most-viewed."""
    entry = articles[u'data'][key]
    return bool(entry['most_emailed'] or entry['most_shared'] or entry['most_viewed'])

# Positive class: articles carrying at least one popularity flag.
T_keys = [k for k in article_keys if _is_popular(k)]
# Negative class: articles carrying none of them.
F_keys = [k for k in article_keys if not _is_popular(k)]

In [None]:
#def feature1(c_articles) : #c_articles is [articles[u'data'][k] for k in keys]

#author_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
#_ = author_vectorizer.fit(string.replace(string.replace(ca[u'byline'][u'original'], ' ', ''), '.', '') if ca[u'byline'] != [] else "" for ca in [articles[u'data'][k] for k in article_keys])

def author(cas):
    """Vectorize the byline of each article in ``cas``.

    Each article's ``byline['original']`` string has its spaces and periods
    stripped so it becomes a single token; an article whose ``byline`` is an
    empty list contributes an empty string.

    cas -- iterable of article dicts (entries of ``articles[u'data']``).

    Returns whatever ``author_vectorizer.transform`` returns for those tokens.

    NOTE(review): ``author_vectorizer`` is only defined in commented-out code
    in this notebook, so calling this function raises NameError until that
    vectorizer is created and fitted again.
    """
    def _byline_token(ca):
        # Collapse the byline into one whitespace-free token.
        byline = ca[u'byline']
        if byline == []:
            return ""
        # str methods replace the deprecated string.replace() module function.
        return byline[u'original'].replace(' ', '').replace('.', '')

    return author_vectorizer.transform(_byline_token(ca) for ca in cas)
    
# truth with noise
def truth(cas):
    """Return the popularity label of each article as a noisy (n, 1) column.

    Only ``len(cas)`` is inspected: a length of exactly 4000 selects the first
    4000 article keys, any other length selects the remaining keys.

    For every selected key one ``random.randint(1, 4)`` draw is made. A key in
    ``T_keys`` yields 1 when the draw is greater than 1 (3 chances in 4); any
    other key yields 1 only when the draw equals 1 (1 chance in 4).
    """
    if len(cas) == 4000:
        selected_keys = article_keys[:4000]
    else:
        selected_keys = article_keys[4000:]

    noisy_labels = []
    for k in selected_keys:
        roll = random.randint(1, 4)
        if k in T_keys:
            noisy_labels.append(1 if roll > 1 else 0)
        else:
            noisy_labels.append(1 if roll == 1 else 0)
    return numpy.reshape(noisy_labels, (-1, 1))

# Path of the cached named-entity ratios read by named_entity(); each line is
# "<article key> <value> <value> <value>". When this is None, named_entity()
# recomputes the ratios with NE_count() instead of reading the file.
NE_count_file = 'NE_counts.txt'
    
def NE_count(text):
    """Return named-entity densities of ``text`` as [persons, locations, organizations].

    The text is tokenized with ``word_tokenize`` and tagged with the module-level
    Stanford tagger ``st``. A maximal run of consecutive tokens sharing the same
    tag counts as a single entity, so adjacent entities of the same type are
    merged. Each count is divided by the number of tokens; when the tagger
    returns nothing the result is [0.0, 0.0, 0.0].
    """
    tokens = word_tokenize(text)
    tagged = st.tag(tokens)
    if not tagged:
        return [0.0, 0.0, 0.0]

    # Float counters so the final division is a true division on Python 2 too.
    runs = {u'PERSON': 0.0, u'LOCATION': 0.0, u'ORGANIZATION': 0.0}
    previous_label = None
    for _token, label in tagged:
        # Only the first token of a run (tag differs from the previous one) counts.
        if label != previous_label:
            previous_label = label
            if label in runs:
                runs[label] += 1

    n_tokens = len(tokens)
    return [runs[u'PERSON'] / n_tokens,
            runs[u'LOCATION'] / n_tokens,
            runs[u'ORGANIZATION'] / n_tokens]

def named_entity(keys):
    """Return a matrix with one row of three named-entity ratios per article key.

    keys -- iterable of article keys (keys of ``articles[u'data']``).

    When ``NE_count_file`` is set, the ratios are read from that file, whose
    lines have the form ``<key> <float> <float> <float>``; blank lines are
    skipped. Otherwise each article's text is tagged on the fly by ``NE_count``.

    Raises KeyError if a requested key is missing from the cache file, and
    IndexError / ValueError if a non-blank line is malformed.
    """
    if NE_count_file is None:
        return numpy.matrix([NE_count(articles[u'data'][key][u'text']) for key in keys])

    NE_count_dict = {}
    with open(NE_count_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline) in the cache.
                continue
            # Keep the key as the text that was read: encoding it would yield
            # bytes on Python 3, which never match the JSON-derived keys.
            NE_count_dict[fields[0]] = [float(fields[1]), float(fields[2]), float(fields[3])]
    return numpy.matrix([NE_count_dict[key] for key in keys])

# Extra feature extractors whose output getX() stacks to the right of the
# TF-IDF matrix.
# `features`: callables that getX() invokes with the list of article dicts
# (none enabled at the moment).
features = []
# `features_k`: callables that getX() invokes with the list of article keys.
features_k = [named_entity]


def getX(key_list):
    """Build the feature matrix for the articles identified by ``key_list``.

    The TF-IDF vectors of the article texts come first. The output of every
    extractor in ``features`` (called with the article dicts) and then of every
    extractor in ``features_k`` (called with the keys) is appended column-wise,
    in that order.
    """
    x = vectorizer.transform(articles[u'data'][k][u'text'] for k in key_list)
    c_articles = [articles[u'data'][k] for k in key_list]

    # Pair each extractor with the argument it expects, preserving the order.
    extractor_calls = [(f, c_articles) for f in features]
    extractor_calls += [(f, key_list) for f in features_k]

    for extractor, argument in extractor_calls:
        x = scipy.sparse.hstack([x, extractor(argument)])
    return x

In [None]:
#[string.replace(string.replace(ca[u'byline'][u'original'], ' ', ''), '.', '') if ca[u'byline'] != [] else "" for ca in [articles[u'data'][k] for k in article_keys[:4000]]]

In [None]:
# Membership tests against the T_keys list are linear, which made building the
# labels quadratic; a set gives the same booleans with constant-time lookups.
_popular_keys = set(T_keys)

# First 4000 articles form the training split, the remainder the dev split.
X_train = getX(article_keys[:4000])
y_train = [k in _popular_keys for k in article_keys[:4000]]
X_dev = getX(article_keys[4000:])
y_dev = [k in _popular_keys for k in article_keys[4000:]]

In [None]:
# Logistic-regression loss trained with SGD and an elastic-net penalty.
# NOTE(review): `n_iter` and loss="log" are spelled differently in later
# scikit-learn releases -- confirm against the installed version before upgrading.
sgd_options = dict(loss="log", penalty="elasticnet", n_iter=5)
classifier = sklearn.linear_model.SGDClassifier(**sgd_options)

# Assign the return value so the fitted estimator's repr is not echoed.
_ = classifier.fit(X_train, y_train)

In [None]:
# Predict on the dev split and report plain accuracy against y_dev.
pred = classifier.predict(X_dev)
# Function-call form prints the same value on Python 2 and is valid on Python 3.
print(sklearn.metrics.accuracy_score(y_dev, pred))

In [None]:
# Column index -> vocabulary term for the TF-IDF part of the feature matrix.
index_to_word = vectorizer.get_feature_names()
n_vocab = len(index_to_word)

# Partition the coefficient row so its last 20 positions hold the indices of
# the 20 largest weights (in no particular order among themselves).
coef_count = classifier.coef_.size
best_elts = numpy.argpartition(classifier.coef_, coef_count - 20)[0][-20:]

# Indices past the vocabulary belong to the extra stacked feature columns;
# report those as their offset within the extra columns instead of a word.
best_words = []
for x in best_elts:
    if x < n_vocab:
        best_words.append(index_to_word[x])
    else:
        best_words.append(x - n_vocab)
best_words

In [None]:
# Partition the coefficient row so its first 20 positions hold the indices of
# the 20 smallest weights (in no particular order among themselves).
worst_elts = numpy.argpartition(classifier.coef_, 19)[0][:20]

# Indices past the vocabulary belong to the extra stacked feature columns;
# report those as their offset within the extra columns instead of a word.
vocab_len = len(index_to_word)
worst_words = []
for x in worst_elts:
    if x < vocab_len:
        worst_words.append(index_to_word[x])
    else:
        worst_words.append(x - vocab_len)
worst_words

In [None]:
#with open('NE_counts.txt', 'w') as f:
#    for i in range(4000):
#        f.write(article_keys[i] + ' ' + str(NE_train[i][0]) + ' ' + str(NE_train[i][1]) + ' ' + str(NE_train[i][2]) + '\n')
#    for i in range(len(NE_dev)):
#        f.write(article_keys[4000+i] + ' ' + str(NE_dev[i][0]) + ' ' + str(NE_dev[i][1]) + ' ' + str(NE_dev[i][2]) + '\n')

In [None]:
# Rebuild the feature matrix for the first 4000 articles and show it as the
# cell output; the result is not assigned to any variable.
getX(article_keys[:4000])