In [1]:
%load_ext autoreload
%autoreload 2

In [171]:
import pandas as pd
import pickle
import os
import numpy as np
from nlp_surveillance.utils.my_utils import get_sentence_and_date_from_annotated_span
from nlp_surveillance.annotator import *
from nlp_surveillance.edb_clean import get_cleaned_edb
from nlp_surveillance.who_scraper import get_annotated_2018_whos
from nlp_surveillance.optimize_date_and_count import get_date_optimization_edb, _extract_sentences_from_spans
import datetime

# Testing

# WHO DF

In [None]:
parsed_whos_df = get_annotated_2018_whos()

### EDB 

In [None]:
edb = get_cleaned_edb()

## Optimize date

In [89]:
edb = get_date_optimization_edb(use_pickle=False)

HBox(children=(IntProgress(value=0, max=146), HTML(value='')))




HBox(children=(IntProgress(value=0, max=146), HTML(value='')))




In [91]:
from nlp_surveillance.utils.text_from_url import clean_text
edb['sentence'] = edb['sentence'].apply(clean_text)

In [322]:
edb = edb.reset_index(drop=True)

# Handmade Naïve Bayes

In [378]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [411]:
feature_matrix = cv.fit_transform(edb['sentence'].apply(
    lambda x: ' '.join(x)).tolist())
print(feature_matrix[0])

  (0, 1546)	1
  (0, 2713)	1
  (0, 2036)	1
  (0, 843)	1
  (0, 593)	1
  (0, 1473)	1
  (0, 1998)	1
  (0, 2483)	1
  (0, 430)	1
  (0, 1827)	1
  (0, 185)	1


In [455]:
feature_matrix.shape

(6587, 3252)

In [414]:
feature_names = cv.get_feature_names()
print(feature_names[8])

ability


In [415]:
feature_mapping = cv.vocabulary_
print(feature_mapping['ability'])

8


In [387]:
from collections import defaultdict
def get_label_index(labels):
    label_index = defaultdict(list)
    for index, label in enumerate(labels):
        label_index[label].append(index)
    return label_index

In [388]:
label_index = get_label_index(edb['is_label'].tolist())

In [420]:
def get_prior(label_index):
    prior = {label: len(index) for label, index 
            in label_index.items()}
    total_count = sum(prior.values())
    for label in prior:
        prior[label] /= float(total_count)
    return prior

In [421]:
prior = get_prior(dict(label_index))
print(prior)

{False: 0.9427660543494762, True: 0.05723394565052376}


In [437]:
np.asarray(feature_matrix[label_index[True], :].sum(axis=0))[0].

array([4, 2, 0, ..., 0, 0, 0], dtype=int64)

In [443]:
def get_likelihood(feature_matrix, label_index, smoothing=1):
    likelihood = {}
    for label, index in label_index.items():
        likelihood[label] = (feature_matrix[index, :].sum(axis=0)
                             + smoothing)
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_count = likelihood[label].sum()
        likelihood[label] = likelihood[label] / float(total_count)
    return likelihood

In [444]:
likelihood = get_likelihood(feature_matrix, label_index, 1)

In [445]:
def get_posterior(feature_matrix, prior, likelihood):
    num_example = feature_matrix.shape[0]
    posteriors = []
    for i in range(num_example):
        posterior = {key: np.log(prior_label) 
                     for key, prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            feature_matrix_vector = feature_matrix.getrow(i)
            counts = feature_matrix_vector.data
            indices = feature_matrix_vector.indices
            for count, index in zip(counts, indices):
                posterior[label] += np.log(likelihood_label[index]) * count
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label] = (np.exp(posterior[label]) 
                                    - min_log_posterior)
            except:
                posterior[label] = float('inf')
            sum_posterior = sum(posterior.values())
            for label in posterior:
                if posterior[label] == float('inf'):
                    posterior[label] = 1
                else:
                    posterior[label] /= sum_posterior
            posteriors.append(posterior.copy())
    return posteriors

In [446]:
test_matrx = (edb['sentence'].iloc[200])
test = cv.transform(test_matrx)

In [447]:
get_posterior(test, prior, likelihood)

[{False: 2034.7428620443238, True: -2033.7428620443238},
 {False: 0.9960331733578045, True: 0.003966826642195497},
 {False: 1078.4785507300758, True: -1077.4785507300758},
 {False: 0.9931892151659599, True: 0.006810784834040037},
 {False: 120.22158021229004, True: -119.22158021229004},
 {False: 0.9523939127212122, True: 0.04760608727878788},
 {False: 145482.13944258034, True: -145481.13944258034},
 {False: 0.9999154035727699, True: 8.459642723007814e-05}]

# Working Naïve Bayes

In [525]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [526]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

In [527]:
training_sentences = edb['sentence'].apply(lambda x: ' '.join(x))[::2]
training_label = edb['is_label'][::2]
text_clf.fit(training_sentences, training_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [532]:
test_sentences = edb['sentence'].apply(lambda x: ' '.join(x))[1::2]
test_label = edb['is_label'][1::2]

In [533]:
predicted = text_clf.predict(test_sentences)

In [534]:
from sklearn import metrics
print(metrics.classification_report(test_label, predicted))

              precision    recall  f1-score   support

       False       0.94      0.99      0.97      3102
        True       0.07      0.01      0.02       191

   micro avg       0.93      0.93      0.93      3293
   macro avg       0.51      0.50      0.49      3293
weighted avg       0.89      0.93      0.91      3293



In [535]:
metrics.confusion_matrix(test_label, predicted)

array([[3076,   26],
       [ 189,    2]])

## Apply SVM

In [542]:
from sklearn.linear_model import SGDClassifier
text_clf_2 = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', SGDClassifier(loss='hinge', penalty='l2', 
                                             alpha=1e-3, random_state=42,
                                             max_iter=10, tol=1e-3))])

In [543]:
text_clf_2.fit(training_sentences, training_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...om_state=42, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [544]:
predicted_2 = text_clf_2.predict(test_sentences)

In [545]:
print(metrics.classification_report(test_label, predicted_2))

              precision    recall  f1-score   support

       False       0.94      1.00      0.97      3102
        True       0.00      0.00      0.00       191

   micro avg       0.94      0.94      0.94      3293
   macro avg       0.47      0.50      0.49      3293
weighted avg       0.89      0.94      0.91      3293



  'precision', 'predicted', average, warn_for)


In [547]:
metrics.confusion_matrix(test_label, predicted_2)

array([[3102,    0],
       [ 191,    0]])