# Test Project: Classify federal register abstracts

## Step 0: Load libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the pre-defined English model:
# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_md')

**References:**    
https://stackoverflow.com/questions/53885198/using-spacy-as-tokenizer-in-sklearn-pipeline    
https://datascience.stackexchange.com/questions/43131/pipeline-with-linearsvm-and-lstm    
https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

## Step 1: Get data

In [14]:
# Read in a CSV file with a column of text abstracts. Keep only the columns we need.
df = pd.read_csv('resources/fedreg_18-05-22-14-45.csv')
df = df[['agency', 'abstract', 'type']]
df['abstract']=df['abstract'].astype(str) # Make sure all values are strings. There were some floats in here.
df = df.dropna(how='any') # get rid of missing data.
df=df.reset_index()
df.shape

(2000, 4)

In [15]:
# What are the top agencies in the dataset? This could also be a good target for the classifier.
df['agency'].value_counts().head()

Health and Human Services Department    254
Commerce Department                     224
Transportation Department               183
Homeland Security Department            155
Energy Department                       147
Name: agency, dtype: int64

In [16]:
# Create the target variable (note: we have imbalanced classes).
df['target'] = (df['agency'] == 'Health and Human Services Department')
df['target'].value_counts()

False    1746
True      254
Name: target, dtype: int64

In [17]:
# Establish the text variable
df['abstract'][0]

'We are superseding Airworthiness Directive (AD) 2017-11-03 for DG Flugzeugbau GmbH Model DG-500MB gliders that are equipped with a Solo 2625 02 engine modified with a fuel injection system following the instructions of Solo Kleinmoteren GmbH Technische Mitteilung 4600-3 and identified as Solo 2625 02i. This AD results from mandatory continuing airworthiness information (MCAI) issued by an aviation authority of another country to identify and correct an unsafe condition on an aviation product. The MCAI describes the unsafe condition as failure of the connecting rod bearing resulting from too much load on the rod bearings from the engine control unit. This AD adds a model to the applicability. We are issuing this AD to require actions to address the unsafe condition on these products.'

In [18]:
# Remove all other variables
df=df[['target', 'abstract']]

## Step 2: Preprocess and tokenize text data with `spacy`

In [19]:
# Preprocess and vectorize the text column.
df['tokens'] = df['abstract'].apply(lambda x: nlp(x))

In [21]:
# Display POS tagging for first abstract.
spacy.displacy.render(df['tokens'][1], style='ent',jupyter=True)

## Step 3: Split and Vectorize the data

In [22]:
# Establish our X and y
X = df['tokens']
y = df['target']

In [23]:
# Let's set aside 33% of our data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [48]:
# Creating a Spacy Parser
from spacy.lang.en import English
parser = English()

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" 
else word.lower_ for word in mytokens ]
    mytokens = [word for word in mytokens if word not in stopwords and 
word not in punctuations]
    return mytokens

In [52]:
 # Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)


## Step 4: Instantiate and Train a Logistic Regression Classifier

In [51]:
# train a logistic regression model
logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, 
                            fit_intercept=True, intercept_scaling=1, class_weight=None, 
                            random_state=None, solver='lbfgs', max_iter=100, multi_class='warn', 
                            verbose=0, warm_start=False, n_jobs=None)

In [46]:
# Fit the model to the training data
logreg.fit(X_train, y_train)

ValueError: setting an array element with a sequence.

In [None]:
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', 
                             lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(wines["processed_description"])

In [None]:
# make predictions for testing set
y_pred_class = logreg.predict(X_test)

## Step 5: Classify the test data

In [None]:
#Create another matrix of tfidf scores for the documents in the test set
tvec_test = vectorizer.transform(X_test)

In [None]:
# Predict on the test data
predictions=model_lr.predict(tvec_test)
predictions

In [None]:
# Probabilities
probabilities = model_lr.predict_proba(tvec_test)[:,1]
probabilities

In [None]:
# Score
model_lr.score(tvec_test, y_test)

In [None]:
# Convert each component to a pandas dateframe
df_probs=pd.DataFrame(probabilities, columns=['probabilities']).reset_index(drop=True)
df_preds=pd.DataFrame(predictions, columns=['predictions']).reset_index(drop=True)
df_Xtest=pd.DataFrame(X_test).reset_index(drop=True)
df_ytest=pd.DataFrame(y_test).reset_index(drop=True)
# Reset a new index because we removed all the training data but never reset the index, so it has gaps
# and drop=True gets rid of the old index

final=pd.concat([df_Xtest, df_ytest, df_probs, df_preds], axis=1)
final.head()

In [None]:
# Let's look at some that had high probability of being assigned to the target type=rule
final[(final['probabilities']<0.1)].head(5)

In [None]:
# Let's look at some that had high probability of NOT being assigned to the target agency
final[(final['probabilities']>0.9)].head(5)

In [25]:
# Let's look at some that had intermediate probability
final[(final['probabilities']!=0.0) & (final['probabilities']!=1.0)].head(10)

NameError: name 'final' is not defined

## Step 6: Evaluate Results
<p>Nice summary of different formulas for accuracy, precision, recall, etc 
<a href="http://www.damienfrancois.be/blog/files/modelperfcheatsheet.pdf">here</a></p>

In [None]:
#Write out results
final.head()

In [None]:
#Now let's see how well this model did, true positives, false positives, etc
def accuracy(tp, tn, fp, fn):
    return ((tp + tn)/(tp + tn + fp + fn))

def error_rate(tp, tn, fp, fn):
    return ((fp + fn)/ (tp + tn + fp + fn))

true_positives = len(final.loc[(final['target'] == True) & (final['predictions'] == True)])
false_positives = len(final.loc[(final['target'] == False) & (final['predictions'] == True)])
true_negatives = len(final.loc[(final['target'] == False) & (final['predictions'] == False)])
false_negatives = len(final.loc[(final['target'] == True) & (final['predictions'] == False)])
print("Results\nTrue Positives\tTrue_Negatives\tFalse_Positives\False_Negatives\n")
print("\t".join(map(str, [true_positives, true_negatives, false_positives, false_negatives])))

print("Classifier Accuracy: {}\n".format(accuracy(true_positives, true_negatives, 
                                                  false_positives, false_negatives)))
print("Classifier Error Rate: {}\n".format(error_rate(true_positives, true_negatives,
                                                      false_positives, false_negatives)))


### Pickle my results for use in dashboard

In [None]:
#First get the predicted class label for each document
predictions = model_lr.predict(tvec_test)
print("Accuracy score for your classifier: {:.3f}\n".format(model_lr.score(tvec_test, y_test)))
print("Error rate for your classifier: {:.3f}\n".format(1-model_lr.score(tvec_test, y_test)))
classifierStats = dict()
classifierStats['accuracy'] = model_lr.score(tvec_test, y_test)
classifierStats['errorRate'] = (1 - model_lr.score(tvec_test, y_test))

In [None]:
#Also store the predicted class probabilities
predictProbabilities = model_lr.predict_proba(tvec_test)
predictProbabilities[0]

In [None]:
def truth_value(myRow):
    if (myRow['ground_truth'] == True and myRow['predicted_value'] == True):
        return 'truePositive'
    elif (myRow['ground_truth'] == True and myRow['predicted_value'] == False):
        return 'falseNegative'
    elif (myRow['ground_truth'] == False and myRow['predicted_value'] == True):
        return 'falsePositive'
    elif (myRow['ground_truth'] == False and myRow['predicted_value'] == False):
        return 'trueNegative'
    else:
        return None

In [None]:
print(len(y_test))
print(len(predictions))

In [None]:
y_test.head()

In [None]:
print(type(predictions))
print(type(y_test))

In [None]:
y_test_array=y_test.values

In [None]:
print(type(predictions))
print(type(y_test_array))

In [None]:
# Define the results dataset
results = [(y_test_array[i], predictions[i]) for i in range(0,len(predictions))]

In [None]:
y_test_array

In [None]:
#Add in the email id, subject, and body, then truePos, falsePos, trueNeg, falseNeg, posProbability, negProbability
enrichedResults = pd.DataFrame.from_records(results, 
    columns = ['ground_truth', 'predicted_value'])

In [None]:
enrichedResults.head()

In [None]:
enrichedResults['truthValue'] = enrichedResults.apply(lambda row: truth_value(row), axis=1)
enrichedResults.head()

In [None]:
enrichedResults['abstract'] = X_test.tolist()
enrichedResults['posProbability'] = [prob[1] for prob in predictProbabilities]
enrichedResults['negProbability'] = [prob[0] for prob in predictProbabilities]
enrichedResults.head()

In [None]:
counts = enrichedResults['truthValue'].value_counts()
for i in range(0,len(counts)):
    classifierStats[counts.index[i]] = counts[i]
with open('classifierStats.pyc', 'wb') as f:
    pickle.dump(classifierStats, f)
f.close()
with open('classifierTestResults.pyc', 'wb') as f1:
    pickle.dump(enrichedResults, f1)
f1.close()

### Evaluate NB using metrics module

In [26]:
from sklearn import metrics
print(metrics.classification_report(y_test, predictions))

NameError: name 'predictions' is not defined

In [None]:
# Our ROC-AUC score measures the trade-off between specificity and sensitivity
from sklearn.metrics import roc_auc_score
roc_score=100*roc_auc_score(y_test, predictions)
print(roc_score)

### Which features are most informative?

In [None]:
# Create a dataframe of the features by importance
importance=pd.DataFrame(sorted(zip(model_lr.coef_[0], feature_names)), columns=['modelCoef', 'term'])
# Remember, the target is defined as df['type'] == 'Notice' rather than 'Rule'

In [None]:
importance.head()

In [None]:
# Features most associated with target=False
high5=pd.DataFrame(importance.head(5))
low5=pd.DataFrame(importance.tail(5))
top10=pd.concat([high5, low5], ignore_index=True)
top10.head()

In [None]:
# Features most associated with target=False
high50=pd.DataFrame(importance.head(50))
low50=pd.DataFrame(importance.tail(50))
top100=pd.concat([high50, low50], ignore_index=True)
top100.head()

In [None]:
top100.to_csv('termScores.csv', index=False)

In [27]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
ax = top10.set_index('term').plot(kind='bar', legend=False, fontsize=18, figsize=(15, 7))
plt.title('Features with greatest predictive power',  fontsize=19)

plt.xticks(rotation = 45,  fontsize=18)
plt.xlabel('Features least or most associated with target', fontsize=18)
plt.yticks(rotation = 0,  fontsize=18)
plt.ylabel('Coefficient', rotation=90,  fontsize=18)

ModuleNotFoundError: No module named 'seaborn'