In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## ADVANCED TEXT MINING PART 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 16: Directory settings  ####

from pathlib import Path
# `home_dir` is the current user's home directory, as reported by `Path.home()`.
home_dir = Path.home()

# `main_dir` points at the `booz-allen-hamilton` folder, expected on the Desktop.
main_dir = home_dir.joinpath("Desktop", "booz-allen-hamilton")

# `data_dir` is the `data` subfolder inside `main_dir`.
data_dir = main_dir.joinpath("data")


In [None]:
#=================================================-
#### Slide 17: Loading packages  ####

# Helper packages.
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

# Packages with tools for text processing.
import nltk

# Packages for working with text data and analyzing sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Packages to build and measure the performance of a logistic regression model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing



In [None]:
#=================================================-
#### Slide 18: Working directory  ####

# Make the data directory the current working directory.
os.chdir(data_dir)
# Confirm the switch by printing the directory we are now in.
current_dir = os.getcwd()
print(current_dir)



In [None]:
#=================================================-
#### Slide 20: Text classification - classify snippets  ####

# Load the pickled cleaned text snippets from the data directory.
# `data_dir` is a `pathlib.Path`, so the file name is joined with `/`
# (`Path + str` raises a TypeError); the `with` block closes the file handle.
# NOTE: `pickle.load` can execute arbitrary code - only load files you trust.
with open(data_dir / 'NYT_clean_list.sav', "rb") as clean_list_file:
    cleaned_txt = pickle.load(clean_list_file)



In [None]:
#=================================================-
#### Slide 21: Text classification - classify snippets  ####

# Preview the first ten cleaned snippets.
print(cleaned_txt[:10])



In [None]:
#=================================================-
#### Slide 25: Text classification - classify (cont'd)  ####

# Create the `SentimentIntensityAnalyzer()` that scores every snippet below.
sid = SentimentIntensityAnalyzer()

# For each snippet, print the text followed by all of its scores on one line.
for sentence in cleaned_txt:
    print(sentence)
    ss = sid.polarity_scores(sentence)
    # Each score is rendered as "name: value, " and the pieces are concatenated.
    print(''.join('{0}: {1}, '.format(name, value) for name, value in ss.items()))



In [None]:
#=================================================-
#### Slide 27: Text classification - classify (cont'd)  ####

# This function outputs a list of sentiment labels, one per snippet.
def sentiment_analysis(texts):
    """Label each text as "positive" or "negative" from its compound polarity score.

    Args:
        texts: Iterable of strings to score.

    Returns:
        A list with one label per input text, in input order: "positive"
        when the compound score is >= 0 (so a score of exactly 0 is labelled
        "positive"), otherwise "negative". Empty input yields an empty list.
    """
    # Build the analyzer once and reuse it; it does not depend on the text.
    sid = SentimentIntensityAnalyzer()
    list_of_scores = []
    for text in texts:
        compound = sid.polarity_scores(text)["compound"]
        list_of_scores.append("positive" if compound >= 0 else "negative")
    # Return after the loop so every text is labelled, not just the first one.
    return list_of_scores
    
# Label every cleaned snippet, then preview the labels at indices 1 through 4
# (the slice starts at 1, so the label at index 0 is not shown).
score_labels = sentiment_analysis(cleaned_txt)
print(score_labels[1:5])



In [None]:
#=================================================-
#### Slide 28: Text classification - Load the DTM  ####

# Load the pickled document-term matrix (DTM) from the data directory.
# `data_dir` is a `pathlib.Path`, so the file name is joined with `/`
# (`Path + str` raises a TypeError); the `with` block closes the file handle.
# NOTE: `pickle.load` can execute arbitrary code - only load files you trust.
with open(data_dir / 'DTM_matrix.sav', "rb") as dtm_file:
    DTM_matrix = pickle.load(dtm_file)
# Convert the loaded matrix to an array via its `toarray()` method.
DTM_array = DTM_matrix.toarray()

# Let's look at a few rows (indices 1 through 3) of the finalized array.
print(DTM_array[1:4])



In [None]:
#=================================================-
#### Slide 33: Model building - split the dataset  ####

# Split features and labels together: 70% of the rows go to training,
# and the fixed random state makes the split repeatable.
data_splits = train_test_split(DTM_array, score_labels, train_size=0.70, random_state=1234)
X_train, X_test, y_train, y_test = data_splits



In [None]:
#=================================================-
#### Slide 34: Model building - split the dataset  ####

# Print the number of rows in each split, one per line, in this order:
# X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(len(split))



In [None]:
#=================================================-
#### Slide 36: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 46: Categorical to binary target variable  ####

# Create the label binarizer.
lb = preprocessing.LabelBinarizer()

# Learn the classes from y_test, then encode y_test in binary integer format.
lb.fit(y_test)
y_test = lb.transform(y_test)



In [None]:
#=================================================-
#### Slide 53: Logistic regression: build  ####

# Set up a logistic regression model with the default settings
# (it is not trained yet) and print its representation.
log_model = LogisticRegression()
print(log_model)



In [None]:
#=================================================-
#### Slide 54: Logistic regression: fit  ####

# Train the model on the training features and their labels.
log_model = log_model.fit(X_train, y_train)



In [None]:
#=================================================-
#### Slide 56: Logistic regression: predict (cont'd)  ####

# Predict on test data.
y_pred = log_model.predict(X_test)
print(y_pred)

# Convert y_pred to binary integer format with the binarizer that is already
# fitted on the true labels. Re-fitting on the predictions (`fit_transform`)
# would derive the classes from `y_pred` itself, so a model that predicts only
# one class would be encoded differently from `y_test`.
y_pred = lb.transform(y_pred)



In [None]:
#=================================================-
#### Slide 58: Exercise 2  ####





In [None]:
#=================================================-
#### Slide 68: Confusion matrix and accuracy  ####

# Confusion matrix on the test data: true labels against predicted labels.
conf_matrix_test = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix_test)

# Accuracy of the predictions on the test data.
test_accuracy_score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy on test data: ", test_accuracy_score)



In [None]:
#=================================================-
#### Slide 69: Classification report  ####

# Display names for the two classes, in the order they appear in the report.
target_names = ['Negative', 'Positive']

# Build the full classification report for the test predictions and print it.
class_report = metrics.classification_report(y_test, y_pred, target_names=target_names)
print(class_report)



In [None]:
#=================================================-
#### Slide 70: Classification report (cont'd)  ####

# Display the classification report stored in `class_report` again.
print(class_report)



In [None]:
#=================================================-
#### Slide 71: Getting probabilities instead of class labels  ####

# Get class probabilities instead of predicted labels.
test_probabilities = log_model.predict_proba(X_test)
print(test_probabilities[0:5, :])

# Keep only the second column: the probability of the class listed second
# in `log_model.classes_`.
test_predictions = test_probabilities[:, 1]
# Preview the extracted probabilities (not the full probability matrix).
print(test_predictions[0:5])



In [None]:
#=================================================-
#### Slide 72: Computing FPR, TPR and threshold  ####

# Compute the ROC curve from the test labels and the predicted probabilities,
# then unpack it into FPR, TPR and threshold values.
roc_values = metrics.roc_curve(y_true=y_test, y_score=test_predictions)
fpr, tpr, threshold = roc_values

print("False positive: ", fpr)
print("True positive: ", tpr)
print("Threshold: ", threshold)



In [None]:
#=================================================-
#### Slide 73: Computing AUC  ####

# Get AUC by providing the FPR and TPR, so the value describes the same
# probability-based ROC curve that `fpr` and `tpr` trace out. Scoring the hard
# 0/1 predictions (`y_pred`) instead would measure a single-threshold curve.
auc = metrics.auc(fpr, tpr)
print("Area under the ROC curve: ", auc)



In [None]:
#=================================================-
#### Slide 74: Putting it all together: ROC plot  ####

# Draw the model's ROC curve in blue, with its AUC shown in the legend.
auc_label = 'AUC = %0.2f' % auc
plt.plot(fpr, tpr, 'b', label = auc_label)
plt.legend(loc = 'lower right')

# Add the dashed red diagonal as a reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],'r--')

# Title and axis labels, then render the figure.
plt.title('Receiver Operator Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()



In [None]:
#=================================================-
#### Slide 76: Exercise 3  ####


