In [5]:
# core python
import io, os

# numerical/scientific computing
import random
import numpy as np

# data management
import pandas as pd

# machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# plotting
import matplotlib.pyplot as plt

# set envrionment
##root_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.dirname(os.path.abspath("."))
data_dir = os.path.join("..", "dat")

# MANAGING CLASSIFICATION DATA (from cl_build_data.py)
DATA = pd.read_csv(os.path.join(data_dir,"CLASS_DATA.csv"), index_col = 0)

/home/knielbo/Documents/projects/baoding_lab


In [8]:
## CLASS DIST AND BIAS
def printdist(DF):
    for label in set(DF['class']):
        print("number of " + label + ": {}".format(sum(DF['class'] == label)))

printdist(DATA)# BIASED
print("Accuracy for free {}".format(round(9461/float((9461+856)),2)))

number of old_testament: 9461
number of new_testament: 856
Accuracy for free 0.92


In [11]:
### UNBIAS DATA
def balance(df, n, classcol = 'class'):
    random.seed(1234)
    res = pd.DataFrame(columns = DATA.columns)
    C = list(set(df[classcol]))
    for c in C:
        idx = df[df[classcol] == c].index.tolist()
        df_c = df.loc[random.sample(idx, n),]# label based indexing
        res = res.append(df_c)
    return res.reindex(np.random.permutation(res.index))#shuffle order for classifirer

DATB = balance(DATA, 800)
printdist(DATB)
DATB.to_csv("../dat/CLASS_DATA_NONBIAS.csv")

number of old_testament: 800
number of new_testament: 800


In [17]:
# SPLIT DATA SET
ratio = .8
mask = np.random.rand(len(DATB)) <= ratio
TRAIN = DATB[mask]
TEST = DATB[~mask]

## training set
X_train = TRAIN['text'].values
y_train = TRAIN['class'].values
## test set
X_test = TEST['text'].values
y_test = TEST['class'].values

In [15]:
### INTERMEZZO: DOCUMENT REPRESENTATIONS AND UNDERSTANDING VECTORIZERS ###
vectorizer = CountVectorizer()# INSTANTIATE VECTORIZER
print(vectorizer)

TEXTS = ['This is the first document.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document? This is right.']

DTM = vectorizer.fit_transform(TEXTS)

print('DTM: {}'.format(DTM.todense()))
print('vocabulary (index): {}'.format(vectorizer.get_feature_names()))

# replace textmining with something more efficient
np.savetxt("../dat/DTM.csv", DTM.todense(), delimiter=",")
lexicon = vectorizer.get_feature_names()
with open('../dat/LEXICON.txt','w') as f:
    for i in lexicon:
        f.write("%s\n" % i)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
DTM: [[0 1 1 1 0 0 0 1 0 1]
 [0 1 0 1 0 0 2 1 0 1]
 [1 0 0 0 1 0 0 1 1 0]
 [0 1 1 2 0 1 0 1 0 2]]
vocabulary (index): ['and', 'document', 'first', 'is', 'one', 'right', 'second', 'the', 'third', 'this']


In [18]:
# FEATURE EXTRACTION FOR UNSTRUCTURED DATA
vectorizer = CountVectorizer(ngram_range = (1,2), stop_words = 'english',
    lowercase = True, max_df = .95, min_df = .01, max_features = 500)

FEAT_train = vectorizer.fit_transform(X_train)# fit vector space
FEAT_test =  vectorizer.transform(X_test)# !only transform, ignoring features not occurring in training set!
FEAT_names = vectorizer.get_feature_names()

In [19]:
# TRAIN CLASSIFIER
nb_classifier = MultinomialNB()
nb_classifier.fit(FEAT_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# EVALUATION
pred = nb_classifier.predict(FEAT_test)
confmat = metrics.confusion_matrix(y_test, pred)# horizontal: predicted label; vertical: true label
# obeserved accuracy
print("Accurracy: {}".format(round(metrics.accuracy_score(y_test, pred),2)))

# cohen's kappa
print("K: {}".format(metrics.cohen_kappa_score(y_test, pred)))
# model summary
print(metrics.classification_report(y_test, pred))

Accurracy: 0.86
K: 0.7319121447028424
               precision    recall  f1-score   support

new_testament       0.78      0.98      0.87       155
old_testament       0.98      0.76      0.86       177

     accuracy                           0.86       332
    macro avg       0.88      0.87      0.86       332
 weighted avg       0.89      0.86      0.86       332



In [21]:
## ADVANCED VALIDATION
### obtain class possiblities
y_scores = nb_classifier.fit(FEAT_train, y_train).predict_proba(FEAT_test)

### ROC and AUC for ROC
FPR, TPR, thresholds = metrics.roc_curve(y_test, y_scores[:,1], pos_label = 'old_testament')
AUC = round(metrics.auc(FPR, TPR),2)# alternative Accurracy measure

#### PLOT ROC
plt.title('ROC')
plt.plot(FPR, TPR, c='r', label=('AUC = {}'.format(AUC)))
plt.legend(loc='lower right', prop={'size':8})
plt.plot([0,1],[0,1], color='lightgrey', linestyle='--')
plt.xlim([-0.01,1.0])
plt.ylim([0.0,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('../fig/ROC.png', dpi = 300)
plt.close()

### precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_scores[:,1], pos_label = 'old_testament')
plt.title('PR Curve')
plt.plot(recall, precision, c='r')
plt.xlim([-0.01,1.0])
plt.ylim([0.0,1.01])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.savefig('../fig/PRC.png', dpi = 300)
plt.close()