In [1]:
##### text classification

In [2]:
import pandas as pd

In [3]:
# read the data into a pandas dataframe
import os
def data2df(path, label):
    """Load every regular file in a directory into a labelled DataFrame.

    Parameters
    ----------
    path : str
        Directory holding one document per file. A trailing path separator
        is optional.
    label : int
        Class label assigned to every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'file' (file name), 'text' (file contents) and 'class'
        (the given label); one row per regular file, ordered by file name.
    """
    file, text = [], []
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible
    for f in sorted(os.listdir(path)):
        full_path = os.path.join(path, f)
        # skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened as text
        if not os.path.isfile(full_path):
            continue
        # errors='ignore' drops undecodable bytes instead of raising;
        # the context manager closes the handle even if read() fails
        with open(full_path, 'r', encoding='utf-8', errors='ignore') as fhr:
            text.append(fhr.read())
        file.append(f)
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

# positive class (label 1) is read from the Pro folder,
# negative class (label 0) from the NonPro folder
dfpos = data2df(path='HealthProNonPro/Pro/', label=1)
dfneg = data2df(path='HealthProNonPro/NonPro/', label=0)

# stack the two frames row-wise (positives first), then peek at a random 0.5% of the rows
df = pd.concat(objs=[dfpos, dfneg], axis='index')
df.sample(frac=0.005)

Unnamed: 0,file,text,class
919,a54325.txt,I strongly urge people to avoid over the count...,0
603,ans1541.txt,There are many causes of shoulder-arm pain as ...,1
58,a24290.txt,LET YOUR DOG BITE IT AND SWING ON IT FOR 6 MO...,0
103,a24335.txt,i have the same problem and have been told hat...,0
363,ans1325.txt,The causes of back pain can usually be diagnos...,1
299,a24531.txt,Excercise more than you eat! Eat foods low in ...,0
1664,a7451.txt,I wouldnt mind knowing the answer to this ques...,0
207,a24439.txt,"If I know what it is thats stressing me out, I...",0
442,a24674.txt,The best way is to get them all together and j...,0
1274,a69406.txt,"Awwwww, poor baby - that sounds painful - GOOD...",0


In [4]:
def custom_tokenizer(doc):
    """Reduce a parsed document to a space-separated string of cleaned lemmas.

    `doc` is iterated token by token; every token must expose `lemma_`,
    `is_punct`, `is_space` and `is_stop`, and support `len()`.

    A token is kept only when it is at least 2 characters long and is not
    punctuation, whitespace or a stop word. Each kept token contributes its
    lower-cased lemma to the result.
    """
    # further filters that could be enabled here if needed: a part-of-speech
    # whitelist (token.pos_), vocabulary membership (token.text in nlp.vocab),
    # token.is_alpha, not token.is_digit
    kept = []
    for tok in doc:
        if len(tok) < 2:
            continue  # too short to be informative
        if tok.is_punct or tok.is_space or tok.is_stop:
            continue  # punctuation, whitespace or stop word
        kept.append(tok.lemma_.lower())

    # hand back the cleaned-up text as one string
    return ' '.join(kept)

In [5]:
# separate the feature column (document text) from the target column (class label)
X = df['text']
y = df['class']

In [6]:
# split data into train (80%) and test (20%)
from sklearn.model_selection import train_test_split
# a fixed random_state makes the split -- and therefore every score computed
# from it -- identical on each Restart & Run All; outputs saved from earlier
# unseeded runs may differ slightly from a fresh run
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# work on explicit copies of the four pieces returned by the split
Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [7]:
# run the training texts through spaCy and keep only the cleaned lemmas
import spacy
# the parser and NER components are disabled when loading the model
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(Xtrain.tolist())
clean_corpus = list(map(custom_tokenizer, corpus))
# note: the rebuilt Series gets a fresh 0..n-1 index
Xtrain = pd.Series(clean_corpus)
Xtrain.head()

0    accept beatiful mind good course get help wieg...
1                           yes diet aerobic anaerobic
2    cause feel weak diverse muscle problem polymyo...
3    condition totally unrelated blood pus discharg...
4    high dose steroid certainly lead sleep problem...
dtype: object

In [8]:
# build the two-stage pipeline: tf-idf features -> multinomial naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# stage 1: tf-idf vectorizer
# (sublinear_tf, lowercase, stop_words and token_pattern are not set here,
#  so they start from the library defaults)
tfidf_step = TfidfVectorizer(
    binary=False,                              # tf uses term counts, not 0/1 presence
    use_idf=True, smooth_idf=True,             # idf weighting, with smoothing
    norm='l2',                                 # l2-normalise each document vector
    min_df=1, max_df=1.0, max_features=None,   # no vocabulary pruning
    ngram_range=(1, 1),                        # unigrams only
)

# stage 2: multinomial naive Bayes
# (alpha is not set here, so it starts from the library default)
nb_step = MultinomialNB(
    fit_prior=True,    # learn class prior probabilities from the data
    class_prior=None,  # no user-supplied priors
)

# the step names double as prefixes for parameter keys such as 'np__alpha',
# so they are kept exactly as they are
clf = Pipeline(steps=[('tfidf', tfidf_step), ('np', nb_step)])

In [9]:
# setup grid search with cross validation
from sklearn.model_selection import GridSearchCV

# candidate values, keyed as <pipeline step name>__<parameter name>:
#  - smoothing strength of the naive Bayes step
#  - raw vs. sublinear term-frequency scaling in the tf-idf step
param_grid = {
    'np__alpha':[0.01, 0.1, 0.5, 1],
    'tfidf__sublinear_tf':[True, False]
}

# `iid` is intentionally not passed: the parameter was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
# From 0.22 on, scores are always the plain average across folds, which is
# what iid=False requested.
gscv = GridSearchCV(clf, param_grid, cv=4, return_train_score=False)

In [10]:
# fit every parameter combination on the train data via cross validation
gscv.fit(Xtrain, ytrain)

# report, in order: the best pipeline, its mean cross-validated score,
# the winning parameter values, and the full cross-validation results
for report in (gscv.best_estimator_, gscv.best_score_, gscv.best_params_, gscv.cv_results_):
    print(report, "\n")

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('np',
                 MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))],
         verbose=False) 

0.9388661202185793 

{'np__alpha': 0.1, 'tf

In [11]:
# give the held-out test texts the same spaCy clean-up as the train texts
import spacy
# the parser and NER components are disabled when loading the model
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(Xtest.tolist())
clean_corpus = list(map(custom_tokenizer, corpus))
# note: the rebuilt Series gets a fresh 0..n-1 index
Xtest = pd.Series(clean_corpus)
Xtest.head()

0                                  ummm sure like wear
1    middle ear connect throat tube like structure ...
2    trans fat type fat distinct type appear human ...
3    take contraceptive pill period delay prolong m...
4    urine pregnancy test best week sexual act advi...
dtype: object

In [12]:
# score the best pipeline found by the grid search on the test data
from sklearn import metrics

ypred = gscv.best_estimator_.predict(Xtest)

# printed in order: accuracy, confusion matrix, per-class classification report
for scorer in (metrics.accuracy_score, metrics.confusion_matrix, metrics.classification_report):
    print (scorer(ytest, ypred))

0.937244201909959
[[308  39]
 [  7 379]]
              precision    recall  f1-score   support

           0       0.98      0.89      0.93       347
           1       0.91      0.98      0.94       386

    accuracy                           0.94       733
   macro avg       0.94      0.93      0.94       733
weighted avg       0.94      0.94      0.94       733



In [13]:
# deriving the various metrics above:
#
# if for label=1:
#
#                 predicted
#                 0       1
# actual   0    478      52
#          1      4     565
#
# then:
#  - tn, fp, fn, tp = 478, 52, 4, 565
#  - precision, recall, f1score= 0.92, 0.99, 0.95
#

# flatten the 2x2 confusion matrix into its four cells
cells = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()
tn, fp, fn, tp = cells

# overall accuracy: correct predictions over all predictions
total = tp + tn + fp + fn
accuracy = (tp + tn)/total

# label 0 metrics - label 0 is treated as the class of interest here,
# so tn takes the place tp has in the label 1 formulas
support0 = tn + fp
precision0 = tn/(tn + fn)
recall0 = tn/support0
f1score0 = 2*tn/(2*tn + fp + fn)

# label 1 metrics
support1 = tp + fn
precision1 = tp/(tp + fp)
recall1 = tp/support1
f1score1 = 2*tp/(2*tp + fp + fn)

# micro average metrics - calculated globally by counting total tn, fp, fn, tp
# microprecision = (tn + tp)/(tn + fn + tp + fp)
# microrecall = (tn + tp)/(tn + fp + tp + fn)
# microf1score = (2*tn + 2*tp)/(2*tn + fp + fn + 2*tp + fp + fn)
# microsupport = tp + tn + fp + fn

# macro average metrics - unweighted mean of the per-label metrics
# macroprecision = (precision0+precision1)/2
# macrorecall = (recall0+recall1)/2
# macrof1score = (f1score0+f1score1)/2
# macrosupport = tp + tn + fp + fn

# print everything, rounded to 2 decimals: accuracy, raw cells, then one row per label
print (round(accuracy,2))
print (tn, fp, fn, tp)
for prec, rec, f1 in ((precision0, recall0, f1score0), (precision1, recall1, f1score1)):
    print (round(prec,2), round(rec,2), round(f1,2)) # support0 / support1 could be appended here
# print (round(microprecision,2), round(microrecall,2), round(microf1score,2), microsupport)
# print (round(macroprecision,2), round(macrorecall,2), round(macrof1score,2), macrosupport)

0.94
308 39 7 379
0.98 0.89 0.93
0.91 0.98 0.94
