In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import copy
from multiprocessing import Pool
from sklearn.metrics import roc_curve, auc
import sklearn
import time
import utils
import os

In [None]:
import pickle

# Load the pre-split corpus and remember where the training portion ends so
# the combined lists can be sliced back apart later.
text_train, label_train, text_test, label_test = utils.load_raw_data_new()
train_len = len(text_train)

# NOTE: extend() grows the training lists in place, so from here on
# `text_train` / `label_train` are the very same objects as
# `text_total` / `label_total` and hold train followed by test.
text_train.extend(text_test)
text_total = text_train
label_train.extend(label_test)
label_total = label_train

# presumably turns each entry back into a single document string --
# TODO confirm against utils.back_to_doc
new_text_total = utils.back_to_doc(text_total)

In [None]:
import spacy

# Load the 'en' spaCy pipeline with the "parser" and "ner" components disabled.
nlp = spacy.load('en', disable=["parser", "ner"])
def lemma(doc):
    """Return `doc` rewritten as a string of token lemmas.

    The text is run through the module-level `nlp` pipeline. Every kept lemma
    is preceded by one space (so a non-empty result starts with a space), and
    tokens whose lemma is the '-PRON-' placeholder are dropped.
    """
    kept_lemmas = (tok.lemma_ for tok in nlp(doc) if tok.lemma_ != '-PRON-')
    return ''.join(' ' + text for text in kept_lemmas)

In [None]:
from tqdm import tqdm

# Lemmatize every document of the combined corpus, preserving order so the
# result can later be sliced at `train_len`; tqdm only adds a progress bar.
lemmatized_text = [lemma(raw_doc) for raw_doc in tqdm(new_text_total)]

In [None]:
# Cut the combined (lemmatized) corpus and its labels back into the original
# train / test partitions at the boundary recorded in `train_len`.
new_text_train, new_text_test = lemmatized_text[:train_len], lemmatized_text[train_len:]
label_train, label_test = label_total[:train_len], label_total[train_len:]

In [None]:
def _svm_pipeline(global_idf, ngram_range=(1, 2)):
    """Train a linear SVM on TF-IDF features and report test-set metrics.

    Reads the notebook-level `new_text_train`, `new_text_test`, `label_train`
    and `label_test`. Prints the shape of the training matrix, a
    classification report, the confusion matrix and the ROC AUC.

    Parameters
    ----------
    global_idf : bool
        If truthy, the vocabulary / IDF weights are fitted on the training
        and test documents together; otherwise on the training documents only.
    ngram_range : tuple of (int, int), default (1, 2)
        N-gram range forwarded to ``TfidfVectorizer``. The default reproduces
        the previously hard-coded setting.

    Returns
    -------
    float
        ROC AUC of the second probability column (``classes_[1]``) on the
        test documents.
    """
    Tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=5,
                                       stop_words='english', max_df=0.8)
    #--------------------Vectorize----------------------
    # Use the idf of whole text or only the idf of training dataset.
    if global_idf:
        # Fit on the same lemmatized documents that are transformed below.
        # `new_text_total` holds the un-lemmatized documents, so fitting on it
        # would learn a vocabulary that does not match the transformed text.
        fit_corpus = new_text_train + new_text_test
    else:
        fit_corpus = new_text_train
    Tfidf_vectorizer = Tfidf_vectorizer.fit(fit_corpus)
    X_train = Tfidf_vectorizer.transform(new_text_train)
    y_train = label_train
    X_test = Tfidf_vectorizer.transform(new_text_test)
    y_test = label_test
    print(np.shape(X_train))
    #--------------------Training----------------------
    # `probability` must be a real bool: recent scikit-learn releases validate
    # constructor parameters and reject the integer 1.
    SVC_clf = svm.SVC(class_weight='balanced', kernel='linear', probability=True)
    SVC_clf.fit(X_train, y_train)
    #--------------------Evaluation----------------------
    # Hard predictions are taken from the calibrated probabilities. argmax
    # yields a column index, so map it through classes_ to get actual labels
    # (identical to the index when the labels are 0/1).
    y_pred_proba = SVC_clf.predict_proba(X_test)
    y_pred = SVC_clf.classes_[np.argmax(y_pred_proba, axis=-1)]

    print(metrics.classification_report(y_test, y_pred, target_names=['No_pressure_ulcer','pressure_ulcer']))
    print(metrics.confusion_matrix(y_test, y_pred))

    # Column 1 is the probability of classes_[1]; compute the AUC once and
    # reuse it for both the printout and the return value.
    y_pred_prob2 = y_pred_proba[:, 1]
    roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_prob2)
    print(roc_auc)
    return roc_auc

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# NOTE(review): `pipeline_helper` does not appear to be part of stock
# scikit-learn; PipelineHelper is normally distributed as the separate
# `pipelinehelper` package -- confirm this import path exists in the
# environment used to run the notebook.
from sklearn.pipeline_helper import PipelineHelper

# Grid search over vectorizer type/settings and Naive Bayes variant, using
# 5-fold cross-validation on the full corpus.
X = new_text_total
Y = label_total

# Each PipelineHelper step holds interchangeable candidates; the grid picks
# one candidate (and its parameters) per step via `selected_model`.
# The two vectorizer entries previously lacked their closing parenthesis,
# which made the whole cell a SyntaxError.
pipe = Pipeline([
    ('vectorizer', PipelineHelper([
        ('count', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfVectorizer(stop_words='english')),
    ])),
    ('classifier', PipelineHelper([
        ('Multinomial', MultinomialNB()),
        ('Complement', ComplementNB()),
    ])),
])

params = {
    'vectorizer__selected_model': pipe.named_steps['vectorizer'].generate({
        'count__ngram_range': [(1,1),(1,2),(1,3)],
        'count__min_df': [5,10],
        'count__max_df': [0.4,0.6,0.8],
        'tfidf__ngram_range': [(1,1),(1,2),(1,3)],
        'tfidf__min_df':[5,10],
        'tfidf__max_df':[0.4,0.6,0.8],
    }),
    'classifier__selected_model': pipe.named_steps['classifier'].generate({
        'Multinomial__alpha': [1.0],
        'Complement__alpha': [1.0],
    })
}
grid = GridSearchCV(pipe, params, scoring='accuracy', verbose=1, cv=5, n_jobs=-1)
grid.fit(X, Y)
print(grid.best_params_)
print(grid.best_score_)


In [None]:
# Show the best parameter combination found by the fitted grid search again.
print(grid.best_params_)

In [None]:
# Baseline: unigram TF-IDF features + multinomial Naive Bayes, evaluated on a
# random 80/20 split of the combined corpus.
# The token pattern keeps runs of 2+ lowercase letters with an optional single
# digit on either side.
Tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=10, stop_words='english',
                                   token_pattern=r"(?u)\b\d?[a-z]{2,}\d?\b", max_df=0.8)
# NOTE(review): the vectorizer is fitted on the whole corpus before the split,
# so document-frequency statistics include the held-out documents.
Tfidf_vec = Tfidf_vectorizer.fit_transform(new_text_total)
print(np.shape(Tfidf_vec))
X_train, X_test, y_train, y_test = train_test_split(Tfidf_vec, label_total, test_size=0.2, random_state=41)
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Class names match the other reports in this notebook (the positive label
# used to be misspelled 'ressure_ulcer').
print(metrics.classification_report(y_test, y_pred, target_names=['No_pressure_ulcer','pressure_ulcer']))
print(metrics.confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve, auc
import sklearn
import matplotlib.pyplot as plt

# ROC analysis for the Naive Bayes classifier fitted in the previous cell.
# `y_pred_proba` was never defined at notebook scope (it only exists as a
# local inside the SVM helper functions), so on a fresh kernel this cell
# raised NameError. Compute it here from `clf` and the matching `X_test`.
y_pred_proba = clf.predict_proba(X_test)
y_pred_prob2 = y_pred_proba[:, 1]  # probability of clf.classes_[1]
fpr, tpr, thres = roc_curve(y_test, y_pred_prob2)
print(sklearn.metrics.roc_auc_score(y_test, y_pred_prob2))

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
# Label the curve after the model that actually produced the scores.
plt.plot(fpr, tpr, label='MultinomialNB')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

In [None]:
def _svm_pipeline_1gram(global_idf):
    """Train a linear SVM on unigram TF-IDF features and report test metrics.

    Reads the notebook-level `new_text_train`, `new_text_test`, `label_train`
    and `label_test`. Prints the shape of the training matrix, a
    classification report, the confusion matrix and the ROC AUC.

    Parameters
    ----------
    global_idf : bool
        If truthy, the vocabulary / IDF weights are fitted on the training
        and test documents together; otherwise on the training documents only.

    Returns
    -------
    float
        ROC AUC of the second probability column (``classes_[1]``) on the
        test documents.
    """
    Tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=5,
                                       stop_words='english', max_df=0.8)
    #--------------------Vectorize----------------------
    # Use the idf of whole text or only the idf of training dataset.
    if global_idf:
        # Fit on the same lemmatized documents that are transformed below.
        # `new_text_total` holds the un-lemmatized documents, so fitting on it
        # would learn a vocabulary that does not match the transformed text.
        fit_corpus = new_text_train + new_text_test
    else:
        fit_corpus = new_text_train
    Tfidf_vectorizer = Tfidf_vectorizer.fit(fit_corpus)
    X_train = Tfidf_vectorizer.transform(new_text_train)
    y_train = label_train
    X_test = Tfidf_vectorizer.transform(new_text_test)
    y_test = label_test
    print(np.shape(X_train))
    #--------------------Training----------------------
    # `probability` must be a real bool: recent scikit-learn releases validate
    # constructor parameters and reject the integer 1.
    SVC_clf = svm.SVC(class_weight='balanced', kernel='linear', probability=True)
    SVC_clf.fit(X_train, y_train)
    #--------------------Evaluation----------------------
    # Hard predictions are taken from the calibrated probabilities. argmax
    # yields a column index, so map it through classes_ to get actual labels
    # (identical to the index when the labels are 0/1).
    y_pred_proba = SVC_clf.predict_proba(X_test)
    y_pred = SVC_clf.classes_[np.argmax(y_pred_proba, axis=-1)]

    print(metrics.classification_report(y_test, y_pred, target_names=['No_pressure_ulcer','pressure_ulcer']))
    print(metrics.confusion_matrix(y_test, y_pred))

    # Column 1 is the probability of classes_[1]; compute the AUC once and
    # reuse it for both the printout and the return value.
    y_pred_prob2 = y_pred_proba[:, 1]
    roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_prob2)
    print(roc_auc)
    return roc_auc