In [2]:
import pandas as pd
import numpy as np
import json
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics

# Seed NumPy's global random number generator. The train/test split and the
# classifier further down are each given their own explicit random_state.
np.random.seed(500)


In [5]:
# Read the extracted NDC code table without dtype inference (dtype=object)
# and keep only its 'NDC' column as a plain Python list.
ndc_frame = pd.read_csv(
    '../../data/processed/ndc_codes_extracted.csv',
    dtype=object,
)
ndc_codes = ndc_frame['NDC'].to_list()

In [3]:
# Load the note-events ML dataset; dtype=object disables pandas' dtype
# inference so every column is read as-is.
dataset_path = '../../data/processed/NOTEEVENTS_ML_DATASET.csv'
df = pd.read_csv(dataset_path, dtype=object)

In [4]:
# Upper bound on how many leading rows of the dataset are used.
NUMBER_OF_RECORDS = 40000

# Slice each column to the first NUMBER_OF_RECORDS rows *before* converting,
# so only the rows that are actually used get JSON-decoded.
drug_orders = df['TEXT_NORMALIZED_JOINED'].iloc[:NUMBER_OF_RECORDS].to_list()
# Each LABELS cell is JSON text; it is decoded into a Python sequence that is
# later indexed by label position.
multi_class_labels = [
    json.loads(raw_labels)
    for raw_labels in df['LABELS'].iloc[:NUMBER_OF_RECORDS]
]

# Fail with a clear message instead of a bare IndexError on an empty dataset.
if not multi_class_labels:
    raise ValueError(
        "No records loaded: cannot determine the number of label classes")

# The number of classes is taken from the length of the first label row.
NUMBER_OF_CLASSES = len(multi_class_labels[0])

# Vectorize drug_orders as TF-IDF weighted character trigrams
# (analyzer='char', ngram_range=(3, 3)).
vectorizer = TfidfVectorizer(ngram_range=(3, 3), analyzer='char')
drug_order_vectors = vectorizer.fit_transform(drug_orders)


In [7]:
model_metrics = []

# Fail fast: each label index is used to look up an NDC code below, so a short
# ndc_codes list would otherwise raise IndexError only after models have
# already been trained.
if len(ndc_codes) < NUMBER_OF_CLASSES:
    raise ValueError(
        "ndc_codes has {} entries but {} label classes were found".format(
            len(ndc_codes), NUMBER_OF_CLASSES))

# The train/test partition depends only on the number of rows and the fixed
# random_state, so it is computed once on row indices instead of re-splitting
# the sparse TF-IDF matrix for every label. With the same test_size and
# random_state this selects the same rows as splitting inside the loop.
train_rows, test_rows = train_test_split(
    np.arange(drug_order_vectors.shape[0]), test_size=0.3, random_state=100)
X_train = drug_order_vectors[train_rows]
X_test = drug_order_vectors[test_rows]

# Train and evaluate one independent classifier per label position.
for i in range(NUMBER_OF_CLASSES):
    print("Training Model on Label", i)
    # Target for this classifier: entry i of every record's label sequence.
    labels = [record_labels[i] for record_labels in multi_class_labels]
    label_array = np.asarray(labels)
    y_train = label_array[train_rows]
    y_test = label_array[test_rows]

    # Linear-kernel SVM with a fixed random_state.
    model = LinearSVC(random_state=100)
    model.fit(X_train, y_train)
    # Predict the response for the held-out rows.
    y_pred = model.predict(X_test)

    # Accuracy: fraction of test rows classified correctly.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    # Precision: of the rows predicted positive (label 1), the fraction that
    # really are positive.
    precision = metrics.precision_score(y_test, y_pred, pos_label=1)
    # Recall: of the rows that really are positive, the fraction predicted
    # positive.
    recall = metrics.recall_score(y_test, y_pred, pos_label=1)

    # NOTE(review): 'Trigate' looks like a typo for 'Trigram'; left unchanged
    # because this string is part of the recorded results.
    model_metrics.append({
        'NDC': ndc_codes[i],
        'Model': 'Linear SVM (Trigate)',
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
    })


Training Model on Label 0
Training Model on Label 1
Training Model on Label 2
Training Model on Label 3
Training Model on Label 4
Training Model on Label 5
Training Model on Label 6
Training Model on Label 7
Training Model on Label 8
Training Model on Label 9
Training Model on Label 10
Training Model on Label 11
Training Model on Label 12
Training Model on Label 13
Training Model on Label 14
Training Model on Label 15
Training Model on Label 16
Training Model on Label 17
Training Model on Label 18
Training Model on Label 19
Training Model on Label 20
Training Model on Label 21


In [8]:
# Turn the list of per-label metric records into a table.
df_svm_metrics = pd.DataFrame(model_metrics)

# Persist the table: no index column, every field wrapped in double quotes.
df_svm_metrics.to_csv(
    '../../data/results/SVM_results.csv',
    quoting=csv.QUOTE_ALL,
    quotechar='"',
    index=False,
)

# Bare last expression so the notebook renders the table.
df_svm_metrics


Unnamed: 0,NDC,Model,Accuracy,Precision,Recall
0,713016550,Linear SVM (Trigate),0.904,0.835355,0.76352
1,487950125,Linear SVM (Trigate),0.922417,0.892415,0.763668
2,517391025,Linear SVM (Trigate),0.920167,0.92592,0.910889
3,51079001920,Linear SVM (Trigate),0.90175,0.90885,0.917075
4,11098003002,Linear SVM (Trigate),0.955583,0.888128,0.44559
5,54829725,Linear SVM (Trigate),0.937333,0.887879,0.720965
6,45025501,Linear SVM (Trigate),0.989333,0.8,0.059701
7,338055002,Linear SVM (Trigate),0.862917,0.885269,0.918978
8,409131230,Linear SVM (Trigate),0.946333,0.909598,0.591437
9,45152510,Linear SVM (Trigate),0.937917,0.878906,0.655977
