In [22]:
import pandas as pd
import numpy as np
import glob
import os
from nltk.stem import SnowballStemmer

from nltk.corpus import stopwords 
import re
import matplotlib.pyplot as plt
import itertools

In [23]:
data_path = './Data_and_Noun_Phrases/'

In [24]:
def clean_data(get_text):

    #stops = set(stopwords.words("english"))
    #text = [w for w in get_text if not w in stops]
    #text = " ".join(get_text)
    text = get_text

    text = re.sub(r'[^a-z0-9]', ' ', text)
    
    text = re.sub(r"(\d{2})",'digit', text)
    
    #convert single digit to keyword sin_key
    
    text = re.sub(r"(\d)",'sin_key',text)

    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    #Combine multiple spaces as one
    text = re.sub(r'[ +]',' ',text)
    return text

In [25]:
def load_data(path):
    train=[]
    dirs=os.listdir(path)
    for files in glob.glob(path+"/data/*.txt"):
        file_input = open(files).read().lower().split(".")
        
        for each_line in file_input:
            processed_text = clean_data(each_line)
            if processed_text!=' ' or processed_text!='':
                train.append(processed_text)
    return train        


In [26]:
data_cardiology = load_data(data_path+"/cardiology")
data_neurology  = load_data(data_path+"/neurology")

In [27]:

relations = ['is','a','such','as','kind','of','has','part','member','instance','including','or','other',
            'and','especially','in']
stemmer = SnowballStemmer('english')
stemmed_words = [stemmer.stem(word) for word in relations]

In [8]:
#vectorize the inputs

def vectorize(input_data):
    temp_data = []
    train_data = []
    found = False
    counter = 0
    for each_line in input_data:
     
        for dict_word in stemmed_words:
            
            for word in each_line.split(' '):
                
                if str(dict_word)==word:
                    counter = counter+1
                    #temp_data.append((stemmed_words.index(dict_word))+1)
                    #found=True
            #if not found:
                #Not found flag
            temp_data.append(counter)
            counter=0
            #found=False
        train_data.append(temp_data)
        temp_data =[]
    return train_data
                

In [9]:
data = data_cardiology+data_neurology
train_data = vectorize(data)

train_data = np.asarray(train_data)

In [10]:
#find the sum of the sublists

summed_values =[]
for each_value in train_data:
    summed_values.append(np.sum(each_value))

In [11]:
y_data = []
for values in summed_values:
    if values>=5:
        y_data.append(1)
    else:
        y_data.append(0)


train_frame = pd.DataFrame(train_data,columns=relations)

train_frame['target']=y_data


In [12]:
my_targets = train_frame.target.copy()

In [13]:
#Drop the coloumn
train_frame.drop(['target'],inplace=True,axis=1)

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix
from sklearn.svm import LinearSVC

In [15]:
#Split the data set 70,30

X_train,X_test,y_train,y_test = train_test_split(train_frame,my_targets,test_size=0.33,random_state=42)

In [16]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,classifier_name="decision",
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    file_name ="" 
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
        file_name = "Normalized confusion matrix"
    else:
        print('Confusion matrix, without normalization')
        file_name = "Confusion matrix without normalization"

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(file_name+"_"+classifier_name+".eps")


In [17]:
class_names = ['Unrelated',"Related"]

In [28]:
#Checking the decision treee classifer

dt = DecisionTreeClassifier(max_depth=15,random_state=42)
dt.fit(X_train,y_train)
#Predict on the test data

y_dt = dt.predict(X_test)
#Classification accuracy,matrix and reports
print "Decision Tree"
print "Accuracy "+str(accuracy_score(y_test,y_dt))
print "Confusion matrix "
print str(confusion_matrix(y_test,y_dt))
cnf_dt = confusion_matrix(y_test,y_dt)
print "Classification report "
print str(classification_report(y_test,y_dt))
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_dt, classes=class_names,classifier_name="decision_tree",
                      title='Confusion matrix, without normalization',)

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_dt, classes=class_names, normalize=True,classifier_name="decision_tree",
                      title='Normalized confusion matrix')

plt.show()


Decision Tree
Accuracy 0.953939393939
Confusion matrix 
[[2525   36]
 [ 116  623]]
Classification report 
             precision    recall  f1-score   support

          0       0.96      0.99      0.97      2561
          1       0.95      0.84      0.89       739

avg / total       0.95      0.95      0.95      3300

Confusion matrix, without normalization
[[2525   36]
 [ 116  623]]
Normalized confusion matrix
[[ 0.99  0.01]
 [ 0.16  0.84]]


In [19]:
rf = RandomForestClassifier(n_estimators=300,max_depth=15,random_state=42)
rf.fit(X_train,y_train)
y_rf = rf.predict(X_test)
print "Random Forest"
print "Accuracy "+str(accuracy_score(y_test,y_rf))
print "Confusion matrix "
print str(confusion_matrix(y_test,y_rf))
print "Classification report "
print str(classification_report(y_test,y_rf))
cnf_rf = confusion_matrix(y_test,y_rf)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_rf, classes=class_names,classifier_name="Random_forest",
                      title='Confusion matrix, without normalization',)

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_rf, classes=class_names, normalize=True,classifier_name="Random_forest",
                      title='Normalized confusion matrix')

plt.show()


Random Forest
Accuracy 0.963333333333
Confusion matrix 
[[2541   20]
 [ 101  638]]
Classification report 
             precision    recall  f1-score   support

          0       0.96      0.99      0.98      2561
          1       0.97      0.86      0.91       739

avg / total       0.96      0.96      0.96      3300

Confusion matrix, without normalization
[[2541   20]
 [ 101  638]]
Normalized confusion matrix
[[ 0.99  0.01]
 [ 0.14  0.86]]


In [20]:
ad = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=8,random_state=42),learning_rate=0.9,n_estimators=500,
                       random_state=1332)
ad.fit(X_train,y_train)
y_ad = ad.predict(X_test)
print "Adaboost Classifier"
print "Accuracy "+str(accuracy_score(y_test,y_ad))
print "Confusion matrix "
print str(confusion_matrix(y_test,y_ad))
print "Classification report "
print str(classification_report(y_test,y_ad))
np.set_printoptions(precision=2)
cnf_ad = confusion_matrix(y_test,y_ad)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_ad, classes=class_names,classifier_name="Adaboost",
                      title='Confusion matrix, without normalization',)

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_ad, classes=class_names, normalize=True,classifier_name="Adaboost",
                      title='Normalized confusion matrix')

plt.show()


Adaboost Classifier
Accuracy 0.957575757576
Confusion matrix 
[[2529   32]
 [ 108  631]]
Classification report 
             precision    recall  f1-score   support

          0       0.96      0.99      0.97      2561
          1       0.95      0.85      0.90       739

avg / total       0.96      0.96      0.96      3300

Confusion matrix, without normalization
[[2529   32]
 [ 108  631]]
Normalized confusion matrix
[[ 0.99  0.01]
 [ 0.15  0.85]]


In [21]:
sv = LinearSVC(random_state=22,max_iter=100)
sv.fit(X_train,y_train)
y_sv = sv.predict(X_test)
print "Linear SVM Classifier"
print "Accuracy "+str(accuracy_score(y_test,y_sv))
print "Confusion matrix "
print str(confusion_matrix(y_test,y_sv))
print "Classification report "
print str(classification_report(y_test,y_sv))
np.set_printoptions(precision=2)
cnf_sv = confusion_matrix(y_test,y_sv)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_sv, classes=class_names,classifier_name="SVM",
                      title='Confusion matrix, without normalization',)

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_sv, classes=class_names, normalize=True,classifier_name="SVM",
                      title='Normalized confusion matrix')

plt.show()


Linear SVM Classifier
Accuracy 1.0
Confusion matrix 
[[2561    0]
 [   0  739]]
Classification report 
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2561
          1       1.00      1.00      1.00       739

avg / total       1.00      1.00      1.00      3300

Confusion matrix, without normalization
[[2561    0]
 [   0  739]]
Normalized confusion matrix
[[ 1.  0.]
 [ 0.  1.]]
