# Modeling

In [1]:
import os
import numpy as np
import cPickle as pkl

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

## Load data

In [2]:
os.chdir('data/')

data = []
for filename in os.listdir(os.getcwd()):
    if filename[-4:] == '.pkl':
        print 'Reading', filename, 
        with open(filename) as f:
            currentData = pkl.load(f)
        print "   ", len(currentData), "articles"
        data += currentData
        
os.chdir('../')

Reading Machine_learning_algorithms.pkl     94 articles
Reading Organs_(anatomy).pkl     420 articles
Reading Congenital_disorders.pkl     636 articles
Reading Medical_devices.pkl     182 articles
Reading Infectious_diseases.pkl     1067 articles
Reading Rare_diseases.pkl     907 articles
Reading Cancer.pkl     816 articles


In [3]:
len(data)

4122

In [4]:
article_text = []
other_numbers = []
target = []

for item in data:
    article_text.append(item[0])
    other_numbers.append([item[1], item[2], item[3], item[4]])
    target.append(item[5])

In [5]:
other_numbers = np.array(other_numbers)

### Split data into train and test and iterate over different values of tf-idf dimensions

In [6]:
wordnet = WordNetLemmatizer()

def my_tokenize(doc):
    tok = word_tokenize(doc)
    return[wordnet.lemmatize(x) for x in tok]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(article_text, target)

max_features_list = [100, 300, 500, 700, 900, 1100] 

for maxFeatures in max_features_list:
    
    count_vect = CountVectorizer(stop_words = 'english', max_features = maxFeatures, tokenizer = my_tokenize)
    X_train_counts = count_vect.fit_transform(X_train)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB()
    print "%4d %0.5f" %(maxFeatures, sum(cross_val_score(clf, X_train_tfidf, y_train, cv = 5, scoring='f1_weighted'))/5.0)

 100 0.67529
 300 0.74886
 500 0.75843
 700 0.75764
 900 0.77011
1100 0.76902


## Select dimensionality of tf-idf and prepare train and test data

In [8]:
count_vect = CountVectorizer(stop_words = 'english', max_features = 900, tokenizer = my_tokenize)
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

#### helper function for model-testing

In [9]:
def inspect_misclassified(yp, yt):
    rare_disease_and_congenital_confustion = ["Congenital_disorders", "Rare_diseases"]
    num_rare_congenital = 0
    other_misclassified = 0
    
    for i in xrange(len(yt)):
        if yt[i] != yp[i]:
            if yt[i] in rare_disease_and_congenital_confustion and yp[i] in rare_disease_and_congenital_confustion:
                num_rare_congenital += 1
            else:
                other_misclassified += 1
      
    print '' 
    print "Percentage of misclassified points which mislabeled Rare_disease and Congenital_disorders =",\
                    num_rare_congenital/float(num_rare_congenital+other_misclassified)*100,"%"

In [10]:
def evaluate_model(model, X, y, Xt, yt):
    model.fit(X, y)
    yp = model.predict(Xt)
    
    print "accuracy =", sum(yp == yt)/float(len(yp))
    print "f1-score =", f1_score(yt, yp, average='weighted')
    
    inspect_misclassified(yp, yt)

## Naive Bayes

In [11]:
clf = MultinomialNB()
evaluate_model(clf, X_train_tfidf, y_train, X_test_tfidf, y_test )

accuracy = 0.775945683802
f1-score = 0.772526300013

Percentage of misclassified points which mislabeled Rare_disease and Congenital_disorders = 33.3333333333 %


## Logistic Regression

In [12]:
clist = [0.01, 0.05, 0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]
#clist = [1.6, 1.7, 1.8, 1.9]
print "c accuracy f1-score:"
for c in clist:
    logistic_model = LogisticRegression(C=c, class_weight="balanced")
    print "%3.1f  %0.4f  %0.4f" % (c, 
                sum(cross_val_score(logistic_model, X_train_tfidf, y_train, cv = 5, scoring='accuracy'))/5.0,
                sum(cross_val_score(logistic_model, X_train_tfidf, y_train, cv = 5, scoring='f1_weighted'))/5.0)

c accuracy f1-score:
0.0  0.7043  0.6700
0.1  0.7920  0.7858
0.1  0.7942  0.7899
0.5  0.8040  0.8022
1.0  0.8062  0.8043
1.5  0.8043  0.8024
2.0  0.8056  0.8040
2.5  0.8043  0.8028
3.0  0.8056  0.8041
3.5  0.8069  0.8055


In [13]:
logistic_model = LogisticRegression(C=1, class_weight="balanced")
evaluate_model(logistic_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

accuracy = 0.815712900097
f1-score = 0.813081601883

Percentage of misclassified points which mislabeled Rare_disease and Congenital_disorders = 38.9473684211 %


## SVM  (SGDClassifier - linear kernel)

In [14]:
k = 5
alpha_list = [0.00001, 0.0001, 0.001, 0.01]
for alpha in alpha_list:
    svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=alpha, n_iter=1000)
    print "%0.5f   %0.5f" % (alpha, 
                    sum(cross_val_score(svm_model, X_train_tfidf, y_train, cv = k, scoring='accuracy'))/float(k))

0.00001   0.73408
0.00010   0.78550
0.00100   0.80654
0.01000   0.76903


In [15]:
svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.001, n_iter=1000, class_weight='balanced')
evaluate_model(svm_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

accuracy = 0.811833171678
f1-score = 0.809269228859

Percentage of misclassified points which mislabeled Rare_disease and Congenital_disorders = 36.0824742268 %


## Random Forest

In [16]:
forest = RandomForestClassifier(n_estimators=700, oob_score=True)
#rand_forest.fit(X_train_tfidf.todense(), y_train)

In [17]:
#700
evaluate_model(forest, X_train_tfidf, y_train, X_test_tfidf, y_test)

accuracy = 0.770126091174
f1-score = 0.763706044522

Percentage of misclassified points which mislabeled Rare_disease and Congenital_disorders = 46.4135021097 %


## Pipeline

## Save Model

In [18]:
#joblib.dump(logistic_model, 'pickledModel/logistic_model.pkl')
#joblib.dump(count_vect, 'pickledModel/count_vect.pkl')
#joblib.dump(tfidf_transformer, "pickledModel/tfidf_tranformer.pkl")