# Modeling

In [19]:
import os
import numpy as np
import cPickle as pkl

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

## Load data

In [34]:
os.chdir('data/')

data = []
for filename in os.listdir(os.getcwd()):
    if filename[-4:] == '.pkl':
        print 'Reading', filename, 
        with open(filename) as f:
            currentData = pkl.load(f)
        print "   ", len(currentData), "articles"
        data += currentData
        
os.chdir('../')

Reading Machine_learning_algorithms.pkl     94 articles
Reading Organs_(anatomy).pkl     420 articles
Reading Congenital_disorders.pkl     636 articles
Reading Medical_devices.pkl     182 articles
Reading Infectious_diseases.pkl     1067 articles
Reading Rare_diseases.pkl     907 articles
Reading Cancer.pkl     816 articles


In [5]:
len(data)

4122

In [7]:
article_text = []
other_numbers = []
target = []

for item in data:
    article_text.append(item[0])
    other_numbers.append([item[1], item[2], item[3], item[4]])
    target.append(item[5])

In [8]:
other_numbers = np.array(other_numbers)

### Split data into train and test and iterate over different values of tf-idf dimensions

In [9]:
wordnet = WordNetLemmatizer()

def my_tokenize(doc):
    tok = word_tokenize(doc)
    return[wordnet.lemmatize(x) for x in tok]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(article_text, target)

max_features_list = [100, 300, 500, 700, 900, 1100] 

for maxFeatures in max_features_list:
    
    count_vect = CountVectorizer(stop_words = 'english', max_features = maxFeatures, tokenizer = my_tokenize)
    X_train_counts = count_vect.fit_transform(X_train)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB()
    print "%4d %0.5f" %(maxFeatures, sum(cross_val_score(clf, X_train_tfidf, y_train, cv = 5, scoring='f1_weighted'))/5.0)

 100 0.68186
 300 0.74927
 500 0.76077
 700 0.76550
 900 0.76751
1100 0.77505


## Select dimensionality of tf-idf and prepare train and test data

In [11]:
count_vect = CountVectorizer(stop_words = 'english', max_features = 900, tokenizer = my_tokenize)
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

#### helper function for model-testing

In [25]:
def evaluate_model(model, X, y, Xt, yt):
    model.fit(X, y)
    yp = model.predict(Xt)
    
    print "accuracy =", sum(yp == yt)/float(len(yp))
    print "f1-score =", f1_score(yt, yp, average='weighted')

## Naive Bayes

In [26]:
clf = MultinomialNB()
evaluate_model(clf, X_train_tfidf, y_train, X_test_tfidf, y_test )

accuracy = 0.767216294859
f1-score = 0.765239546272


## Logistic Regression

In [27]:
clist = [0.01, 0.05, 0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]
#clist = [1.6, 1.7, 1.8, 1.9]
print "c accuracy f1-score:"
for c in clist:
    logistic_model = LogisticRegression(C=c, class_weight="balanced")
    print "%3.1f  %0.4f  %0.4f" % (c, 
                sum(cross_val_score(logistic_model, X_train_tfidf, y_train, cv = 5, scoring='accuracy'))/5.0,
                sum(cross_val_score(logistic_model, X_train_tfidf, y_train, cv = 5, scoring='f1_weighted'))/5.0)

c accuracy f1-score:
0.0  0.7030  0.6735
0.1  0.7939  0.7881
0.1  0.7975  0.7935
0.5  0.8062  0.8038
1.0  0.8114  0.8094
1.5  0.8098  0.8080
2.0  0.8098  0.8081
2.5  0.8114  0.8100
3.0  0.8095  0.8081
3.5  0.8075  0.8062


In [41]:
logistic_model = LogisticRegression(C=1, class_weight="balanced")
evaluate_model(logistic_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

accuracy = 0.795344325897
f1-score = 0.793007087785


## SVM  (SGDClassifier - linear kernel)

In [30]:
k = 5
alpha_list = [0.00001, 0.0001, 0.001, 0.01]
for alpha in alpha_list:
    svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=alpha, n_iter=1000)
    print "%0.5f   %0.5f" % (alpha, 
                    sum(cross_val_score(svm_model, X_train_tfidf, y_train, cv = k, scoring='accuracy'))/float(k))

0.00001   0.73277
0.00010   0.78389
0.00100   0.80784
0.01000   0.77741


In [42]:
svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.001, n_iter=1000, class_weight='balanced')
evaluate_model(svm_model, X_train_tfidf, y_train, X_test_tfidf, y_test)

accuracy = 0.797284190107
f1-score = 0.793807859701


## Random Forest

In [39]:
rand_forest = RandomForestClassifier(n_estimators=700, oob_score=True)
rand_forest.fit(X_train_tfidf.todense(), y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [40]:
evaluate_model(rand_forest, X_train_tfidf.todense(), y_train, X_test_tfidf.todense(), y_test)

accuracy = 0.786614936954
f1-score = 0.784123334384


## Pipeline

In [52]:
model = Pipeline([('vect', CountVectorizer(stop_words = 'english', max_features = 900, tokenizer = my_tokenize)),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(C=1, fit_intercept=True, intercept_scaling=1, class_weight= "auto",\
                                              solver='liblinear', max_iter=1000)) ])

In [45]:
os.getcwd()

'/home/koroviev/enliticWikipedia/wikiEnlitic'

## Save Model

In [48]:
joblib.dump(logistic_model, 'pickledModel/logistic_model.pkl')

['pickledModel/logistic_model.pkl',
 'pickledModel/logistic_model.pkl_01.npy',
 'pickledModel/logistic_model.pkl_02.npy',
 'pickledModel/logistic_model.pkl_03.npy',
 'pickledModel/logistic_model.pkl_04.npy']

In [47]:
joblib.dump(count_vect, 'pickledModel/count_vect.pkl')

['pickledModel/count_vect.pkl']

In [46]:
joblib.dump(tfidf_transformer, "pickledModel/tfidf_tranformer.pkl")

['pickledModel/tfidf_tranformer.pkl',
 'pickledModel/tfidf_tranformer.pkl_01.npy',
 'pickledModel/tfidf_tranformer.pkl_02.npy']