# Modeling

In [1]:
import os
try:
    import cPickle as pkl
except ImportError:
    import pickle as pkl

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline

In [58]:
from sklearn.externals import joblib

In [2]:
# Enter the data directory. The absolute path is resolved only once per kernel
# session and cached in DATA_DIR, so re-running this cell is a no-op instead of
# trying to descend into data/data/ (which would raise OSError or silently
# change where later cells read and write files).
if 'DATA_DIR' not in globals():
    DATA_DIR = os.path.abspath('data/')
os.chdir(DATA_DIR)

In [29]:
def load_category_samples(directory, max_per_file=2000, suffix='_data.pkl'):
    """Load pickled article records from every ``*<suffix>`` file in ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).
    max_per_file : int
        At most this many leading records are taken from each file.
    suffix : str
        Only filenames ending with this suffix are read. Model artefacts
        dumped later into the same folder ('model.pkl', 'count_vect.pkl', ...)
        also end in '.pkl' and must not be mistaken for data.

    Returns
    -------
    list
        Records of all matching files, concatenated in sorted filename order.
    """
    samples = []
    # sorted(): os.listdir order is arbitrary; sorting makes the result deterministic.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(suffix):
            continue
        print('reading %s' % filename)
        # Pickles are binary; text mode corrupts them on some platforms.
        with open(os.path.join(directory, filename), 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code -- only load
            # files that were produced by this project.
            current = pkl.load(f)
        # Slicing already copes with files shorter than max_per_file.
        samples.extend(current[:max_per_file])
    return samples


data = load_category_samples(os.getcwd())

reading Cancer_data.pkl
reading Congenital_disorders_data.pkl
reading Infectious_diseases_data.pkl
reading Machine_learning_algorithms_data.pkl
reading Medical_devices_data.pkl
reading Organs_(anatomy)_data.pkl
reading Rare_diseases_data.pkl


In [30]:
# Split each record by position: index 0 is kept as the article text,
# indices 1-4 as four additional values, and index 5 as the target label.
article_text = [item[0] for item in data]
other_numbers = [[item[1], item[2], item[3], item[4]] for item in data]
target = [item[5] for item in data]

In [31]:
# numpy is not imported by the notebook's import cell, so bring it into scope
# here; without this the cell raises NameError on a fresh kernel.
import numpy as np

# Convert the list of 4-element rows into a 2-D array.
other_numbers = np.array(other_numbers)

In [32]:
# Single lemmatizer instance shared by every call to my_tokenize.
wordnet = WordNetLemmatizer()


def my_tokenize(doc):
    """Tokenize ``doc`` with NLTK's word_tokenize and return the list of
    WordNet lemmas, one per token, in the original token order."""
    lemmatize = wordnet.lemmatize
    return [lemmatize(token) for token in word_tokenize(doc)]

In [36]:
# Sanity check: expect (number of records, 4) -- four values were kept per record.
other_numbers.shape

(12320, 4)

In [37]:
# Fixed seed so the train/test split -- and therefore every score computed
# from it -- is reproducible across kernel restarts.
RANDOM_STATE = 42
# Single source of truth for the fold count (previously repeated as a divisor).
CV_FOLDS = 3

X_train, X_test, y_train, y_test = train_test_split(
    article_text, target, random_state=RANDOM_STATE)

# Compare vocabulary sizes using a Naive Bayes baseline.
# The loop intentionally leaves count_vect, tfidf_transformer, X_train_counts
# and X_train_tfidf bound to the LAST max_features value; subsequent cells
# reuse those objects.
# NOTE(review): the vocabulary and IDF weights are fit on all of X_train
# before cross-validation, so each validation fold has influenced the
# features; scores may be slightly optimistic. Wrapping the steps in a
# Pipeline inside cross_val_score would avoid this.
max_features_list = [3000, 7000]
for maxFeatures in max_features_list:
    count_vect = CountVectorizer(stop_words='english',
                                 max_features=maxFeatures,
                                 tokenizer=my_tokenize)
    X_train_counts = count_vect.fit_transform(X_train)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB()
    fold_scores = cross_val_score(clf, X_train_tfidf, y_train,
                                  cv=CV_FOLDS, scoring='f1_weighted')
    # Single-argument print works identically on Python 2 and 3.
    print('%s %s' % (maxFeatures, fold_scores.mean()))

3000 0.747981050435
7000 0.759275571094


#### With full data set:
maxFeatures f1-score  
    500  0.616226571014  
    700  0.622994594677  
    1500 0.633639154017  
    2000 0.639304330555  
    3000 0.6475938908  
    5000 0.653765994152  

## Naive Bayes

In [66]:
# Multinomial Naive Bayes baseline: mean accuracy over 3 cross-validation folds.
clf = MultinomialNB()
nb_scores = cross_val_score(clf, X_train_tfidf, y_train, cv=3, scoring='accuracy')
print(sum(nb_scores) / 3.0)

0.761038316636


## Logistic Regression

In [40]:
# Regularisation strengths to compare (mean 5-fold accuracy per value).
# A wider candidate list, kept for reference:
#   [0.01, 0.05, 0.1, 0.5, 1, 1.5, 2, 2.5]
clist = [2.7, 3]
for c in clist:
    model = LogisticRegression(C=c, class_weight="auto")
    lr_scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
    print('%s %s' % (c, sum(lr_scores) / 5.0))

2.7 0.812123448242
3 0.813097942851


## Linear SVM (SGDClassifier with hinge loss)

In [42]:
# Hinge-loss, L2-penalised linear classifier trained by SGD;
# report mean accuracy over 3 cross-validation folds.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    n_iter=1000,
)
svm_scores = cross_val_score(clf, X_train_tfidf, y_train, cv=3, scoring='accuracy')
print(sum(svm_scores) / 3.0)

0.802165318754


In [47]:
# Fit `clf` itself on the whole TF-IDF training matrix so it can be used for
# prediction below (cross_val_score only fits clones of the estimator).
clf.fit(X_train_tfidf, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=1000, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

## Pipeline

In [52]:
# End-to-end text classifier: raw text -> token counts -> TF-IDF -> logistic regression.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english',
                             max_features=7000,
                             tokenizer=my_tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(C=3,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight="auto",
                               solver='liblinear',
                               max_iter=1000)),
]
model = Pipeline(pipeline_steps)

In [105]:
# (training documents, vocabulary size) of the TF-IDF matrix left over from
# the last iteration of the max_features loop.
X_train_tfidf.shape

(9240, 7000)

In [53]:
# Fit the pipeline on the raw training texts. Pipeline.fit returns the
# pipeline itself, so `text_clf` and `model` name the same fitted object here.
text_clf = model.fit(X_train, y_train)

In [99]:
# Final logistic-regression classifier trained on the precomputed TF-IDF matrix.
# C is written out explicitly (3, the value also used in the Pipeline cell)
# rather than read from a leftover loop variable, so this cell does not depend
# on which cell happened to run last.
# Note: this rebinds `model`; the fitted Pipeline stays reachable as `text_clf`.
model = LogisticRegression(C=3, class_weight="auto")
model.fit(X_train_tfidf, y_train)

LogisticRegression(C=3, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [100]:
# Persist the classifier currently bound to `model` into the working directory.
# It expects TF-IDF features as input, so the vectorizer and transformer must
# be saved and reloaded alongside it.
joblib.dump(model, 'model.pkl')

['model.pkl', 'model.pkl_01.npy', 'model.pkl_02.npy', 'model.pkl_03.npy']

In [59]:
# Persist the fitted TF-IDF transformer.
# NOTE(review): the filename is misspelled ('tranformer'); the load cell uses
# the same spelling, so rename both together or not at all.
joblib.dump(tfidf_transformer, "tfidf_tranformer.pkl")

['tfidf_tranformer.pkl',
 'tfidf_tranformer.pkl_01.npy',
 'tfidf_tranformer.pkl_02.npy']

In [61]:
# Persist the fitted CountVectorizer.
# NOTE(review): it holds a reference to my_tokenize; presumably that function
# must be defined/importable in any session that reloads this file -- confirm.
joblib.dump(count_vect, 'count_vect.pkl')

['count_vect.pkl']

In [106]:
# Display the configuration of the vectorizer that was persisted above.
count_vect

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=7000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tokenize at 0x13adccb90>, vocabulary=None)

In [75]:
# Reload the persisted TF-IDF transformer. Despite its name, `new_vect` is the
# transformer, not a vectorizer.
new_vect = joblib.load('tfidf_tranformer.pkl')

In [76]:
# Reload the persisted CountVectorizer (`blah` is the vectorizer).
blah = joblib.load('count_vect.pkl')

In [77]:
# Turn new documents into TF-IDF features using the reloaded objects:
# counts from `blah` (CountVectorizer), then weights from `new_vect` (TF-IDF).
# NOTE(review): `docs_new` is not defined in any visible cell -- presumably a
# list of raw text strings; it must be defined before this runs on a fresh kernel.
new_doc1 = blah.transform(docs_new)
nd_tfidf = new_vect.transform(new_doc1)

# Disabled: class probabilities from `model` for the new document(s).
#model.predict_proba(nd_tfidf)

In [78]:
# Predict the category of the new document(s) with `clf`.
# NOTE(review): reading top to bottom, `clf` is the SGD classifier fit earlier,
# not the logistic-regression `model` -- confirm that is intended.
clf.predict(nd_tfidf)

array(['Machine_learning_algorithms'], 
      dtype='|S27')

In [83]:
from sklearn.ensemble import RandomForestClassifier

In [101]:
# Random forest with 500 trees; oob_score=True requests an out-of-bag score
# after fitting.
# NOTE(review): no random_state is set, so results differ between runs.
randForest = RandomForestClassifier(n_estimators=500, oob_score=True )

In [102]:
# Fit on the sparse TF-IDF matrix directly. Densifying it first allocated a
# full documents-by-vocabulary float matrix (hundreds of MB for this data)
# and produced a numpy.matrix for no benefit: the forest accepts sparse input,
# and the prediction cells below already pass it a sparse matrix.
# If an installed scikit-learn version rejects sparse input here, use
# X_train_tfidf.toarray() (a plain ndarray) rather than .todense().
randForest.fit(X_train_tfidf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [91]:
# Per-feature importance scores from the fitted forest (one entry per TF-IDF column).
randForest.feature_importances_

array([  8.75740658e-05,   2.62480329e-06,   2.30935653e-05, ...,
         1.09001951e-05,   1.86988893e-04,   4.80316880e-05])

In [92]:
# Class probabilities from the forest for the new document(s); one column per
# class, in the order of randForest.classes_.
randForest.predict_proba(nd_tfidf)

array([[ 0.        ,  0.08      ,  0.02644444,  0.24466667,  0.22      ,
         0.42888889,  0.        ]])

In [94]:
# Hard class prediction from the forest for the new document(s).
randForest.predict(nd_tfidf)

array(['Organs_(anatomy)'], 
      dtype='|S27')

In [96]:
# Class labels in the order used by `clf`.
# NOTE(review): presumably shown to label the forest's probability columns
# above; randForest.classes_ would be the direct source for that -- confirm.
clf.classes_

array(['Cancer', 'Congenital_disorders', 'Infectious_diseases',
       'Machine_learning_algorithms', 'Medical_devices',
       'Organs_(anatomy)', 'Rare_diseases'], 
      dtype='|S27')

In [104]:
# Out-of-bag score of the fitted forest (available because oob_score=True was
# passed to the constructor).
randForest.oob_score_

0.79069264069264067