In [60]:
from __future__ import division
import os,time,pdb
import numpy as np

from sklearn.decomposition import NMF, LatentDirichletAllocation
    
#scikit-learn
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier,LogisticRegression,Perceptron
from sklearn.svm import LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn import metrics

from insight_hds.preprocessing import datasets
reload(datasets) 
from insight_hds.tools import words
reload(words)

<module 'insight_hds.tools.words' from '/home/bujan/storage/development/insight_hds/tools/words.pyc'>

## Get TF-IDFs from Reddit

In [37]:
subreddits = ['schizophrenia', 'depression', 'bipolar']

In [43]:
X, y = datasets.scrape_comments(subreddits)


998 comments from subreddit: schizophrenia fetched!

953 comments from subreddit: depression fetched!

999 comments from subreddit: bipolar fetched!


### Example text

In [49]:
X[0]

u"I had a major surgery last month and when I was told I'd be forced to go off my clozapine I was pretty concerned.  But I had no symptoms what so ever thankfully.\n\nAnyway, I'm sure they didn't put worms in your wound.  "

# Feature engineering (TF, TF-IDF) 

In [68]:
X_tfidf, names_tfidf = datasets.get_tfidf(X)

In [None]:
X_tf, names_tf = datasets.get_tf(X)

# Unsupervised learning with NMF and LDA

In [73]:
nmf = NMF(n_components=len(subreddits)).fit(X_tfidf)

In [74]:
words.print_top_words(nmf, names_tfidf, 10)

Topic #0:
you to your are if re can that be do
Topic #1:
and to the it my of me that was in
Topic #2:
thanks for sharing lot okay will ll tip mate thank


In [78]:
lda = LatentDirichletAllocation(n_topics=len(subreddits)).fit(X_tf)

In [79]:
words.print_top_words(lda, names_tf, 10)

Topic #0:
seroquel www com kratom http https gained carbs doses watch
Topic #1:
to and the you it that of my is in
Topic #2:
my its for been thanks depression thank helped or pizza


# Supervised learning with Softmax

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.20, random_state=42)

In [106]:
svc = LinearSVC(C=1e-2)
svc.fit(X_train, y_train)

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [107]:
print(metrics.classification_report(y_test, svc.predict(X_test),target_names=subreddits))

               precision    recall  f1-score   support

schizophrenia       0.72      0.41      0.52       209
   depression       0.59      0.54      0.57       193
      bipolar       0.45      0.70      0.55       188

  avg / total       0.59      0.55      0.55       590



In [108]:
print(metrics.classification_report(y_train, svc.predict(X_train),target_names=subreddits))

               precision    recall  f1-score   support

schizophrenia       0.83      0.63      0.72       789
   depression       0.73      0.66      0.69       760
      bipolar       0.62      0.82      0.71       811

  avg / total       0.72      0.71      0.71      2360



In [109]:
best_words = np.array(names_tfidf)[np.argsort(clf.coef_,axis=1)[:,:10]]

In [110]:
best_words

array([[u'depression', u'bipolar', u'all', u'up', u'so', u'yourself',
        u'die', u're', u'felt', u'hate'],
       [u'meds', u'bipolar', u'schizophrenia', u'symptoms', u'was', u'had',
        u'illness', u'mental', u'psychiatrist', u'voices'],
       [u'schizophrenia', u'someone', u'voices', u'just', u'believe',
        u'things', u'is', u'birthday', u'feel', u'there']], 
      dtype='<U46')

In [128]:
sgdcl = SGDClassifier(alpha=1e-2,loss='log')
sgdcl.fit(X_train, y_train)   

SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [129]:
print(metrics.classification_report(y_test, sgdcl.predict(X_test),target_names=subreddits))

               precision    recall  f1-score   support

schizophrenia       0.67      0.49      0.57       209
   depression       0.53      0.63      0.57       193
      bipolar       0.52      0.57      0.54       188

  avg / total       0.57      0.56      0.56       590



In [120]:
print(metrics.classification_report(y_train, sgdcl.predict(X_train),target_names=subreddits))

               precision    recall  f1-score   support

schizophrenia       0.96      0.84      0.89       789
   depression       0.83      0.94      0.88       760
      bipolar       0.90      0.90      0.90       811

  avg / total       0.90      0.89      0.89      2360



In [125]:
sgdcl.predict_proba(X_test)

array([[ 0.49359244,  0.2551134 ,  0.25129416],
       [ 0.45119834,  0.22913429,  0.31966736],
       [ 0.33141141,  0.26629779,  0.4022908 ],
       ..., 
       [ 0.25704126,  0.49549784,  0.24746091],
       [ 0.22796095,  0.23451956,  0.5375195 ],
       [ 0.24829915,  0.510698  ,  0.24100284]])

# Keras (LSTMs, WordEmbedding)

In [130]:
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb

Using Theano backend.


In [163]:
max_features = 20000
maxlen = 500
batch_size = 32

In [158]:
X_tfidf, names_tfidf = datasets.get_tfidf(X,{'max_features':max_features})

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.20, random_state=42)

In [160]:
X_pd_train = sequence.pad_sequences(X_train.toarray(), maxlen=maxlen)

In [161]:
X_pd_test = sequence.pad_sequences(X_test.toarray(), maxlen=maxlen)

In [None]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train.toarray(), y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test.toarray(), y_test))
score, acc = model.evaluate(X_test.toarray(), y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Build model...
Train...
Train on 2360 samples, validate on 590 samples
Epoch 1/15
