## Scope: 
Given that there are know people who are giving somewhat qualified medical advice and people who are asking question, these have been binned into binary categories.

http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

from collections import defaultdict
import time
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd

In [2]:
class MeanEmbeddingVectorizer(object):
    """
    Average word vectors for all words in a text using n dimensional word embedding vector.
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(word2vec.values())

    def fit(self, X, y):
        return self

    def transform(self, X):
        # If a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    # If text is empty, return vector of zeros
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.values())

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [3]:
path_to_data = '../data/reddit_comments_askDocs_2014_to_2018_03.gz'
df = pd.read_csv(path_to_data,dtype={'body':str,'score_hidden':float})
print('Shape',df.shape)
df.head(2)

df.dropna(subset=['body'],inplace=True)
df['body'] = df['body'].astype(str)

# Optional remove all strings where no/little response
# df = df.loc[df['body'].apply(lambda r: len(str(r))> 2)]

df['is_clinician'] = df['author_flair_text'].apply(lambda r: 0 if r =='This user has not yet been verified.' else 1)
df.loc[df['body'].apply(lambda r: len(str(r))<4)].shape

  interactivity=interactivity, compiler=compiler, result=result)


Shape (557648, 21)


(1935, 22)

In [4]:
df['tokenized_sents'] = df['body'].apply(lambda row: str(row).strip().replace('\n','').lower().split(' '))

In [5]:
# Load pre-computed Glove embeddings
import numpy as np

with open("../data/glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}

In [6]:
X = df['tokenized_sents'].tolist()
# Remove empty strings
X = [[t for t in sent if len(t)>1] for sent in X]

In [7]:
import gensim
# let X be a list of tokenized texts (i.e. list of lists of tokens)
model = gensim.models.Word2Vec(X, size=100)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

  after removing the cwd from sys.path.


In [8]:
etree_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200))])

etree_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200))])

Create pipeline:

In [9]:
# Create pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', CountVectorizer(lowercase=False,ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
                    ])

etree_w2v_nb = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", MultinomialNB())])

## Split for performance evaluation

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['body'],df['is_clinician'] , test_size=0.1, random_state=329)

In [14]:
%%time

etree_w2v.fit(X_test,y_test)

CPU times: user 1h 8min 39s, sys: 36.9 s, total: 1h 9min 16s
Wall time: 10min 8s


Pipeline(memory=None,
     steps=[('word2vec vectorizer', <__main__.MeanEmbeddingVectorizer object at 0x19b7b1e48>), ('extra trees', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity...mators=200, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [None]:
%%time


scores = cross_val_score(etree_w2v_nb, X_test, y_test, cv=10,n_jobs=6,scoring='f1_macro')
print("F1 macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))