In [1]:
#Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited

## Text classification using topic models as X variables

In [31]:
import pandas as pd
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
#nltk.download()

import re
from nltk.stem.wordnet import WordNetLemmatizer 

In [32]:
df = pd.read_csv('drug.csv')

In [33]:
df.head()

Unnamed: 0,urlDrugName,rating,Review,score
0,enalapril,4,enalapril management of congestive heart failu...,Low
1,ortho-tri-cyclen,1,ortho-tri-cyclen birth prevention - Although t...,Low
2,ponstel,10,ponstel menstrual cramps - I was used to havin...,high
3,prilosec,3,prilosec acid reflux - The acid reflux went aw...,Low
4,lyrica,2,lyrica fibromyalgia - I think that the Lyrica ...,Low


In [34]:
df.shape

(4143, 4)

In [35]:
df['score'] = df.score.apply(lambda x: 1 if x == "high" else 0)

In [36]:
df['score'].value_counts()

1    3241
0     902
Name: score, dtype: int64

In [37]:
## Text pre-processing

#remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [38]:
#Adding custom stop words
new_words = ["some","one","like","time","br","drug","effect","could","good",'even', 'get', 'would',
             'make', 'really', 'see', 'well', 'much', 'great', 'first', 'people', 'also', 'bad', 
             'show', 'way', 'thing', 'made', 'go', 'think', 'know', 'watch','look','many','day']
stop_words = stop_words.union(new_words)


## Defining functions

In [39]:
def strip_newline(series):
    return [review.replace('\n','') for review in series]


def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    out = [[word for word in simple_preprocess(str(doc))
            if word not in stop_words]
            for doc in texts]
    return out

In [40]:
def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

In [41]:
def get_corpus(df):
    df['Review'] = strip_newline(df.Review)
    words = list(sent_to_words(df.Review))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    
    return corpus, id2word, bigram

## Apply function to corpus to pre-process and extract bi-grams

In [42]:
train_corpus, train_id2word, bigram_train = get_corpus(df)

## Build the topic model

In [89]:
lda_train = gensim.models.ldamulticore.LdaMulticore(
                        corpus=train_corpus,
                        num_topics=10,
                        id2word=train_id2word,
                        chunksize=100,
                        workers=7, # Num. Processing Cores - 1
                        passes=50,
                        eval_every = 1,
                        per_word_topics=True)

In [90]:
lda_train.print_topics(20,num_words=15)[:10]

[(0,
  '0.024*"treatment" + 0.024*"acne" + 0.014*"thyroid" + 0.011*"years" + 0.010*"medication" + 0.009*"taken" + 0.009*"synthroid" + 0.009*"taking" + 0.008*"daily" + 0.008*"none" + 0.007*"take" + 0.007*"levels" + 0.007*"months" + 0.006*"antibiotic" + 0.006*"since"'),
 (1,
  '0.015*"take" + 0.012*"days" + 0.012*"started" + 0.012*"took" + 0.012*"doctor" + 0.011*"still" + 0.010*"went" + 0.009*"taking" + 0.009*"week" + 0.009*"eat" + 0.009*"never" + 0.008*"felt" + 0.008*"stomach" + 0.007*"got" + 0.006*"feel"'),
 (2,
  '0.060*"skin" + 0.025*"acne" + 0.019*"use" + 0.018*"face" + 0.015*"used" + 0.014*"cream" + 0.014*"using" + 0.011*"dry" + 0.010*"applied" + 0.009*"redness" + 0.009*"product" + 0.009*"retin" + 0.009*"apply" + 0.008*"treatment" + 0.007*"eye"'),
 (3,
  '0.021*"hair" + 0.021*"cholesterol" + 0.014*"blood" + 0.013*"hair_loss" + 0.012*"loss" + 0.011*"high" + 0.010*"taking" + 0.010*"diet" + 0.010*"daily" + 0.010*"lipitor" + 0.009*"level" + 0.009*"months" + 0.009*"mg" + 0.009*"levels" 

## Extract training vectors

In [92]:
train_vecs = []
for i in range(len(df)):
    top_topics = (
        lda_train.get_document_topics(train_corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(10)]
    topic_vec.extend([len(df.iloc[i].Review)])
    train_vecs.append(topic_vec)

In [93]:
train_vecs[2]

[0.0017864703,
 0.0017866711,
 0.0017862209,
 0.0017862006,
 0.0017864952,
 0.0017864573,
 0.0017863358,
 0.16800493,
 0.8177035,
 0.0017867765,
 712]

In [94]:
X = np.array(train_vecs)
y = np.array(df.score)

In [95]:
from sklearn import model_selection, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.ensemble import RandomForestClassifier

In [96]:
print(X.shape, y.shape)

(4143, 11) (4143,)


In [97]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.30, random_state = 0)

In [98]:
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(2900, 11) (2900,)
(1243, 11) (1243,)


In [100]:
# Scale Data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [101]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train_scale, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [103]:
# Predicting the Test set results
y_pred = classifier.predict(X_test_scale)

In [104]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm

array([[103, 172],
       [179, 789]], dtype=int64)

In [105]:
from sklearn.metrics import accuracy_score, recall_score
print(accuracy_score(y_test, y_pred))
#print(recall_score(y_test, y_pred))

0.7176186645213194
