In [95]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np 
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , confusion_matrix

In [96]:
test=pd.read_csv('drugsComTest_raw.csv')


In [97]:
train=pd.read_csv('drugsComTrain_raw.csv')

In [98]:
df=train.append(test)

In [99]:
df['review'].head()

0    "It has no side effect, I take it in combinati...
1    "My son is halfway through his fourth week of ...
2    "I used to take another oral contraceptive, wh...
3    "This is my first time using any form of birth...
4    "Suboxone has completely turned my life around...
Name: review, dtype: object

In [100]:
clean = pd.read_csv('x_cleansed.csv',names=['review'] )

In [101]:
clean.head()

Unnamed: 0,review
0,side effect take combination bystolic mg f...
1,son halfway fourth week intuniv became conce...
2,used take another oral contraceptive pill...
3,first time using form birth control gla...
4,suboxone completely turned life around feel...


In [102]:
df['review'] = clean['review']

In [103]:
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,side effect take combination bystolic mg f...,9,20-May-12,27
1,95260,Guanfacine,ADHD,son halfway fourth week intuniv became conce...,8,27-Apr-10,192
2,92703,Lybrel,Birth Control,used take another oral contraceptive pill...,5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,first time using form birth control gla...,8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,suboxone completely turned life around feel...,9,27-Nov-16,37


In [104]:
df.shape

(215063, 7)

### remove the least frequent items in the drugname because it may lead us to bias the data  

In [105]:
key = list(df['drugName'].value_counts().to_dict().keys())
value = list(df['drugName'].value_counts().to_dict().values())

In [106]:
to_drop = []
for i in range(len(key)):
    if value[i] < 50:
        to_drop.append(key[i])

In [107]:
combined=df[~df['drugName'].isin(to_drop )]


In [108]:
combined.shape , df.shape 

((188990, 7), (215063, 7))

In [109]:
combined.drugName.value_counts()

Levonorgestrel                             4930
Etonogestrel                               4421
Ethinyl estradiol / norethindrone          3753
Nexplanon                                  2892
Ethinyl estradiol / norgestimate           2790
Ethinyl estradiol / levonorgestrel         2503
Phentermine                                2085
Sertraline                                 1868
Escitalopram                               1747
Mirena                                     1673
Implanon                                   1506
Gabapentin                                 1415
Bupropion                                  1369
Miconazole                                 1344
Venlafaxine                                1338
Medroxyprogesterone                        1308
Citalopram                                 1308
Duloxetine                                 1256
Lexapro                                    1250
Bupropion / naltrexone                     1249
Contrave                                

In [110]:
combined.columns

Index(['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date',
       'usefulCount'],
      dtype='object')

In [111]:
def to_cat (value):
    if (value <= 4 ):
        val = 0
    elif (value < 8 ):
        val = 1
    else:
        val = 2
    return val

In [112]:
combined['rating'] = combined['rating'].apply(to_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [113]:
combined['rating'].value_counts()

2    112679
0     47760
1     28551
Name: rating, dtype: int64

### remove the least frequent items in the condition  because it may lead us to bias the data  

In [None]:
keys = list(combined['condition'].value_counts().to_dict().keys())
values = list(combined['condition'].value_counts().to_dict().values())

In [116]:
to_drop = []
for i in range(len(keys)):
    if values[i] < 50:
        to_drop.append(keys[i])

In [117]:
combined=combined[~combined['condition'].isin(to_drop )]

In [118]:
combined.shape

(184930, 7)

In [119]:
combined.count()

uniqueID       184930
drugName       184930
condition      183857
review         184930
rating         184930
date           184930
usefulCount    184930
dtype: int64

In [120]:
combined.isna().sum()

uniqueID          0
drugName          0
condition      1073
review            0
rating            0
date              0
usefulCount       0
dtype: int64

In [121]:
combined['condition'].value_counts()

Birth Control                                                 37498
Depression                                                    11823
Anxiety                                                        7624
Pain                                                           7240
Acne                                                           6961
Bipolar Disorde                                                5480
Weight Loss                                                    4790
Obesity                                                        4673
Insomnia                                                       4521
ADHD                                                           4252
Emergency Contraception                                        3211
Vaginal Yeast Infection                                        2999
Diabetes, Type 2                                               2737
Abnormal Uterine Bleeding                                      2674
Bowel Preparation                               

In [122]:
combined.shape

(184930, 7)

### drop missing values if exist 

In [123]:
combined = combined.dropna(axis=0, how='any')
#combined['condition'].fillna(combined.groupby('drugName')['condition'].transform("mean") , inplace = True)

In [124]:
combined.shape
#combined.groupby(['drugName','condition']).agg(lambda x: x.value_counts().index[0])

(183857, 7)

In [125]:
x  = combined['review']
y  = combined['rating']

### perform the tfidf model on the data to calculate the weighted wods 

In [126]:
tfidf = TfidfVectorizer()
# x = tfidf.fit_transform(data["title"]).toarray()

In [127]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)

In [128]:
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(137892,) (137892,)
(45965,) (45965,)


In [129]:
model = tfidf.fit_transform(x_train)
predection = tfidf.transform(x_test)

In [130]:
lr = LogisticRegression(random_state=0)
lr.fit(model, y_train)
print (lr.score(predection,y_test))
y_pred = lr.predict(predection)
print(classification_report(y_test, y_pred))




0.7433917110845208
              precision    recall  f1-score   support

           0       0.70      0.69      0.69     11619
           1       0.48      0.13      0.20      6927
           2       0.77      0.92      0.84     27419

   micro avg       0.74      0.74      0.74     45965
   macro avg       0.65      0.58      0.58     45965
weighted avg       0.71      0.74      0.71     45965

