In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,HashingVectorizer

## DATASET : Drug Review Dataset (Druglib.com) Data Set
https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Druglib.com%29

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00461/drugLib_raw.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
!unzip 'drugLib_raw.zip'

'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [4]:

drug_lib_test_df = pd.read_csv('drugLibTest_raw.tsv',sep='\t',index_col=0)

In [5]:
#drug_lib_test_df.head()

In [6]:
drug_lib_train_df = pd.read_csv('drugLibTrain_raw.tsv',sep='\t',index_col=0)

In [7]:
#drug_lib_train_df.head()

In [8]:
drug_reviews_df = pd.concat([drug_lib_test_df,drug_lib_train_df])

In [9]:
drug_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4143 entries, 1366 to 2748
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   urlDrugName        4143 non-null   object
 1   rating             4143 non-null   int64 
 2   effectiveness      4143 non-null   object
 3   sideEffects        4143 non-null   object
 4   condition          4142 non-null   object
 5   benefitsReview     4143 non-null   object
 6   sideEffectsReview  4141 non-null   object
 7   commentsReview     4135 non-null   object
dtypes: int64(1), object(7)
memory usage: 291.3+ KB


In [10]:
drug_reviews_df = drug_reviews_df.dropna().reset_index(drop=True)
drug_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4132 entries, 0 to 4131
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   urlDrugName        4132 non-null   object
 1   rating             4132 non-null   int64 
 2   effectiveness      4132 non-null   object
 3   sideEffects        4132 non-null   object
 4   condition          4132 non-null   object
 5   benefitsReview     4132 non-null   object
 6   sideEffectsReview  4132 non-null   object
 7   commentsReview     4132 non-null   object
dtypes: int64(1), object(7)
memory usage: 258.4+ KB


In [11]:
drug_reviews_df.head(20)

Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,biaxin,9,Considerably Effective,Mild Side Effects,sinus infection,The antibiotic may have destroyed bacteria cau...,"Some back pain, some nauseau.",Took the antibiotics for 14 days. Sinus infect...
1,lamictal,9,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...
2,depakene,4,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...
3,sarafem,10,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...
4,accutane,10,Highly Effective,Mild Side Effects,nodular acne,Within one week of treatment superficial acne ...,Side effects included moderate to severe dry s...,Drug was taken in gelatin tablet at 0.5 mg per...
5,biaxin,2,Marginally Effective,No Side Effects,sinus infection,By the end of the 10-day treatment I felt bett...,I felt no significant side effects - perhaps s...,Basically the treatment did not seem to work. ...
6,carbamazepine,8,Considerably Effective,Moderate Side Effects,seizure,reduction in seizures reduction in seizures re...,tired/sleepy very tired sleep and tired very t...,took it for seizure took pills drank with wate...
7,ultram-er,10,Highly Effective,Mild Side Effects,cervical disk degeneration and lower back pain,Ive been taking Tramadol for 2 weeks now. Ive ...,I have had no side effects so far. I hope it s...,"Treating for neck, shoulder, arms, lower back,..."
8,klonopin,10,Highly Effective,No Side Effects,panic disorder,I immediately stopped having panic attacks. I...,I experienced no side effects. I was not tire...,I started both klonopin and prozac together. ...
9,effexor,1,Marginally Effective,Extremely Severe Side Effects,depression,the presumed benefits were to help with a seve...,here we go.the initial effect would be dry mou...,family doctor initially prescribed wellbutin b...


## Use case : Attempt to predict sideEffects from sideEffects Review

In [12]:
side_effects_df = drug_reviews_df[['urlDrugName','sideEffects','sideEffectsReview']]

In [13]:
side_effects_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4132 entries, 0 to 4131
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   urlDrugName        4132 non-null   object
 1   sideEffects        4132 non-null   object
 2   sideEffectsReview  4132 non-null   object
dtypes: object(3)
memory usage: 97.0+ KB


In [14]:
side_effects_df['sideEffects'].unique()

array(['Mild Side Effects', 'Severe Side Effects', 'No Side Effects',
       'Moderate Side Effects', 'Extremely Severe Side Effects'],
      dtype=object)

In [15]:
side_effects_df['sideEffects'].nunique()

5

In [16]:
side_effects_df['sideEffects'].value_counts()

Mild Side Effects                1347
No Side Effects                  1193
Moderate Side Effects             848
Severe Side Effects               489
Extremely Severe Side Effects     255
Name: sideEffects, dtype: int64

In [17]:
side_effects_df.loc[8]['sideEffectsReview']       #labelled as No Side Effect

'I experienced no side effects.  I was not tired nor did I feel medicated.'

In [18]:
side_effects_df.loc[5]['sideEffectsReview']       #labelled as No Side Effect

'I felt no significant side effects - perhaps some slight drowsiness.'

## **What happens when texts are not represented**

In [19]:
x_train_unprocessed, x_test_unprocessed, y_train_unprocessed, y_test_unprocessed = train_test_split(side_effects_df['sideEffectsReview'], side_effects_df['sideEffects'], test_size=0.2, random_state=0)

In [20]:
# clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(x_train_unprocessed, y_train_unprocessed)

## Different Ways of Encoding Text Label

###Label encoding using manual assignment

In [21]:
side_effects_df_1 = side_effects_df.copy()
side_effects_df_1.loc[side_effects_df_1['sideEffects'] =='Extremely Severe Side Effects',"manual_label_encode"] = 0
side_effects_df_1.loc[side_effects_df_1['sideEffects'] =='Mild Side Effects',"manual_label_encode"] = 1
side_effects_df_1.loc[side_effects_df_1['sideEffects'] =='Moderate Side Effects',"manual_label_encode"] = 2
side_effects_df_1.loc[side_effects_df_1['sideEffects'] =='No Side Effects',"manual_label_encode"] = 3
side_effects_df_1.loc[side_effects_df_1['sideEffects'] =='Severe Side Effects',"manual_label_encode"] = 4


In [22]:
side_effects_df_1.head()

Unnamed: 0,urlDrugName,sideEffects,sideEffectsReview,manual_label_encode
0,biaxin,Mild Side Effects,"Some back pain, some nauseau.",1.0
1,lamictal,Mild Side Effects,"Drowsiness, a bit of mental numbness. If you t...",1.0
2,depakene,Severe Side Effects,"Depakene has a very thin coating, which caused...",4.0
3,sarafem,No Side Effects,I didnt really notice any side effects.,3.0
4,accutane,Mild Side Effects,Side effects included moderate to severe dry s...,1.0


### Label Encoding Using pandas.astype('category)


In [23]:
side_effects_df_2 = side_effects_df.copy()
side_effects_df_2['pandas_as_category_label_encode']  = side_effects_df_2['sideEffects'].astype('category').cat.codes

In [24]:
side_effects_df_2.head()

Unnamed: 0,urlDrugName,sideEffects,sideEffectsReview,pandas_as_category_label_encode
0,biaxin,Mild Side Effects,"Some back pain, some nauseau.",1
1,lamictal,Mild Side Effects,"Drowsiness, a bit of mental numbness. If you t...",1
2,depakene,Severe Side Effects,"Depakene has a very thin coating, which caused...",4
3,sarafem,No Side Effects,I didnt really notice any side effects.,3
4,accutane,Mild Side Effects,Side effects included moderate to severe dry s...,1


###Label Encoding using pandas.factorize

In [25]:
side_effects_df_3 = side_effects_df.copy()
side_effects_df_3['pandas_factorize_label_encode'] = side_effects_df_3['sideEffects'].factorize(sort=True)[0]

In [26]:
side_effects_df_3.head()

Unnamed: 0,urlDrugName,sideEffects,sideEffectsReview,pandas_factorize_label_encode
0,biaxin,Mild Side Effects,"Some back pain, some nauseau.",1
1,lamictal,Mild Side Effects,"Drowsiness, a bit of mental numbness. If you t...",1
2,depakene,Severe Side Effects,"Depakene has a very thin coating, which caused...",4
3,sarafem,No Side Effects,I didnt really notice any side effects.,3
4,accutane,Mild Side Effects,Side effects included moderate to severe dry s...,1


###  Label Encoding Using Dictionary Mapping
#### Remember to talk about mapping as an option to set ordinal category label

In [27]:
side_effects_df_4 = side_effects_df.copy()
side_effects_dict = {'Mild Side Effects' : 1 ,'Severe Side Effects' : 4,'No Side Effects' : 3,'Moderate Side Effects' : 2,'Extremely Severe Side Effects' : 0}
side_effects_df_4['map_label_encode'] = side_effects_df_4['sideEffects'].map(side_effects_dict)

In [28]:
side_effects_df_4.head()

Unnamed: 0,urlDrugName,sideEffects,sideEffectsReview,map_label_encode
0,biaxin,Mild Side Effects,"Some back pain, some nauseau.",1
1,lamictal,Mild Side Effects,"Drowsiness, a bit of mental numbness. If you t...",1
2,depakene,Severe Side Effects,"Depakene has a very thin coating, which caused...",4
3,sarafem,No Side Effects,I didnt really notice any side effects.,3
4,accutane,Mild Side Effects,Side effects included moderate to severe dry s...,1


### Label Encoding Using Sklearn's LabelEncoder

In [29]:
side_effects_df_5 = side_effects_df.copy()
encoder = LabelEncoder()
side_effects_df_5['sklearn_label_encode'] = encoder.fit_transform(side_effects_df_5['sideEffects'])

In [30]:
side_effects_df_5.head()

Unnamed: 0,urlDrugName,sideEffects,sideEffectsReview,sklearn_label_encode
0,biaxin,Mild Side Effects,"Some back pain, some nauseau.",1
1,lamictal,Mild Side Effects,"Drowsiness, a bit of mental numbness. If you t...",1
2,depakene,Severe Side Effects,"Depakene has a very thin coating, which caused...",4
3,sarafem,No Side Effects,I didnt really notice any side effects.,3
4,accutane,Mild Side Effects,Side effects included moderate to severe dry s...,1


### One Hot Encoding Using panda.get_dummies

In [70]:
side_effects_df_6 = side_effects_df.copy()
dummy_df = pd.get_dummies(side_effects_df_6['sideEffects'],prefix=['sideEffects'])
side_effects_df_6 =  pd.concat([side_effects_df_6,dummy_df],axis=1)
y = side_effects_df_6

In [71]:
side_effects_df_6.head()

Unnamed: 0,urlDrugName,sideEffects,sideEffectsReview,['sideEffects']_Extremely Severe Side Effects,['sideEffects']_Mild Side Effects,['sideEffects']_Moderate Side Effects,['sideEffects']_No Side Effects,['sideEffects']_Severe Side Effects
0,biaxin,Mild Side Effects,"Some back pain, some nauseau.",0,1,0,0,0
1,lamictal,Mild Side Effects,"Drowsiness, a bit of mental numbness. If you t...",0,1,0,0,0
2,depakene,Severe Side Effects,"Depakene has a very thin coating, which caused...",0,0,0,0,1
3,sarafem,No Side Effects,I didnt really notice any side effects.,0,0,0,1,0
4,accutane,Mild Side Effects,Side effects included moderate to severe dry s...,0,1,0,0,0


## sklearn's CountVectorizer

In [72]:
x_train,x_test,y_train,y_test = train_test_split(side_effects_df_5['sideEffectsReview'],side_effects_df_5['sklearn_label_encode'], test_size=0.2, random_state=42)

In [73]:
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)

In [74]:
x_train_cv.shape, x_test_cv.shape

((3305, 7174), (827, 7174))

In [75]:
#x_train_cv[0:5]

In [76]:
x_array = x_train_cv.toarray()
x_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
cv.inverse_transform(x_array[10])

[array(['after', 'been', 'causes', 'constipation', 'day', 'days', 'die',
        'drive', 'go', 'have', 'headaches', 'heroin', 'ihad', 'ill',
        'into', 'it', 'killed', 'like', 'meds', 'missed', 'more', 'much',
        'must', 'my', 'never', 'nursing', 'of', 'or', 'per', 'serious',
        'sex', 'skipping', 'so', 'softeners', 'stool', 'stop', 'take',
        'than', 'three', 'to', 'two', 'wanted', 'withdrawal', 'you'],
       dtype='<U18')]

In [39]:
type(cv.vocabulary_)

dict

In [40]:
vocab_df = pd.Series(cv.vocabulary_).to_frame('feature_count')

In [41]:
vocab_df.T

Unnamed: 0,edema,excesive,bleeding,felt,nauseous,bloated,the,biggest,side,effect,...,homebound,realistic,sugary,minocin,tinnutus,dismay,furthermore,fortunately,inert,clockwork
feature_count,2155,2377,876,2534,4208,892,6359,834,5652,2166,...,3103,5109,6126,4039,6479,1961,2782,2717,3320,1258


In [42]:
vocab_df.sort_values(by='feature_count').T

Unnamed: 0,00,000,000mg,00pm,025,05,07,08,10,100,...,zithromycin,zocor,zofran,zoloft,zombie,zombing,zomig,zyban,zyprexa,zyrtec
feature_count,0,1,2,3,4,5,6,7,8,9,...,7164,7165,7166,7167,7168,7169,7170,7171,7172,7173


##sklearn's CountVectorizer with ngram

In [43]:
cv2 = CountVectorizer(ngram_range=(1,2))
x_train_cv_2= cv2.fit_transform(x_train)
x_test_cv_2 = cv2.transform(x_test)

In [44]:
x_train_cv_2.shape, x_test_cv_2.shape

((3305, 55581), (827, 55581))

In [45]:
x_array_2 = x_train_cv_2.toarray()
x_array_2

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [46]:
cv2.inverse_transform(x_array[10])

[array(['2nd and', 'about pos', 'activiites also', 'affective immediatley',
        'afterward were', 'afterwards', 'alarming but', 'allergy test',
        'and after', 'and comes', 'and consultant', 'and disabling',
        'and glutamate', 'and going', 'and mentally', 'and nothing',
        'and post', 'and steroid', 'anger the', 'antacid',
        'antibiotics first', 'anxiety anxiety', 'anxiety haven',
        'anxiety hyperactivity', 'any fragrance', 'any typical',
        'anything happens', 'apnea', 'approx', 'attribute', 'augmentin 22',
        'away swiftly', 'back currently', 'back maybe', 'bathroom due',
        'bathroom had', 'be unrelated', 'became sizes', 'because tired',
        'bed anything', 'been sleeping', 'being punched', 'better off',
        'biotin'], dtype='<U29')]

In [47]:
vocab_df_2 = pd.Series(cv2.vocabulary_).to_frame('feature_count')

In [48]:
vocab_df_2.T

Unnamed: 0,edema,excesive,bleeding,excesive bleeding,felt,nauseous,bloated,felt nauseous,nauseous bloated,the,...,inert,clockwork,whatsoever periods,periods were,on inert,inert pills,pills periods,periods came,came like,like clockwork
feature_count,14790,16164,7278,16165,17500,31284,7362,17562,31288,46195,...,23773,9490,53552,36299,34105,23774,36626,36288,8495,26783


In [49]:
vocab_df_2.sort_values(by='feature_count').T

Unnamed: 0,00,00 and,00 at,000,000 on,000 people,000mg,000mg per,00pm,00pm makes,...,zyban made,zyban should,zyprexa,zyprexa for,zyrtec,zyrtec at,zyrtec however,zyrtec in,zyrtec often,zyrtec we
feature_count,0,1,2,3,4,5,6,7,8,9,...,55571,55572,55573,55574,55575,55576,55577,55578,55579,55580


## sklearn's TfidfVectorizer

In [50]:
tv = TfidfVectorizer()
x_train_tv = tv.fit_transform(x_train)
x_test_tv = tv.transform(x_test)

In [51]:
x_train_tv.shape, x_test_tv.shape

((3305, 7174), (827, 7174))

In [52]:
t_array = x_train_tv.toarray()
t_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
tv.inverse_transform(t_array[10])

[array(['after', 'been', 'causes', 'constipation', 'day', 'days', 'die',
        'drive', 'go', 'have', 'headaches', 'heroin', 'ihad', 'ill',
        'into', 'it', 'killed', 'like', 'meds', 'missed', 'more', 'much',
        'must', 'my', 'never', 'nursing', 'of', 'or', 'per', 'serious',
        'sex', 'skipping', 'so', 'softeners', 'stool', 'stop', 'take',
        'than', 'three', 'to', 'two', 'wanted', 'withdrawal', 'you'],
       dtype='<U18')]

In [54]:
pd.Series(tv.idf_).to_frame('feature_count').T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7164,7165,7166,7167,7168,7169,7170,7171,7172,7173
feature_count,7.7172,8.004882,8.410347,8.410347,8.410347,8.410347,8.004882,8.410347,4.898802,6.538545,...,8.410347,8.410347,8.004882,6.90627,6.395444,8.410347,8.004882,7.494056,8.004882,7.494056


In [55]:
vocab_df_3 = pd.Series(tv.vocabulary_).to_frame('feature_count')

In [56]:
vocab_df_3.T

Unnamed: 0,edema,excesive,bleeding,felt,nauseous,bloated,the,biggest,side,effect,...,homebound,realistic,sugary,minocin,tinnutus,dismay,furthermore,fortunately,inert,clockwork
feature_count,2155,2377,876,2534,4208,892,6359,834,5652,2166,...,3103,5109,6126,4039,6479,1961,2782,2717,3320,1258


In [57]:
vocab_df_3.sort_values(by='feature_count').T

Unnamed: 0,00,000,000mg,00pm,025,05,07,08,10,100,...,zithromycin,zocor,zofran,zoloft,zombie,zombing,zomig,zyban,zyprexa,zyrtec
feature_count,0,1,2,3,4,5,6,7,8,9,...,7164,7165,7166,7167,7168,7169,7170,7171,7172,7173


Sklearn's HashingVectorizer

In [66]:
hv = HashingVectorizer()
x_train = hv.fit_transform(x_train)
x_test = hv.transform(x_test)

In [67]:
x_test.shape, x_train.shape

((827, 1048576), (3305, 1048576))

In [68]:
print(hv.get_params())

{'alternate_sign': True, 'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'n_features': 1048576, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'stop_words': None, 'strip_accents': None, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None}


In [69]:
# #Test Splits
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


NameError: name 'x' is not defined