In [18]:
import pandas as pd
import numpy as np
from encode import onehotencode

In [2]:
df = pd.read_csv('../medicine_prescription_records.csv', index_col=0)
df

Unnamed: 0,specialty,years_practicing,cms_prescription_counts
0,Nephrology,7,"DOXAZOSIN MESYLATE, MIDODRINE HCL, MEGESTROL A..."
1,General Practice,7,"CEPHALEXIN, AMOXICILLIN, HYDROCODONE-ACETAMINO..."
2,General Practice,7,"CEPHALEXIN, AMOXICILLIN, CLINDAMYCIN HCL"
3,General Practice,7,AMOXICILLIN
4,Nephrology,6,"PIOGLITAZONE HCL, BENAZEPRIL HCL, DIAZEPAM, HY..."
...,...,...,...
239925,Family,7,"SIMVASTATIN, NEXIUM, CHLORHEXIDINE GLUCONATE, ..."
239926,Surgical Technologist,2,"SANTYL, CLOPIDOGREL"
239927,Psych/Mental Health,3,"HYDROXYZINE PAMOATE, BUSPIRONE HCL, SEROQUEL X..."
239928,Family,2,"CIPROFLOXACIN HCL, AMOXICILLIN, IBUPROFEN, NEX..."


In [3]:
drugs = np.hstack(df['cms_prescription_counts'].str.split(', ').values)

In [4]:
drug_counts = pd.Series(drugs).value_counts()
drug_counts

HYDROCODONE-ACETAMINOPHEN        74805
AMOXICILLIN                      60566
GABAPENTIN                       59619
OMEPRAZOLE                       58947
LISINOPRIL                       57142
                                 ...  
BD INSULIN SYRINGE MICRO-FINE        1
OXYTOCIN                             1
NAGLAZYME                            1
METROCREAM                           1
TESTRED                              1
Length: 2397, dtype: int64

In [5]:
top5drugs = list(drug_counts.head(5).index)
top5drugs

['HYDROCODONE-ACETAMINOPHEN',
 'AMOXICILLIN',
 'GABAPENTIN',
 'OMEPRAZOLE',
 'LISINOPRIL']

In [6]:
pat = r"\b(?!({}))\b\S+".format(')|('.join(top5drugs).replace('-', '\-'))
pat

'\\b(?!(HYDROCODONE\\-ACETAMINOPHEN)|(AMOXICILLIN)|(GABAPENTIN)|(OMEPRAZOLE)|(LISINOPRIL))\\b\\S+'

In [7]:
top5drugs_df = df.copy(deep=True)

In [8]:
top5drugs_df['cms_prescription_counts'] = df['cms_prescription_counts'].str.replace(pat, '', regex=True).str.strip().str.replace('\s+', ' ', regex=True)

In [9]:
top5drugs_df

Unnamed: 0,specialty,years_practicing,cms_prescription_counts
0,Nephrology,7,HYDROCODONE GABAPENTIN OMEPRAZOLE LISINOPRIL L...
1,General Practice,7,AMOXICILLIN HYDROCODONE
2,General Practice,7,AMOXICILLIN
3,General Practice,7,AMOXICILLIN
4,Nephrology,6,HYDROCODONE OMEPRAZOLE LISINOPRIL AMOXICILLIN ...
...,...,...,...
239925,Family,7,LISINOPRIL LISINOPRIL OMEPRAZOLE
239926,Surgical Technologist,2,
239927,Psych/Mental Health,3,GABAPENTIN
239928,Family,2,AMOXICILLIN LISINOPRIL LISINOPRIL


In [10]:
top5drugs_df.loc[12501]

specialty                  Surgical Critical Care
years_practicing                                7
cms_prescription_counts                         &
Name: 12501, dtype: object

In [11]:
top5drugs_df['cms_prescription_counts'].replace('', np.NaN).dropna()

0         HYDROCODONE GABAPENTIN OMEPRAZOLE LISINOPRIL L...
1                                   AMOXICILLIN HYDROCODONE
2                                               AMOXICILLIN
3                                               AMOXICILLIN
4         HYDROCODONE OMEPRAZOLE LISINOPRIL AMOXICILLIN ...
                                ...                        
239924                              AMOXICILLIN HYDROCODONE
239925                     LISINOPRIL LISINOPRIL OMEPRAZOLE
239927                                           GABAPENTIN
239928                    AMOXICILLIN LISINOPRIL LISINOPRIL
239929                                          AMOXICILLIN
Name: cms_prescription_counts, Length: 175929, dtype: object

In [12]:
top5drugs_df['cms_prescription_counts'] = top5drugs_df['cms_prescription_counts'].str.replace('[^\w\- ]', '', regex=True).replace('', np.NaN)

In [13]:
top5drugs_df = top5drugs_df.dropna()

In [14]:
top5drugs_df

Unnamed: 0,specialty,years_practicing,cms_prescription_counts
0,Nephrology,7,HYDROCODONE GABAPENTIN OMEPRAZOLE LISINOPRIL L...
1,General Practice,7,AMOXICILLIN HYDROCODONE
2,General Practice,7,AMOXICILLIN
3,General Practice,7,AMOXICILLIN
4,Nephrology,6,HYDROCODONE OMEPRAZOLE LISINOPRIL AMOXICILLIN ...
...,...,...,...
239924,General Practice,6,AMOXICILLIN HYDROCODONE
239925,Family,7,LISINOPRIL LISINOPRIL OMEPRAZOLE
239927,Psych/Mental Health,3,GABAPENTIN
239928,Family,2,AMOXICILLIN LISINOPRIL LISINOPRIL


In [15]:
top5drugs_df

Unnamed: 0,specialty,years_practicing,cms_prescription_counts
0,Nephrology,7,HYDROCODONE GABAPENTIN OMEPRAZOLE LISINOPRIL L...
1,General Practice,7,AMOXICILLIN HYDROCODONE
2,General Practice,7,AMOXICILLIN
3,General Practice,7,AMOXICILLIN
4,Nephrology,6,HYDROCODONE OMEPRAZOLE LISINOPRIL AMOXICILLIN ...
...,...,...,...
239924,General Practice,6,AMOXICILLIN HYDROCODONE
239925,Family,7,LISINOPRIL LISINOPRIL OMEPRAZOLE
239927,Psych/Mental Health,3,GABAPENTIN
239928,Family,2,AMOXICILLIN LISINOPRIL LISINOPRIL


In [16]:
np.unique(np.hstack(top5drugs_df['cms_prescription_counts'].str.extract('(\w+)', expand=False).values))

array(['AMOXICILLIN', 'GABAPENTIN', 'HYDROCODONE', 'LISINOPRIL',
       'OMEPRAZOLE'], dtype='<U11')

In [20]:
top5drugs_df.to_csv('top5drugs.csv')

In [22]:
encoded = onehotencode(top5drugs_df['cms_prescription_counts'])

(5,)


  encoded[data_s.str.contains(f'({word})'), i] = 1
100%|██████████| 5/5 [00:00<00:00,  6.87it/s]


In [None]:
np.save('top5encoded.npy', encoded)

In [25]:
top5drugs_df['encoded'] = [list(arr) for arr in encoded.astype(int)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top5drugs_df['encoded'] = [list(arr) for arr in encoded.astype(int)]


In [27]:
top5drugs_df.to_csv('top5drugs_encoded.csv')

In [26]:
top5drugs_df

Unnamed: 0,specialty,years_practicing,cms_prescription_counts,encoded
0,Nephrology,7,HYDROCODONE GABAPENTIN OMEPRAZOLE LISINOPRIL L...,"[0, 1, 1, 1, 1]"
1,General Practice,7,AMOXICILLIN HYDROCODONE,"[1, 0, 1, 0, 0]"
2,General Practice,7,AMOXICILLIN,"[1, 0, 0, 0, 0]"
3,General Practice,7,AMOXICILLIN,"[1, 0, 0, 0, 0]"
4,Nephrology,6,HYDROCODONE OMEPRAZOLE LISINOPRIL AMOXICILLIN ...,"[1, 1, 1, 1, 1]"
...,...,...,...,...
239924,General Practice,6,AMOXICILLIN HYDROCODONE,"[1, 0, 1, 0, 0]"
239925,Family,7,LISINOPRIL LISINOPRIL OMEPRAZOLE,"[0, 0, 0, 1, 1]"
239927,Psych/Mental Health,3,GABAPENTIN,"[0, 1, 0, 0, 0]"
239928,Family,2,AMOXICILLIN LISINOPRIL LISINOPRIL,"[1, 0, 0, 1, 0]"


In [19]:
onehotencode(top5drugs_df['cms_prescription_counts'])

(5,)


  encoded[data_s.str.contains(f'({word})'), i] = 1
100%|██████████| 5/5 [00:00<00:00,  7.44it/s]


array([[0., 1., 1., 1., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.]])

In [None]:
clean_top5drugs_df

0         HYDROCODONE GABAPENTIN OMEPRAZOLE LISINOPRIL L...
1                                   AMOXICILLIN HYDROCODONE
2                                               AMOXICILLIN
3                                               AMOXICILLIN
4         HYDROCODONE OMEPRAZOLE LISINOPRIL AMOXICILLIN ...
                                ...                        
239924                              AMOXICILLIN HYDROCODONE
239925                     LISINOPRIL LISINOPRIL OMEPRAZOLE
239927                                           GABAPENTIN
239928                    AMOXICILLIN LISINOPRIL LISINOPRIL
239929                                          AMOXICILLIN
Name: cms_prescription_counts, Length: 175927, dtype: object

In [None]:
clean_top5drugs_df.str.len().min()

10

In [None]:
clean_top5drugs_df.to_csv('top5drugs.csv')

In [None]:
np.unique(np.hstack(clean_top5drugs_df.str.split(' ').values))

array(['', 'AMOXICILLIN', 'GABAPENTIN', 'HYDROCODONE', 'LISINOPRIL',
       'OMEPRAZOLE'], dtype='<U11')

In [None]:
clean_top5drugs_df = clean_top5drugs_df.str.replace('[#&\(]', '', regex=True)

In [None]:
clean_top5drugs_df.str.len().min()

10

In [None]:
np.unique(np.char.strip(np.hstack(clean_top5drugs_df.str.strip().str.split(',').values)))

array(['', 'AMOXICILLIN', 'GABAPENTIN', 'HYDROCODONE', 'LISINOPRIL',
       'OMEPRAZOLE'], dtype='<U12')

In [None]:
small_df = df[df['cms_prescription_counts'].str.contains('|'.join(top5drugs))]
small_df

Unnamed: 0,specialty,years_practicing,cms_prescription_counts
0,Nephrology,7,"DOXAZOSIN MESYLATE, MIDODRINE HCL, MEGESTROL A..."
1,General Practice,7,"CEPHALEXIN, AMOXICILLIN, HYDROCODONE-ACETAMINO..."
2,General Practice,7,"CEPHALEXIN, AMOXICILLIN, CLINDAMYCIN HCL"
3,General Practice,7,AMOXICILLIN
4,Nephrology,6,"PIOGLITAZONE HCL, BENAZEPRIL HCL, DIAZEPAM, HY..."
...,...,...,...
239924,General Practice,6,"AMOXICILLIN, HYDROCODONE-ACETAMINOPHEN, CHLORH..."
239925,Family,7,"SIMVASTATIN, NEXIUM, CHLORHEXIDINE GLUCONATE, ..."
239927,Psych/Mental Health,3,"HYDROXYZINE PAMOATE, BUSPIRONE HCL, SEROQUEL X..."
239928,Family,2,"CIPROFLOXACIN HCL, AMOXICILLIN, IBUPROFEN, NEX..."


In [None]:
small_df.to_csv('top5medicine.csv')

In [None]:
from encode import onehotencode

In [None]:
res = onehotencode(df['cms_prescription_counts'])

(2397,)


100%|██████████| 2397/2397 [12:43<00:00,  3.14it/s]


In [None]:
np.save('onehotencoded.npy', res)

In [None]:
res.sum()

5341972.0

In [None]:
df['cms_prescription_counts'].str.split(', ').values

array([list(['DOXAZOSIN MESYLATE', 'MIDODRINE HCL', 'MEGESTROL ACETATE', 'BENAZEPRIL HCL', 'METOLAZONE', 'NOVOLOG', 'DIAZEPAM', 'HYDRALAZINE HCL', 'SENSIPAR', 'LABETALOL HCL', 'PREDNISONE', 'CALCITRIOL', 'HYDROCODONE-ACETAMINOPHEN', 'HYDROCHLOROTHIAZIDE', 'LOSARTAN-HYDROCHLOROTHIAZIDE', 'FENOFIBRATE', 'MINOXIDIL', 'MELOXICAM', 'ATENOLOL', 'CARISOPRODOL', 'GABAPENTIN', 'OMEPRAZOLE', 'KLOR-CON M10', 'LANTUS', 'AMLODIPINE BESYLATE', 'CARVEDILOL', 'LOSARTAN POTASSIUM', 'IRBESARTAN', 'NIFEDICAL XL', 'NIFEDIPINE ER', 'LEVOTHYROXINE SODIUM', 'POTASSIUM CHLORIDE', 'FUROSEMIDE', 'GLYBURIDE', 'CLONIDINE HCL', 'TEMAZEPAM', 'SPIRONOLACTONE', 'LOVASTATIN', 'LISINOPRIL', 'PANTOPRAZOLE SODIUM', 'CALCIUM ACETATE', 'NEXIUM', 'ZOLPIDEM TARTRATE', 'DIOVAN', 'OXYCODONE HCL', 'METOPROLOL SUCCINATE', 'RANITIDINE HCL', 'ATORVASTATIN CALCIUM', 'TAMSULOSIN HCL', 'OXYBUTYNIN CHLORIDE', 'LISINOPRIL-HYDROCHLOROTHIAZIDE', 'METOPROLOL TARTRATE', 'AMLODIPINE BESYLATE-BENAZEPRIL', 'BUMETANIDE', 'BYSTOLIC', 'ISOSORBID