## Getting the list of unique values

This notebook reports, for each categorical feature, how many unique values it has and what those values are.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pharmacy transactions from the CSV sitting next to the notebook.
df = pd.read_csv('pharmacy_tx.csv')

# Split the raw 'drug' string at its first space: the leading piece becomes the
# new 'type' column and the remainder overwrites 'drug'.
# n=1 caps the result at two columns, so a value containing more than one
# space cannot produce extra columns and break the two-column assignment.
df[['type', 'drug']] = df['drug'].str.split(' ', n=1, expand=True)

In [3]:
# Number of distinct values in every column (defaults spelled out: counted
# per column, with NaN excluded from the count).
df.nunique(axis=0, dropna=True)

tx_date          363
pharmacy          58
diagnosis        133
drug              77
bin               12
pcn               48
group             48
rejected           2
patient_pay    20437
type               2
dtype: int64

In [4]:
# Distinct values found in the 'diagnosis' column.
df['diagnosis'].unique()

array(['G99.93', 'U60.52', 'Q85.91', 'N55.01', 'K32.86', 'U27.71',
       'I68.27', 'Q72.66', 'M42.24', 'G51.87', 'I38.43', 'P77.13',
       'P07.55', 'C98.15', 'E71.74', 'H36.57', 'W50.87', 'Z66.42',
       'Y11.84', 'H60.83', 'C10.29', 'N80.59', 'K87.68', 'Q74.21',
       'V80.87', 'V97.67', 'M06.85', 'B05.36', 'D13.25', 'Z20.23',
       'U61.13', 'C14.36', 'Z98.86', 'X75.30', 'H51.45', 'Y51.55',
       'W40.94', 'E30.01', 'N27.23', 'E49.13', 'Z25.06', 'W45.59',
       'G95.10', 'U06.52', 'A14.01', 'R33.59', 'H47.09', 'L85.96',
       'U41.19', 'B45.03', 'H54.35', 'P29.44', 'I59.87', 'B03.27',
       'H33.06', 'C48.92', 'J12.31', 'U75.95', 'A13.39', 'N48.90',
       'Q80.97', 'C57.40', 'T57.97', 'O91.95', 'X26.32', 'Z95.40',
       'M68.79', 'S15.62', 'B42.10', 'L68.59', 'J20.78', 'H86.54',
       'C74.81', 'E11.62', 'L85.13', 'V90.72', 'N37.61', 'Q32.32',
       'W82.27', 'N22.62', 'I91.91', 'W00.33', 'B84.86', 'E86.20',
       'G10.16', 'K78.87', 'N59.44', 'Z34.94', 'L07.00', 'M31.

In [5]:
# How many rows carry each 'diagnosis' value, most frequent first.
df['diagnosis'].value_counts()

I68.27    2193157
G99.93    1386006
K32.86    1292235
Q85.91     775390
W50.87     718798
           ...   
Y08.66         24
G27.20         24
N33.46         12
X00.63         12
J65.04         12
Name: diagnosis, Length: 133, dtype: int64

In [6]:
# Distinct values found in the 'bin' column.
df['bin'].unique()

array([725700, 664344, 571569,  96934, 539437, 322463, 757349, 691847,
       160389, 956971, 718350, 756120])

In [7]:
# How many rows carry each 'bin' value, most frequent first.
df['bin'].value_counts()

664344    3855462
322463    2542376
725700    2100642
691847    1323776
757349     959441
96934      870740
571569     774925
539437     516933
956971     272906
160389     261912
718350     256611
756120     174520
Name: bin, dtype: int64

In [8]:
# Distinct values found in the 'pcn' column (unique() keeps NaN as an entry).
df['pcn'].unique()

array(['1UQC', nan, 'KB38N', 'S76J7V6', '327CKV', 'CS8580', 'MSCXSG',
       'NC7EN', 'KBOSN', '3O71UTS', 'ZQPX', 'T52GV', 'N098KI', 'DY4B',
       'AZUO5U', '3Y5ZW0', 'RB7UU', 'RM0HB', 'BIZF', 'RAM3J', 'TAZ5W',
       'WM6A', 'J5DT8', 'YFVIA', 'W1LW9Y', 'OO0E', 'BZ22Z2', 'P4LC',
       '6ZGS97C', '9D24', 'T17LNK', 'TPJD', 'REGLCC', 'YICC41', 'XH4T3',
       '7THOQ5', 'CG3ZWQ', 'IF448', 'NG4CS', '2TIC', 'MQWH09H', '393U',
       '9C5MOR3', 'YL5CMT', '9FU70', 'ULM7G', 'W7L3', 'K5KDJ7G', 'FX2Z'],
      dtype=object)

In [9]:
# How many rows carry each 'pcn' value, most frequent first.
# value_counts() leaves NaN out by default, so rows with a missing pcn are not tallied.
df['pcn'].value_counts()

327CKV     1090515
S76J7V6     777502
MSCXSG      713743
3O71UTS     690356
1UQC        676984
KB38N       594959
N098KI      400190
T52GV       364750
NC7EN       351568
CS8580      305315
YFVIA       291860
WM6A        272906
RB7UU       261912
J5DT8       256611
DY4B        238438
TPJD        226317
T17LNK      182746
7THOQ5      174520
TAZ5W       161867
RM0HB       157326
REGLCC      156307
IF448       147338
YICC41      132469
3Y5ZW0      126536
W1LW9Y      111090
KBOSN       108027
6ZGS97C     105167
2TIC        103804
BIZF        101105
9C5MOR3     100203
ZQPX         96356
9D24         93238
XH4T3        92847
9FU70        83021
AZUO5U       68067
OO0E         60095
RAM3J        47895
P4LC         46208
BZ22Z2       46125
W7L3         40076
YL5CMT       34869
NG4CS        33271
MQWH09H      33252
ULM7G        30128
393U         28893
CG3ZWQ       25218
K5KDJ7G      21030
FX2Z         20263
Name: pcn, dtype: int64

In [10]:
# Distinct values found in the 'group' column (unique() keeps NaN as an entry).
df['group'].unique()

array([nan, '52H8KH0F83K', '6BYJBW', 'ZX2QUWR', 'IOEAN1DWVV3Y', '1CAHL',
       'HO8HUGL', 'STGRDKR1J5RD', 'DGLGRYP', 'I4UYEP84W3',
       'KZWQDIHCLLHD1', 'EVD4X5', '6SP1DG', 'IGN6JL34H37D', 'RS5RB3YA',
       'SJVO3GXUURRGO', 'QK6BI1N61', '7DUPMODV0', 'T51T6V2E8L', 'L9QZA',
       'IX6P0', 'AJK5MZ25T9IA', '0OGKQ', 'HPVSQW7M8', 'ZOYKF0N5NEO',
       'O19XSLDEFB', 'VC81HUO7ZD', 'YY6B1J4E8KJ3', 'DYGBI610ZY',
       'Z01MLD4I', '7LL04USF', 'BH2Q8B3GY2GAV', 'MP3IQ', '9R3Z3QKDF3',
       'GOM8K0', 'FZPLF4O6FD', 'GQIGYFQQ2WGH', 'XK8RM5E75ZW',
       '0TZ9XYJZJH', '7Q756WMLLV25X', 'XY5GQQ9', 'S2QKZ0OFNWS6X',
       'U19J4RVCA', 'RGVK1', 'V96T9QL5', '77MAJF66DGD', 'TFZOR5R49',
       '1N5IRQ', 'OD99VAJGWV'], dtype=object)

In [11]:
# How many rows carry each 'group' value, most frequent first.
# value_counts() leaves NaN out by default, so rows with a missing group are not tallied.
df['group'].value_counts()

IOEAN1DWVV3Y     1090515
HO8HUGL          1010441
STGRDKR1J5RD      753156
DGLGRYP           713743
6BYJBW            594959
52H8KH0F83K       507920
1CAHL             443820
6SP1DG            400190
EVD4X5            364750
AJK5MZ25T9IA      291860
L9QZA             272906
RS5RB3YA          261912
IX6P0             256611
YY6B1J4E8KJ3      241607
Z01MLD4I          182746
T51T6V2E8L        161867
SJVO3GXUURRGO     157326
BH2Q8B3GY2GAV     156307
FZPLF4O6FD        147338
MP3IQ             132469
DYGBI610ZY        126720
0OGKQ             111090
I4UYEP84W3        107321
XK8RM5E75ZW       103804
QK6BI1N61         101105
S2QKZ0OFNWS6X     100203
KZWQDIHCLLHD1      96356
VC81HUO7ZD         93238
GOM8K0             92847
9R3Z3QKDF3         88372
U19J4RVCA          83021
ZX2QUWR            79042
0TZ9XYJZJH         75782
IGN6JL34H37D       68067
HPVSQW7M8          60095
RGVK1              56637
7DUPMODV0          47895
7LL04USF           46678
O19XSLDEFB         46208
ZOYKF0N5NEO        46125


In [12]:
# Distinct values found in the 'drug' column.
df['drug'].unique()

array(['tanoclolol', 'oxasoted', 'cupitelol', 'mamate', 'lalol',
       'foxivelule', 'tafistitrisin', 'prazinib', 'momudobatin',
       'gentipapavir', 'cibroniudosin', 'rulfalol', 'keglusited',
       'pucomalol', 'glycontazepelol', 'glycogane', 'cicrochoric',
       'satrade', 'tovane', 'suvinicuvir', 'semufolic', 'sorine', 'ratin',
       'hozirol', 'dienulol', 'hivir', 'diadaric', 'mule', 'tocilic',
       'gorol', 'simarol', 'vocopirin', 'cycloxasonol', 'bovirol',
       'brede', 'nusudaric', 'hidizuzunib', 'nephelilin', 'colifunene',
       'antimab', 'plazamiglutic', 'vivafastat', 'tugesutin',
       'pheromycin', 'isobrovelin', 'flacelfatastat', 'choxestamenium',
       'colade', 'dusin', 'tricatripride', 'fazipilin', 'tocrocin',
       'glulune', 'fumiluric', 'kediborin', 'ribosatharin',
       'genetramycicin', 'todiadianic', 'spifistime', 'nicotilin',
       'cuxirin', 'notin', 'gosate', 'monemodiase', 'debome',
       'lixegimoric', 'sacrode', 'prefluflomycin', 'thiostaste

In [13]:
# How many rows carry each 'drug' value, most frequent first.
df['drug'].value_counts()

prazinib        1711757
rulfalol         718503
pucomalol        556466
mule             543372
tanoclolol       513317
                 ...   
dusin              9589
sizubesin          8130
gohevitravir       4114
lehydrome          2561
nenizevir          1112
Name: drug, Length: 77, dtype: int64