# Label targets using fuzzy-wuzzy
---

## _Step 0. Load packages_

In [4]:
# import sys
# sys.path.insert(0, '/healthcare-fraud/src/')
import src.scripts as src
import pandas as pd
import numpy as np

## _Step 1. Load, clean, and prep data_
---

In [27]:
# Read the two unlabeled identifier tables via src.read_from_efs (LEIE first,
# then Part D) and print their shapes as a quick sanity check.
LEIE_id_unlabeled, PartD_id_unlabeled = (
    src.read_from_efs(csv_name)
    for csv_name in ('LEIE_id_unlabeled.csv', 'PartD_id_unlabeled.csv')
)
print(LEIE_id_unlabeled.shape, PartD_id_unlabeled.shape)

(14968, 6) (1294885, 7)


  mask |= (ar1 == a)


## Build out `full` names
> 1. start by identifying companies
> 2. then move on to people
---

In [28]:
# Rows without a first name are treated as companies; keep them in their own
# frames and renumber the index from 0.
LEIE_id_comp = (
    LEIE_id_unlabeled
    .loc[lambda frame: frame['first name'].isna()]
    .reset_index(drop=True)
)
PartD_id_comp = (
    PartD_id_unlabeled
    .loc[lambda frame: frame['first name'].isna()]
    .reset_index(drop=True)
)
print(LEIE_id_comp.shape, PartD_id_comp.shape)

(241, 6) (763, 7)


In [76]:
# Build one `full` string per company: "<last/org name> <specialty> <city>".
# Series.str.cat joins the columns element-wise in a single vectorised call
# instead of running a Python-level ' '.join for every row. For rows where all
# three parts are strings the result is identical to the row-wise join; a row
# with a missing part gets a missing `full` instead of raising TypeError and
# aborting the whole cell.
LEIE_id_comp['full'] = LEIE_id_comp['last/org name'].str.cat(
    LEIE_id_comp[['specialty', 'city']], sep=' '
)
PartD_id_comp['full'] = PartD_id_comp['last/org name'].str.cat(
    PartD_id_comp[['specialty', 'city']], sep=' '
)
print(len(LEIE_id_comp), len(PartD_id_comp))

241 763


In [49]:
# Preview the first five rows of the Part D company frame.
PartD_id_comp.head(5)

Unnamed: 0,npi,last/org name,first name,city,specialty,exclusion_method,exclusion_year,full
0,1003040478,MONA TABIB MD INC,,ENCINO,Family Practice,,,MONA TABIB MD INC Family Practice ENCINO
1,1003074600,GERIATRIC RESEARCH & CONSULTING GROUP LLC,,BEAR CREEK TOWNSHIP,Family Medicine,,,GERIATRIC RESEARCH & CONSULTING GROUP LLC Fami...
2,1003838053,AUBURN FAMILY DENTAL,,AUBURN,Dentist,,,AUBURN FAMILY DENTAL Dentist AUBURN
3,1003879941,BARRY L GETZOFF ASSOCIATES,,LANGHORNE,Internal Medicine,,,BARRY L GETZOFF ASSOCIATES Internal Medicine L...
4,1003887118,EVMS ACADEMIC PHYSICIANS AND SURGEONS HEALTH S...,,NORFOLK,Internal Medicine,,,EVMS ACADEMIC PHYSICIANS AND SURGEONS HEALTH S...


### Now, let's focus on people
> 1. build out `full` names
> 2. perform NLTK string matching. Fuzzywuzzy isn't installing.

In [83]:
# People: keep only rows where last name, first name, specialty and city are
# all present. Dropping on this subset already removes rows with a missing
# first name, so no separate first-name filter is needed.
person_fields = ['last/org name', 'first name', 'specialty', 'city']
LEIE_id_peop = LEIE_id_unlabeled.dropna(subset=person_fields).reset_index(drop=True)
PartD_id_peop = PartD_id_unlabeled.dropna(subset=person_fields).reset_index(drop=True)
print(LEIE_id_peop.shape, PartD_id_peop.shape)

(14726, 6) (1294086, 7)


In [84]:
# Sanity check: dropping rows with any missing value in the name fields should
# leave the row count of LEIE_id_peop unchanged.
LEIE_id_peop[['last/org name', 'first name', 'specialty', 'city']].dropna().shape

(14726, 4)

In [85]:
# Build one `full` string per person:
# "<last/org name> <first name> <specialty> <city>".
# Series.str.cat joins the columns element-wise in a single vectorised call,
# avoiding a Python-level ' '.join for each of the rows in these frames. For
# rows where every part is a string the output matches the row-wise join; a row
# with a missing part would get a missing `full` rather than raising TypeError.
LEIE_id_peop['full'] = LEIE_id_peop['last/org name'].str.cat(
    LEIE_id_peop[['first name', 'specialty', 'city']], sep=' '
)
PartD_id_peop['full'] = PartD_id_peop['last/org name'].str.cat(
    PartD_id_peop[['first name', 'specialty', 'city']], sep=' '
)
print(len(LEIE_id_peop), len(PartD_id_peop))

14726 1294086


In [86]:
# Preview the first five rows of the Part D people frame.
PartD_id_peop.head(5)

Unnamed: 0,npi,last/org name,first name,city,specialty,exclusion_method,exclusion_year,full
0,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,Internal Medicine,,,ENKESHAFI ARDALAN Internal Medicine CUMBERLAND
1,1003000142,KHALIL,RASHID,TOLEDO,Anesthesiology,,,KHALIL RASHID Anesthesiology TOLEDO
2,1003000167,ESCOBAR,JULIO,DAYTON,Dentist,,,ESCOBAR JULIO Dentist DAYTON
3,1003000282,BLAKEMORE,ROSIE,NASHVILLE,Nurse Practitioner,,,BLAKEMORE ROSIE Nurse Practitioner NASHVILLE
4,1003000407,GIRARDI,DAVID,PATTON,Family Practice,,,GIRARDI DAVID Family Practice PATTON


## Fuzzy wuzzy matching
---

In [89]:
from fuzzywuzzy import fuzz

ModuleNotFoundError: No module named 'fuzzywuzzy'

(14727, 6) (1294122, 7)


(14727, 6) (1294122, 7)


In [12]:
# Preview the first five rows of the unlabeled LEIE table.
LEIE_id_unlabeled.head(5)

Unnamed: 0,exclusion year,npi,last/org name,first name,city,specialty
0,2018,1922348218,184TH STREET PHARMACY CORP,,BRONX,PHARMACY
1,2018,0,A & H VITAMINS SUPPLY CORP,,TEANECK,PHARMACY
2,2017,1942476080,"A & Y MEDICAL SUPPLY, INC",,FOREST HILLS,DME - GENERAL
3,2017,1891731758,"A FAIR DEAL PHARMACY, INC",,BROOKLYN,PHARMACY
4,2017,0,"A SERVICE CAB CO, INC",,METAIRIE,TRANSPORTATION CO


Unnamed: 0,npi,last/org name,first name,city,specialty,exclusion_method,exclusion_year
594,1003040478,MONA TABIB MD INC,,ENCINO,Family Practice,,
1030,1003074600,GERIATRIC RESEARCH & CONSULTING GROUP LLC,,BEAR CREEK TOWNSHIP,Family Medicine,,
3699,1003838053,AUBURN FAMILY DENTAL,,AUBURN,Dentist,,
5132,1003879941,BARRY L GETZOFF ASSOCIATES,,LANGHORNE,Internal Medicine,,
5432,1003887118,EVMS ACADEMIC PHYSICIANS AND SURGEONS HEALTH S...,,NORFOLK,Internal Medicine,,
6812,1003932930,RAMA NANDIPATI M.D INC.,,MERCED,Internal Medicine,,
7353,1003968959,HARTMAN REHABILITATION ASSOCIATES,,LEBANON,Physical Medicine and Rehabilitation,,
7866,1003993221,ADVANCED MEDICAL CARE OF HUDSON COUNTY LLC,,UNION CITY,Internal Medicine,,
7921,1003994922,KEN M ZONIES MD LLC,,PIKESVILLE,Internal Medicine,,
8077,1013000983,USURG ASSOCIATES INC,,DARBY,Urology,,
