In [1]:
import pandas as pd
import pickle

#### [The Medicare provider utilization and payment data](https://data.cms.gov/Medicare-Physician-Supplier/Medicare-Provider-Utilization-and-Payment-Data-Phy/fs4p-t5eq/data) is a largish file (a little over 2 GB) with 1,242,756 rows
Depending on your computer's resources you can:
 - read it in, make it smaller, and pickle it for faster loading next time
     - remove unwanted columns
     - convert columns of datatype *object* to datetimes or numeric types where possible
 - read it in in chunks

#### Let's grab a single chunk of 1000 records and look at it

In [2]:
%%time

# Grab only the first 1000-row chunk of the large CSV for a quick look.
# `chunksize` makes read_csv return a lazy reader, so nothing beyond the
# first chunk is parsed. The `with` block closes the file handle as soon as
# we have what we need (DataFrame.append, used previously, no longer exists
# in pandas >= 2.0).
with pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv',
                 chunksize=1000) as reader:
    # Fall back to an empty frame if the reader yields no chunks at all.
    payment_df = next(reader, pd.DataFrame())

payment_df.head()

CPU times: user 21.3 ms, sys: 3.86 ms, total: 25.2 ms
Wall time: 25 ms


Unnamed: 0,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Middle Initial of the Provider,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,Street Address 2 of the Provider,City of the Provider,...,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,,CUMBERLAND,...,99217,Hospital observation care discharge,N,100.0,96,100,73.3988,325.78,56.8272,57.4924
1,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,,CUMBERLAND,...,99218,Hospital observation care typically 30 minutes,N,26.0,25,26,100.08,449.0,78.46,79.306154
2,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,,CUMBERLAND,...,99219,Hospital observation care typically 50 minutes,N,52.0,51,52,136.38,614.0,102.807692,103.895385
3,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,,CUMBERLAND,...,99220,Hospital observation care typically 70 minutes...,N,59.0,59,59,190.363729,755.932203,141.293559,142.865763
4,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,,CUMBERLAND,...,99221,"Initial hospital inpatient care, typically 30 ...",N,16.0,16,16,101.68,462.8125,79.71,80.75


In [3]:
# Summarize the sampled chunk: column names, non-null counts and dtypes.
payment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   National Provider Identifier                              1000 non-null   int64  
 1   Last Name/Organization Name of the Provider               1000 non-null   object 
 2   First Name of the Provider                                976 non-null    object 
 3   Middle Initial of the Provider                            658 non-null    object 
 4   Credentials of the Provider                               936 non-null    object 
 5   Gender of the Provider                                    976 non-null    object 
 6   Entity Type of the Provider                               1000 non-null   object 
 7   Street Address 1 of the Provider                          1000 non-null   object 
 8   Street Address 2 of

#### What if you only want to gather the observations where HCPCS = 99213?

In [3]:
%%time

# Stream the CSV in 1000-row chunks and keep only the rows billed under
# HCPCS code 99213, so the full file never has to fit in memory at once.
HCPCS_rows = []
for chunk in pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv',
                         chunksize=1000,
                         # Dtypes are inferred per chunk: a chunk whose codes are
                         # all numeric would be parsed as int64 and never equal
                         # the string '99213'. Force the column to str so the
                         # comparison behaves the same in every chunk.
                         dtype={'HCPCS Code': str}):
    HCPCS_rows.append(chunk[chunk['HCPCS Code'] == '99213'])

# Concatenate once at the end (cheaper than growing a frame inside the loop)
# and renumber the rows 0..n-1.
hcpcs_pay_99213 = pd.concat(HCPCS_rows, ignore_index=True)

CPU times: user 1min 40s, sys: 14.8 s, total: 1min 55s
Wall time: 1min 56s


In [5]:
# Print the (rows, columns) shape of the filtered frame, then display its first rows.
print(hcpcs_pay_99213.shape)
hcpcs_pay_99213.head()

(457843, 26)


Unnamed: 0,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Middle Initial of the Provider,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,Street Address 2 of the Provider,City of the Provider,...,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,1003000142,KHALIL,RASHID,,M.D.,M,I,4126 N HOLLAND SYLVANIA RD,SUITE 220,TOLEDO,...,99213,Established patient office or other outpatient...,N,129.0,89,129,70.93,109.0,50.868062,53.268372
1,1003000423,VELOTTA,JENNIFER,A,M.D.,F,I,11100 EUCLID AVE,,CLEVELAND,...,99213,Established patient office or other outpatient...,N,21.0,16,21,70.93,90.0,49.309524,52.72
2,1003000480,ROTHCHILD,KEVIN,B,MD,M,I,12605 E 16TH AVE,,AURORA,...,99213,Established patient office or other outpatient...,N,25.0,23,25,52.06,200.6,34.8768,36.0628
3,1003000522,WEIGAND,FREDERICK,J,MD,M,I,1565 SAXON BLVD,SUITE 102,DELTONA,...,99213,Established patient office or other outpatient...,N,360.0,210,360,73.17,219.0,48.301222,49.178417
4,1003000530,SEMONCHE,AMANDA,M,DO,F,I,1021 PARK AVE,SUITE 203,QUAKERTOWN,...,99213,Established patient office or other outpatient...,N,112.0,96,112,78.47,120.0,50.076607,47.592679


#### Let's keep the following columns:   
- National Provider Identifier  --                              1000 non-null int64
- Last Name/Organization Name of the Provider --                1000 non-null object
- First Name of the Provider  --                                976 non-null object
- Credentials of the Provider  --                               936 non-null object
- Entity Type of the Provider  --                               1000 non-null object
- Street Address 1 of the Provider --                           1000 non-null object
- City of the Provider --                                       1000 non-null object
- Zip Code of the Provider --                                   1000 non-null int64
- State Code of the Provider --                                 1000 non-null object
- Country Code of the Provider --                              1000 non-null object
- Provider Type --                                              1000 non-null object
- Medicare Participation Indicator --                           1000 non-null object
- Place of Service --                                           1000 non-null object
- HCPCS Code --                                                 1000 non-null object
- HCPCS Description --                                          1000 non-null object
- HCPCS Drug Indicator --                                       1000 non-null object
- Number of Services --                                         1000 non-null float64
- Number of Medicare Beneficiaries --                           1000 non-null int64
- Number of Distinct Medicare Beneficiary/Per Day Services --   1000 non-null int64
- Average Medicare Allowed Amount --                            1000 non-null float64
- Average Submitted Charge Amount --                            1000 non-null float64
- Average Medicare Payment Amount --                            1000 non-null float64
- Average Medicare Standardized Amount --                       1000 non-null float64

#### Is there anything we can convert from an object? Columns we can drop?
 - Nothing convertible, but we'll drop 3 columns

In [6]:
# Drop the three columns we do not need for the analysis.
# errors="ignore" keeps this cell safe to re-run: the result is assigned back
# to the same name, so on a second run the columns are already gone and a
# plain drop would raise KeyError.
hcpcs_pay_99213 = hcpcs_pay_99213.drop(columns=['Middle Initial of the Provider',
                                                'Gender of the Provider',
                                                'Street Address 2 of the Provider'],
                                       errors="ignore")


In [8]:
# Save the trimmed 99213 frame as a pickle so later sessions can reload it
# instead of re-scanning the CSV.
# NOTE(review): this writes to ../class_data rather than ../data — presumably that directory already exists; confirm.
hcpcs_pay_99213.to_pickle("../class_data/hcpcs_pay_99213.pkl")

In [11]:
%%time

# Reload the pickled 99213 frame written earlier in this notebook.
# Only unpickle files you produced yourself: loading a pickle can execute arbitrary code.
outpatient_docs_df = pd.read_pickle("../class_data/hcpcs_pay_99213.pkl")

CPU times: user 441 ms, sys: 146 ms, total: 588 ms
Wall time: 589 ms


In [12]:
# Check that the reloaded frame has the expected shape, then preview it.
# The frame loaded from the pickle is named `outpatient_docs_df`; the previous
# `outpatient_docs` name was never assigned and raised NameError on a fresh kernel.
print(outpatient_docs_df.shape)
outpatient_docs_df.head()

(457843, 23)


Unnamed: 0,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Credentials of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,City of the Provider,Zip Code of the Provider,State Code of the Provider,Country Code of the Provider,...,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,1003000142,KHALIL,RASHID,M.D.,I,4126 N HOLLAND SYLVANIA RD,TOLEDO,436233536,OH,US,...,99213,Established patient office or other outpatient...,N,129.0,89,129,70.93,109.0,50.868062,53.268372
1,1003000423,VELOTTA,JENNIFER,M.D.,I,11100 EUCLID AVE,CLEVELAND,441061716,OH,US,...,99213,Established patient office or other outpatient...,N,21.0,16,21,70.93,90.0,49.309524,52.72
2,1003000480,ROTHCHILD,KEVIN,MD,I,12605 E 16TH AVE,AURORA,800452545,CO,US,...,99213,Established patient office or other outpatient...,N,25.0,23,25,52.06,200.6,34.8768,36.0628
3,1003000522,WEIGAND,FREDERICK,MD,I,1565 SAXON BLVD,DELTONA,327255876,FL,US,...,99213,Established patient office or other outpatient...,N,360.0,210,360,73.17,219.0,48.301222,49.178417
4,1003000530,SEMONCHE,AMANDA,DO,I,1021 PARK AVE,QUAKERTOWN,18951,PA,US,...,99213,Established patient office or other outpatient...,N,112.0,96,112,78.47,120.0,50.076607,47.592679
