In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the CMS physician/supplier payment extract and list its column names.
cms_source_path = '../data/cms_physician_and_other_supplier-payments_2015-2017.csv'
cms_physician = pd.read_csv(cms_source_path)
cms_physician.columns

Index(['year', 'National Provider Identifier',
       'Last Name/Organization Name of the Provider',
       'First Name of the Provider', 'Middle Initial of the Provider',
       'Credentials of the Provider', 'Gender of the Provider',
       'Entity Type of the Provider', 'Street Address 1 of the Provider',
       'Street Address 2 of the Provider', 'City of the Provider',
       'Zip Code of the Provider', 'State Code of the Provider',
       'Country Code of the Provider', 'Provider Type',
       'Medicare Participation Indicator', 'Place of Service', 'HCPCS Code',
       'HCPCS Description', 'HCPCS Drug Indicator', 'Number of Services',
       'Number of Medicare Beneficiaries',
       'Number of Distinct Medicare Beneficiary/Per Day Services',
       'Average Medicare Allowed Amount', 'Average Submitted Charge Amount',
       'Average Medicare Payment Amount',
       'Average Medicare Standardized Amount'],
      dtype='object')

In [3]:
# Drop the provider-identity, address and payment-detail columns that are not
# kept for this analysis, then preview the remaining columns.
unused_cms_columns = [
    'First Name of the Provider',
    'Last Name/Organization Name of the Provider',
    'Middle Initial of the Provider',
    'Credentials of the Provider',
    'Gender of the Provider',
    'Entity Type of the Provider',
    'Street Address 1 of the Provider',
    'Street Address 2 of the Provider',
    'State Code of the Provider',
    'Country Code of the Provider',
    'Medicare Participation Indicator',
    'Place of Service',
    'HCPCS Drug Indicator',
    'Number of Distinct Medicare Beneficiary/Per Day Services',
    'Average Medicare Allowed Amount',
    'Average Medicare Standardized Amount',
]
cms_physician = cms_physician.drop(columns=unused_cms_columns)
cms_physician.head()

Unnamed: 0,year,National Provider Identifier,City of the Provider,Zip Code of the Provider,Provider Type,HCPCS Code,HCPCS Description,Number of Services,Number of Medicare Beneficiaries,Average Submitted Charge Amount,Average Medicare Payment Amount
0,2017,1033112180,CHATTANOOGA,374211688,Maxillofacial Surgery,70355,X-ray imaging of teeth,25,25,104.4,11.348
1,2017,1043302466,NASHVILLE,372320001,Diagnostic Radiology,70355,X-ray imaging of teeth,27,26,80.0,7.824074
2,2017,1104009646,KNOXVILLE,379201502,Maxillofacial Surgery,21248,Partial reconstruction of lower jaw or cheek b...,30,19,1970.0,662.898333
3,2017,1104009646,KNOXVILLE,379201502,Maxillofacial Surgery,70355,X-ray imaging of teeth,60,60,85.833333,13.701667
4,2017,1104031053,NASHVILLE,372328605,Otolaryngology,21461,Open treatment of broken jaw bone,11,11,3614.0,348.31


In [4]:
# Rename the remaining CMS columns to short lowercase names.
# Each source column is listed exactly once (the original dict repeated the
# 'City of the Provider' key), and 'State Code of the Provider' is omitted
# because that column has already been removed by the drop-columns cell above.
# NOTE(review): 'avg_medicare payment' contains a space (probably meant
# 'avg_medicare_payment'); the spelling is kept unchanged because the later
# drop cell and the exported CSV header both use this exact name.
cms_column_names = {
    'National Provider Identifier': 'provider_id',
    'City of the Provider': 'city',
    'Zip Code of the Provider': 'postal_code',
    'Provider Type': 'treatment_type',
    'HCPCS Code': 'hcpcs_code',
    'HCPCS Description': 'description',
    'Number of Services': 'services_performed',
    'Number of Medicare Beneficiaries': 'beneficiaries',
    'Average Submitted Charge Amount': 'avg_charge_submitted',
    'Average Medicare Payment Amount': 'avg_medicare payment',
}
cms_physician = cms_physician.rename(columns=cms_column_names)
cms_physician.head()

Unnamed: 0,year,provider_id,city,postal_code,treatment_type,hcpcs_code,description,services_performed,beneficiaries,avg_charge_submitted,avg_medicare payment
0,2017,1033112180,CHATTANOOGA,374211688,Maxillofacial Surgery,70355,X-ray imaging of teeth,25,25,104.4,11.348
1,2017,1043302466,NASHVILLE,372320001,Diagnostic Radiology,70355,X-ray imaging of teeth,27,26,80.0,7.824074
2,2017,1104009646,KNOXVILLE,379201502,Maxillofacial Surgery,21248,Partial reconstruction of lower jaw or cheek b...,30,19,1970.0,662.898333
3,2017,1104009646,KNOXVILLE,379201502,Maxillofacial Surgery,70355,X-ray imaging of teeth,60,60,85.833333,13.701667
4,2017,1104031053,NASHVILLE,372328605,Otolaryngology,21461,Open treatment of broken jaw bone,11,11,3614.0,348.31


In [5]:
# Save the renamed CMS table one directory up, without writing the row index.
cms_physician.to_csv(
    '../cms_physician.csv',
    index=False,
)

In [6]:
# Print the dtype and non-null count of every column in the renamed CMS table.
cms_physician.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  95 non-null     int64  
 1   provider_id           95 non-null     int64  
 2   city                  95 non-null     object 
 3   postal_code           95 non-null     int64  
 4   treatment_type        95 non-null     object 
 5   hcpcs_code            95 non-null     int64  
 6   description           95 non-null     object 
 7   services_performed    95 non-null     int64  
 8   beneficiaries         95 non-null     int64  
 9   avg_charge_submitted  95 non-null     float64
 10  avg_medicare payment  95 non-null     float64
dtypes: float64(2), int64(6), object(3)
memory usage: 8.3+ KB


### Sum the number of visits by ZIP code for each year

In [7]:
# Reduce ZIP+4 values to the 5-digit ZIP code.
# Only values longer than 5 characters lose their last 4 digits, so the cell
# can be re-run safely (an unconditional [:-4] slice trims 4 more characters on
# every run) and plain 5-digit ZIPs are left intact. zfill(5) restores leading
# zeros that are lost when the column is parsed as an integer.
postal_text = cms_physician['postal_code'].astype(str)
is_plain_zip = postal_text.str.len() <= 5
cms_physician['postal_code'] = (
    postal_text
    .where(is_plain_zip, postal_text.str[:-4])
    .str.zfill(5)
)
cms_physician.head()

Unnamed: 0,year,provider_id,city,postal_code,treatment_type,hcpcs_code,description,services_performed,beneficiaries,avg_charge_submitted,avg_medicare payment
0,2017,1033112180,CHATTANOOGA,37421,Maxillofacial Surgery,70355,X-ray imaging of teeth,25,25,104.4,11.348
1,2017,1043302466,NASHVILLE,37232,Diagnostic Radiology,70355,X-ray imaging of teeth,27,26,80.0,7.824074
2,2017,1104009646,KNOXVILLE,37920,Maxillofacial Surgery,21248,Partial reconstruction of lower jaw or cheek b...,30,19,1970.0,662.898333
3,2017,1104009646,KNOXVILLE,37920,Maxillofacial Surgery,70355,X-ray imaging of teeth,60,60,85.833333,13.701667
4,2017,1104031053,NASHVILLE,37232,Otolaryngology,21461,Open treatment of broken jaw bone,11,11,3614.0,348.31


In [8]:
# Use the same visit-count column name as the smile_on tables.
visit_column_map = {'services_performed': 'number_of_visits'}
cms_physician = cms_physician.rename(columns=visit_column_map)

In [9]:
# Remove the provider/procedure/payment detail columns so that only year,
# postal_code and number_of_visits remain.
cms_detail_columns = [
    'provider_id',
    'city',
    'treatment_type',
    'hcpcs_code',
    'description',
    'beneficiaries',
    'avg_charge_submitted',
    'avg_medicare payment',
]
cms_physician = cms_physician.drop(columns=cms_detail_columns)

### create 2017 dataframe

In [10]:
# Keep only the rows reported for 2017.
is_2017 = cms_physician['year'] == 2017
cms_physician_2017 = cms_physician.loc[is_2017]
cms_physician_2017.head()

Unnamed: 0,year,postal_code,number_of_visits
0,2017,37421,25
1,2017,37232,27
2,2017,37920,30
3,2017,37920,60
4,2017,37232,11


In [11]:
# Print the dtype and non-null count of every column in the 2017 subset.
cms_physician_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33 entries, 0 to 32
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              33 non-null     int64 
 1   postal_code       33 non-null     object
 2   number_of_visits  33 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.0+ KB


In [12]:
# Total number of 2017 visits per ZIP code, one row per ZIP.
cms_2017_grouped = (
    cms_physician_2017
    .groupby('postal_code')[['number_of_visits']]
    .sum()
    .reset_index()
)
cms_2017_grouped

Unnamed: 0,postal_code,number_of_visits
0,37203,77
1,37232,305
2,37343,72
3,37421,87
4,37909,14
5,37920,372
6,37923,17
7,38132,34


### create 2016 dataframe

In [13]:
# Keep only the rows reported for 2016.
is_2016 = cms_physician['year'] == 2016
cms_physician_2016 = cms_physician.loc[is_2016]
cms_physician_2016

Unnamed: 0,year,postal_code,number_of_visits
33,2016,37421,14
34,2016,37232,26
35,2016,37920,34
36,2016,37920,14
37,2016,37343,60
38,2016,37204,13
39,2016,38501,71
40,2016,37416,24
41,2016,37920,53
42,2016,37211,105


In [14]:
# Save the 2016 subset one directory up, without writing the row index.
cms_physician_2016.to_csv(
    '../cms_physician_2016.csv',
    index=False,
)

In [15]:
# Total number of 2016 visits per ZIP code, one row per ZIP.
cms_2016_grouped = (
    cms_physician_2016
    .groupby('postal_code')[['number_of_visits']]
    .sum()
    .reset_index()
)
cms_2016_grouped

Unnamed: 0,postal_code,number_of_visits
0,37203,101
1,37204,13
2,37211,105
3,37232,318
4,37343,79
5,37416,24
6,37421,86
7,37920,378
8,38501,71


### Merge the 2016 and 2017 CMS dataframes and calculate the average number of visits per year by ZIP code

In [16]:
# Mean yearly CMS visits per ZIP over 2016 and 2017: add the two yearly totals
# (a ZIP absent from one year contributes 0 for that year) and halve the sum.
visits_2016 = cms_2016_grouped.set_index('postal_code')
visits_2017 = cms_2017_grouped.set_index('postal_code')
two_year_total = visits_2016.add(visits_2017, fill_value=0)
cms_average = (
    (two_year_total * .5)
    .reset_index()
    .rename(columns={'number_of_visits': 'visits_per_year'})
)
cms_average

Unnamed: 0,postal_code,visits_per_year
0,37203,89.0
1,37204,6.5
2,37211,52.5
3,37232,311.5
4,37343,75.5
5,37416,12.0
6,37421,86.5
7,37909,7.0
8,37920,375.0
9,37923,8.5


### import smile on data

In [17]:
# Load the Smile On export; the column names are taken from row index 2
# (header=2) and the file is decoded as ISO-8859-1.
smile_on = pd.read_csv(
    '../data/smile_on.csv',
    encoding="ISO-8859-1",
    low_memory=False,
    header=2,
)
smile_on.head()

Unnamed: 0,CallReportNum,ReportVersion,CallDateAndTimeStart,CallDateAndTimeEnd,CallerNum,PhoneWorkerNum,CityName,CountyName,StateProvince,PostalCode,...,SMILE ON 60+ Screening - Last Screening Date,SMILE ON 60+ Base-line Oral and Conditions Questions - Are you limited in what you can eat?,SMILE ON 60+ Base-line Oral and Conditions Questions - Dental Clinic,SMILE ON 60+ Oral Care Encounter - What care was provided to the enrollee? (Select all that apply),SMILE ON 60+ Site Information - Enroller ID,SMILE ON 60+ Site Information - Enroller Location,SMILE ON 60+ Treatment Plan - Is the treatment plan completed?,SMILE ON 60+ Treatment Plan - Was a treatment plan developed?,"SMILE ON 60+ Treatment Plan - can chew and ""social six"" esthetics",SMILE ON 60+ Verification of Eligibility - Transportation Arranged
0,81381827,SMILE ON 60+ Registration,11/25/2020 16:46,11/25/2020 16:48,3859874,136529,Springfield,Robertson,TN,37172.0,...,,No,Matthew Walker Comprehensive Health Center Inc...,,Matthew Walker Comprehensive Health Center Inc...,Provider Site,,,,
1,81379177,SMILE ON 60+ Registration,11/25/2020 15:40,11/25/2020 15:55,3859755,82341,Nashville,Davidson,TN,37212.0,...,,No,Interfaith Dental Clinic - Nashville,,,Health Assist Office,,,,
2,81377435,SMILE ON 60+ Clinical,11/25/2020 15:02,11/25/2020 15:02,2518680,119506,Rogersville,Hawkins,TN,37857.0,...,,No,,5650 - Add tooth to existing partial denture - 30,Friends in Need Health Center - Kingsport,Provider Site,Yes,Yes,Yes,
3,81375215,SMILE ON 60+ Clinical,11/25/2020 14:17,11/25/2020 14:17,3231923,143703,Memphis,Shelby,TN,38104.0,...,,Yes,,No Code - Denture step Mandibular: may be used...,Christ Community Health Services- Frayser,Provider Site,,,,
4,81373934,SMILE ON 60+ Clinical,11/25/2020 13:50,11/25/2020 13:58,-1,135611,,,TN,,...,11/25/2020,,,,Neighborhood Health - Lebanon,,,Yes,No,


In [18]:
# Drop the call-metadata and questionnaire columns that are not used below,
# then preview what is left.
irrelevant_smile_columns = [
    'CallReportNum',
    'ReportVersion',
    'CallDateAndTimeEnd',
    'CallerNum',
    'PhoneWorkerNum',
    'CityName',
    'CountyName',
    'StateProvince',
    'EnteredOn',
    'SMILE ON 60+ Screening - Last Screening Date',
    'SMILE ON 60+ Base-line Oral and Conditions Questions - Are you limited in what you can eat?',
    'SMILE ON 60+ Base-line Oral and Conditions Questions - Dental Clinic',
    'SMILE ON 60+ Oral Care Encounter - What care was provided to the enrollee? (Select all that apply)',
    'SMILE ON 60+ Site Information - Enroller ID',
    'SMILE ON 60+ Site Information - Enroller Location',
    'SMILE ON 60+ Treatment Plan - Is the treatment plan completed?',
    'SMILE ON 60+ Treatment Plan - Was a treatment plan developed?',
    'SMILE ON 60+ Treatment Plan - can chew and "social six" esthetics',
    'SMILE ON 60+ Verification of Eligibility - Transportation Arranged',
]
smile_on = smile_on.drop(columns=irrelevant_smile_columns)
smile_on.head()

Unnamed: 0,CallDateAndTimeStart,PostalCode
0,11/25/2020 16:46,37172.0
1,11/25/2020 15:40,37212.0
2,11/25/2020 15:02,37857.0
3,11/25/2020 14:17,38104.0
4,11/25/2020 13:50,


In [19]:
# Derive the calendar year of each call from its start timestamp.
call_start_times = pd.DatetimeIndex(smile_on['CallDateAndTimeStart'])
smile_on['year'] = call_start_times.year
smile_on.head()

Unnamed: 0,CallDateAndTimeStart,PostalCode,year
0,11/25/2020 16:46,37172.0,2020
1,11/25/2020 15:40,37212.0,2020
2,11/25/2020 15:02,37857.0,2020
3,11/25/2020 14:17,38104.0,2020
4,11/25/2020 13:50,,2020


In [20]:
# Cut the trailing '.0' that the float representation of the ZIP code carries.
# Missing values stringify to 'nan' and the same slice shortens them to 'n'.
# NOTE(review): the slice is unconditional, so re-running this cell removes
# two more characters from every value.
postal_as_text = smile_on['PostalCode'].astype(str)
smile_on['PostalCode'] = postal_as_text.str.slice(stop=-2)
smile_on.head()

Unnamed: 0,CallDateAndTimeStart,PostalCode,year
0,11/25/2020 16:46,37172,2020
1,11/25/2020 15:40,37212,2020
2,11/25/2020 15:02,37857,2020
3,11/25/2020 14:17,38104,2020
4,11/25/2020 13:50,n,2020


In [21]:
# Remove the raw timestamp column and rename the ZIP column to postal_code.
smile_on = (
    smile_on
    .drop(columns=['CallDateAndTimeStart'])
    .rename(columns={'PostalCode': 'postal_code'})
)
smile_on.head()

Unnamed: 0,postal_code,year
0,37172,2020
1,37212,2020
2,37857,2020
3,38104,2020
4,n,2020


In [22]:
# Remove rows that have no usable ZIP code.
# 'n' is what the '.0'-stripping slice leaves of a stringified NaN. 'nan', ''
# and real nulls are excluded as well so the filter does not depend on that
# truncation artifact; 'NaN' is retained from the original marker list.
missing_zip_markers = ['n', 'NaN', 'nan', '']
has_zip = smile_on['postal_code'].notna() & ~smile_on['postal_code'].isin(missing_zip_markers)
smile_on = smile_on.loc[has_zip]
smile_on.head()

Unnamed: 0,postal_code,year
0,37172,2020
1,37212,2020
2,37857,2020
3,38104,2020
5,37664,2020


### For year 2018

In [23]:
# Rows for 2018; reset_index() moves the old row labels into an 'index' column.
is_2018 = smile_on['year'] == 2018
smile_on_18 = smile_on.loc[is_2018].reset_index()
smile_on_18.head()

Unnamed: 0,index,postal_code,year
0,35419,37203,2018
1,35420,37617,2018
2,35422,37207,2018
3,35423,37207,2018
4,35424,38462,2018


In [24]:
# Remove the leftover 'index' column. errors='ignore' lets this cell be re-run
# without raising a KeyError once the column is already gone.
smile_on_18 = smile_on_18.drop(columns=['index'], errors='ignore')

In [25]:
# Number of 2018 rows per ZIP code.
zip_codes_18 = smile_on_18['postal_code']
smile_visits_18 = zip_codes_18.value_counts()
smile_visits_18

37130    190
37167    145
37129    144
37323    143
37211    136
        ... 
37385      1
38553      1
38583      1
37406      1
37813      1
Name: postal_code, Length: 237, dtype: int64

In [26]:
# Convert the counts Series into a two-column DataFrame (ZIP code, count).
counts_frame_18 = smile_visits_18.to_frame()
smile_visits_18 = counts_frame_18.reset_index()
smile_visits_18.head()

Unnamed: 0,index,postal_code
0,37130,190
1,37167,145
2,37129,144
3,37323,143
4,37211,136


In [32]:
# Name the two columns by position: first the ZIP code, then its count.
# value_counts().to_frame().reset_index() labels them ('index', 'postal_code')
# on pandas < 2.0 but ('postal_code', 'count') on pandas >= 2.0, so a
# label-based rename mislabels them on newer pandas and produces duplicate
# column names when the cell is re-run. Positional assignment avoids both.
smile_visits_18.columns = ['postal_code', 'number_of_visits']
smile_visits_18.head()

Unnamed: 0,postal_code,number_of_visits
0,37130,190
1,37167,145
2,37129,144
3,37323,143
4,37211,136


### For year 2019

In [31]:
# Rows for 2019, renumbered from 0.
# reset_index(drop=True) discards the old row labels directly instead of first
# turning them into an 'index' column that then has to be dropped.
is_2019 = smile_on['year'] == 2019
smile_on_19 = smile_on.loc[is_2019].reset_index(drop=True)
smile_on_19.head()

Unnamed: 0,postal_code,year
0,38116,2019
1,38301,2019
2,38127,2019
3,38116,2019
4,38116,2019


In [28]:
# Number of 2019 rows per ZIP code; preview the first few counts.
zip_codes_19 = smile_on_19['postal_code']
smile_visits_19 = zip_codes_19.value_counts()
smile_visits_19.head()

37013    508
37130    486
37207    470
37211    403
37129    363
Name: postal_code, dtype: int64

In [29]:
# Convert the counts Series into a two-column DataFrame (ZIP code, count).
counts_frame_19 = smile_visits_19.to_frame()
smile_visits_19 = counts_frame_19.reset_index()
smile_visits_19.head()

Unnamed: 0,index,postal_code
0,37013,508
1,37130,486
2,37207,470
3,37211,403
4,37129,363


In [33]:
# Name the two columns by position: first the ZIP code, then its count.
# value_counts().to_frame().reset_index() labels them ('index', 'postal_code')
# on pandas < 2.0 but ('postal_code', 'count') on pandas >= 2.0, so a
# label-based rename mislabels them on newer pandas and produces duplicate
# column names when the cell is re-run. Positional assignment avoids both.
smile_visits_19.columns = ['postal_code', 'number_of_visits']
smile_visits_19.head()

Unnamed: 0,postal_code,number_of_visits
0,37013,508
1,37130,486
2,37207,470
3,37211,403
4,37129,363


### Merge the 2018 and 2019 Smile On dataframes and calculate the average number of visits per year by ZIP code

In [34]:
# Mean yearly Smile On visits per ZIP over 2018 and 2019: add the two yearly
# counts (a ZIP absent from one year contributes 0) and halve the sum.
counts_18 = smile_visits_18.set_index('postal_code')
counts_19 = smile_visits_19.set_index('postal_code')
smile_average = (
    counts_18.add(counts_19, fill_value=0)
    .mul(.5)
    .reset_index()
    .rename(columns={'number_of_visits': 'visits_per_year'})
)
smile_average

Unnamed: 0,postal_code,visits_per_year
0,24201,1.5
1,24202,4.0
2,24245,0.5
3,24248,1.0
4,24301,0.5
...,...,...
445,38580,1.0
446,38581,3.0
447,38583,6.0
448,38585,8.0


In [None]:
# Average the 2018 and 2019 visit counts per ZIP code.
# The per-ZIP count tables (smile_visits_18 / smile_visits_19) are used here.
# The row-level smile_on_18 / smile_on_19 frames only hold postal_code and
# year, with one row per record, so they have no number_of_visits column to
# average and would overwrite smile_average with a meaningless result.
yearly_counts_18 = smile_visits_18.set_index('postal_code')
yearly_counts_19 = smile_visits_19.set_index('postal_code')
smile_average = (yearly_counts_18.add(yearly_counts_19, fill_value=0) * .5).reset_index()
smile_average = smile_average.rename(columns={'number_of_visits': 'visits_per_year'})
smile_average