In [1]:
# Human-readable description of each tab-separated column in the provider
# utilization file; the position in this list is the column index.
column_headers = [
    "Unique identifier",
    "Last name / organization",
    "First name",
    "Middle initial",
    "Credential ('', 'M.D.', 'D.O.', 'MD','DO', 'OTR', 'D.C.', ...)",
    "Gender",
    "Entity ('I' for individual, 'O' for organization)",
    "Address line 1",
    "Address line 2",
    "City",
    "Zip",
    "State",
    "Country",
    "Specialty ('Internal Medicine', 'Pathology', ...)",
    "Participates in medicare ('Y' or 'N')",
    "Place of service ('F' for facility or 'O' for other)",
    "HCPCS code (a procedure code)",
    "HCPCS code description",
    "HCPCS drug indicator ('Y' or 'N')",
    "Line service count (people, hours, miles, ...)",
    "Beneficiary unique count (number of distinct beneficiary, possibly receiving many procedures)",
    "Beneficiary day service count (number of distinct procedures, possibly on the same person)",
    "Average medicare allowed amount",
    "Standard deviation",
    "Average submitted charges amount",
    "Standard deviation",
    "Average medicare payment amount",
    "Standard deviation"
]

# Print an "index: description" legend. The single-argument print(...) form
# produces the same output under Python 2 and Python 3.
for index, header in enumerate(column_headers):
    print(str(index) + ": " + header)

0: Unique identifier
1: Last name / organization
2: First name
3: Middle initial
4: Credential ('', 'M.D.', 'D.O.', 'MD','DO', 'OTR', 'D.C.', ...)
5: Gender
6: Entity ('I' for individual, 'O' for organization)
7: Address line 1
8: Address line 2
9: City
10: Zip
11: State
12: Country
13: Specialty ('Internal Medicine', 'Pathology', ...)
14: Participates in medicare ('Y' or 'N')
15: Place of service ('F' for facility or 'O' for other)
16: HCPCS code (a procedure code)
17: HCPCS code description
18: HCPCS drug indicator ('Y' or 'N')
19: Line service count (people, hours, miles, ...)
20: Beneficiary unique count (number of distinct beneficiary, possibly receiving many procedures)
21: Beneficiary day service count (number of distinct procedures, possibly on the same person)
22: Average medicare allowed amount
23: Standard deviation
24: Average submitted charges amount
25: Standard deviation
26: Average medicare payment amount
27: Standard deviation


In [2]:
# Computing statistics, building dictionaries:

from math import floor

# Lookups built from the utilization file, all keyed by the provider's unique
# identifier (column 0, kept as the string read from the file):
#   provider_dictionary      -> columns 1..15 of a row for that provider
#   provider_total_submitted -> sum over rows of avg submitted charge * count
#   provider_total_allowed   -> sum over rows of avg allowed amount * count
provider_dictionary = dict()
provider_total_submitted = dict()
provider_total_allowed = dict()

# `with` closes the file even when a row fails to parse part-way through.
with open("provider_utilization_2013.txt", "r") as infile:
    # The first two lines are read and discarded (presumably a header row and
    # one further non-data line -- confirm against the file).
    next(infile, None)
    next(infile, None)

    for line in infile:
        # Skip whitespace-only lines (e.g. a trailing newline at end of file)
        # instead of failing on the column lookups below.
        if not line.strip():
            continue

        tokens = line.split("\t")
        provider_id = tokens[0]

        # A provider can appear on several rows (the totals below accumulate
        # across them); the descriptive columns of the last row read win.
        provider_dictionary[provider_id] = tokens[1:16]

        # NOTE(review): the averages are weighted by column 21 (beneficiary
        # day service count); confirm this is the intended weight rather than
        # column 19 (line service count).
        number_of_procedures = int(tokens[21])
        avg_allowed_charge = float(tokens[22])
        avg_submitted_charge = float(tokens[24])

        provider_total_submitted[provider_id] = (
            provider_total_submitted.get(provider_id, 0.0)
            + avg_submitted_charge * number_of_procedures
        )
        provider_total_allowed[provider_id] = (
            provider_total_allowed.get(provider_id, 0.0)
            + avg_allowed_charge * number_of_procedures
        )

# Expensiveness = total submitted charges / total allowed amount per provider.
# NOTE(review): this raises ZeroDivisionError if a provider's allowed total is
# 0; decide how such providers should be represented before guarding it.
provider_expensiveness = dict()
for provider_id in provider_total_submitted:
    provider_expensiveness[provider_id] = (
        provider_total_submitted[provider_id] / provider_total_allowed[provider_id]
    )

In [10]:
# Find latitude and longitude associated with provider address
import django
import os
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from random import random
from time import sleep
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "medissect.settings")
django.setup()
from explorer.models import Provider
from django.db import IntegrityError

# Geocode every provider that is not stored yet and save it as a Provider row.
#
# `token` is the tokens[1:16] slice kept in provider_dictionary, so token[k]
# holds file column k + 1 (token[6] is address line 1, token[8] the city,
# token[10] the state, ...).
#
# Counters:
#   count         -> providers processed so far
#   skip_count    -> already stored, or not located in the target state
#   success_count -> geocoded and saved
#   fail_count    -> geocoder timed out or returned no location
TARGET_STATE = 'MA'
# A progress line is printed whenever `count` is a multiple of this value.
# Iterations that end in `continue` below never reach the progress check, so
# with a value above 1 some multiples can pass without a line being printed.
PROGRESS_EVERY = 1

geolocator = Nominatim()
count = 0
fail_count = 0
skip_count = 0
success_count = 0
for code in provider_dictionary:
    # exists() answers "is there at least one row?" without counting them all.
    if Provider.objects.filter(npi=code).exists():
        skip_count += 1
    else:
        token = provider_dictionary[code]
        if token[10] != TARGET_STATE:
            count += 1
            skip_count += 1
            continue

        # "<address line 1>, <city>, <state>"
        address = ", ".join([token[6], token[8], token[10]])
        location = None
        # Randomised pause between geocoding requests.
        # NOTE(review): check this delay against the usage policy of the
        # geocoding service being called.
        sleep(0.1 + 0.2 * random())
        try:
            location = geolocator.geocode(address)
        except GeocoderTimedOut:
            count += 1
            fail_count += 1
            continue

        if location:
            provider = Provider(
                npi=int(code),
                last_name=token[0],
                first_name=token[1],
                middle_initial=token[2],
                credentials=token[3],
                gender=token[4],
                is_organization=(token[5] == 'O'),
                street1=token[6],
                street2=token[7],
                city=token[8],
                zipcode=token[9],
                state=token[10],
                country=token[11],
                medicare_participant=(token[13] == 'Y'),
                at_facility=(token[14] == 'F'),
                longitude=location.longitude,
                latitude=location.latitude,
                expensiveness=provider_expensiveness[code]
            )
            provider.save()
            success_count += 1
        else:
            # The geocoder returned nothing usable for this address.
            fail_count += 1
    count += 1
    if count % PROGRESS_EVERY == 0:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("count: {0:6d}, skip: {1:6d}, success: {2:6d}, fail: {3:6d}".format(count, skip_count, success_count, fail_count))
        

count:      2, skip:      1, success:      0, fail:      1
count:     10, skip:      9, success:      0, fail:      1
count:     74, skip:     73, success:      0, fail:      1
count:    156, skip:    155, success:      0, fail:      1
count:    161, skip:    160, success:      0, fail:      1
count:    274, skip:    273, success:      0, fail:      1
count:    285, skip:    284, success:      0, fail:      1
count:    299, skip:    298, success:      0, fail:      1
count:    337, skip:    336, success:      0, fail:      1
count:    377, skip:    376, success:      0, fail:      1
count:    397, skip:    396, success:      0, fail:      1
count:    405, skip:    404, success:      0, fail:      1
count:    428, skip:    427, success:      0, fail:      1
count:    437, skip:    436, success:      0, fail:      1
count:    451, skip:    450, success:      0, fail:      1
count:    453, skip:    452, success:      0, fail:      1
count:    468, skip:    467, success:      0, fail:     

KeyboardInterrupt: 

909605