In [1]:
import pandas as pd
import numpy as np
import sqlite3

### Data source
CDC: https://data.cdc.gov/Flu-Vaccinations/Vaccines-gov-Flu-vaccinating-provider-locations/bugr-bbfr

### Information
The Vaccines.gov dataset includes provider information for flu vaccine provider locations in the U.S. Vaccines.gov is powered by VaccineFinder.

### Suggestions on building functions
1. Browsing: 1) input `number` **(not that meaningful?)**
                
                ----> number of rows in the dataset 
2. Searching: input `state+city` 

                    ---> find flu vaccine provider in the same city(name, location street, open hours, other information)
               
3. Searching (auto-completion): input `part of provider's name` (loc_name) 
                                
                                ----> whole provider's name + relevant information(location, open hours, other information)

## EDA

In [2]:
vaccines_data = pd.read_csv("/Users/liuxiaoquan/Documents/GitHub/Biostats821_Final_Project/Vaccines.gov__Flu_vaccinating_provider_locations.csv", low_memory=False)
vaccines_data.head()


Unnamed: 0,provider_location_guid,loc_store_no,loc_phone,loc_name,loc_admin_street1,loc_admin_street2,loc_admin_city,loc_admin_state,loc_admin_zip,sunday_hours,...,insurance_accepted,walkins_accepted,provider_notes,searchable_name,in_stock,supply_level,quantity_last_updated,latitude,longitude,category
0,70d24f31-f2c4-4e08-b1b6-afbbd83947a2,Not applicable,(580) 924-4285,BRYAN CHD - DURANT,1524 W. Chuckwa DR,,Durant,OK,74701,,...,True,True,,Flu Shot,True,-1,2022-10-25,34.008567,-96.382197,seasonal
1,9cd97dd4-2196-4585-a1e5-c1ed72e94510,Not applicable,(304) 682-0444,Renegade Pharmacy Inc,18 Logan Street,,Oceana,WV,24870,Closed,...,True,True,Covid 19 vaccinations given Mon.-Wed.-Fri betw...,"Flu Shot (65+, high-dose or adjuvanted)",False,3,2022-12-23,37.692379,-81.634361,seasonal
2,328f435a-3fe8-4262-bd00-17eab759ced2,Not applicable,(931) 647-6561,ST. BETHLEHEM DRUGS,800 WEATHERLY DR SUITE 101A,,CLARKSVILLE,TN,37043,Closed,...,True,True,Please call to schedule an appointment ONLY if...,Flu Shot (Egg free),False,-1,2022-12-08,36.574125,-87.285068,seasonal
3,a499a795-2e5c-4f7f-a3a2-e3042bd0e813,Not applicable,215-735-5600,Center City Pediatrics,1740 South Street,1740 South St Suite 200,Philadelphia,PA,19146,,...,,,,Flu Shot,True,4,2022-12-19,39.944177,-75.171724,seasonal
4,eaf46825-8bba-4f27-ba26-7f9fb787030d,MS1005662,(718) 433-9800,Vida Sana Pharmacy #MS1005662,8820 37th ave,,Jackson Heights,NY,11372-7737,Closed,...,True,True,,"Flu Shot (65+, high-dose or adjuvanted)",True,4,2023-01-02,40.75021,-73.878084,seasonal


In [3]:
vaccines_data.shape

(245118, 28)

In [4]:
#count the number of state+city and make a new dataframe
vaccines_data['state_city'] = vaccines_data['loc_admin_state'] +' '+ vaccines_data['loc_admin_city']
vaccines_data['state_city'].value_counts().to_frame()

Unnamed: 0,state_city
TX HOUSTON,1924
IL CHICAGO,1514
TX SAN ANTONIO,1429
NY NEW YORK,1237
FL MIAMI,1101
...,...
UT Castle Dale,1
OK Prague,1
NY Bay Shore,1
AL GEORGIANA,1


In [5]:
# count the number of searchable_name
vaccines_data['searchable_name'].value_counts().to_frame()

Unnamed: 0,searchable_name
Flu Shot,116706
Flu Shot (Egg free),63137
"Flu Shot (65+, high-dose or adjuvanted)",58364
Flu Nasal Spray,6911


In [6]:
# vaccinate provider in Durham, NC
VAC = vaccines_data.loc[vaccines_data['loc_admin_city'] == 'Durham',:]
VAC.head()

Unnamed: 0,provider_location_guid,loc_store_no,loc_phone,loc_name,loc_admin_street1,loc_admin_street2,loc_admin_city,loc_admin_state,loc_admin_zip,sunday_hours,...,walkins_accepted,provider_notes,searchable_name,in_stock,supply_level,quantity_last_updated,latitude,longitude,category,state_city
4108,2dd9f103-5118-42e1-986c-21229835853d,10-4831,9194898160,Sams Club #10-4831,4005 Durham Chapel Hill Blvd,,Durham,NC,27707-2516,,...,True,,Flu Nasal Spray,False,-1,2023-03-28,35.967432,-78.958222,seasonal,NC Durham
4985,824308c3-7468-4c8b-90c8-c0a57f674454,1817,(919) 598-0803,Publix Super Markets Inc. #1817,1065 Yunus Road,,Durham,NC,27703-7200,11:00AM - 6:00PM,...,True,,Flu Shot,False,-1,2023-04-06,35.940518,-78.85366,seasonal,NC Durham
4986,824308c3-7468-4c8b-90c8-c0a57f674454,1817,(919) 598-0803,Publix Super Markets Inc. #1817,1065 Yunus Road,,Durham,NC,27703-7200,11:00AM - 6:00PM,...,True,,Flu Shot,False,-1,2023-04-06,35.940518,-78.85366,seasonal,NC Durham
21086,824308c3-7468-4c8b-90c8-c0a57f674454,1817,(919) 598-0803,Publix Super Markets Inc. #1817,1065 Yunus Road,,Durham,NC,27703-7200,11:00AM - 6:00PM,...,True,,Flu Shot (Egg free),False,-1,2023-04-06,35.940518,-78.85366,seasonal,NC Durham
50223,d350fa41-8714-4e81-bed7-a4a310dfce3d,420,919-620-1947,Food Lion #420,3808 Guess Road,,Durham,NC,27705,,...,True,,Flu Shot (Egg free),False,-1,2023-04-07,36.058269,-78.928471,seasonal,NC Durham


In [7]:
vaccines_data['category'].value_counts().to_frame()

Unnamed: 0,category
seasonal,245118


In [8]:
vaccines_data['supply_level'].value_counts().to_frame()


Unnamed: 0,supply_level
-1,207258
4,28463
3,8088
1,1309


In [9]:
vaccines_data.columns

Index(['provider_location_guid', 'loc_store_no', 'loc_phone', 'loc_name',
       'loc_admin_street1', 'loc_admin_street2', 'loc_admin_city',
       'loc_admin_state', 'loc_admin_zip', 'sunday_hours', 'monday_hours',
       'tuesday_hours', 'wednesday_hours', 'thursday_hours', 'friday_hours',
       'saturday_hours', 'web_address', 'pre_screen', 'insurance_accepted',
       'walkins_accepted', 'provider_notes', 'searchable_name', 'in_stock',
       'supply_level', 'quantity_last_updated', 'latitude', 'longitude',
       'category', 'state_city'],
      dtype='object')

In [10]:
# drop the columns that are not needed 'loc_store_no', 'latitude', 'longitude','category'
vac_new = vaccines_data.drop(['loc_store_no', 'latitude', 'longitude','category','supply_level'], axis=1)
vac_new.shape


(245118, 24)

In [11]:
vac_new['sunday_hours'] = 'sunday_hours:  '+ vac_new['sunday_hours'].astype(str) + '  |  '
vac_new['monday_hours'] = 'monday_hours:  '+ vac_new['monday_hours'].astype(str) + '  |  '
vac_new['tuesday_hours'] = 'tuesday_hours:  '+ vac_new['tuesday_hours'].astype(str) + '  |  '
vac_new['wednesday_hours'] = 'wednesday_hours:  '+ vac_new['wednesday_hours'].astype(str) + '  |  '
vac_new['thursday_hours'] = 'thursday_hours:  '+ vac_new['thursday_hours'].astype(str) + '  |  '
vac_new['friday_hours'] = 'friday_hours:  '+ vac_new['friday_hours'].astype(str) + '  |  '
vac_new['saturday_hours'] = 'saturday_hours:  '+ vac_new['saturday_hours'].astype(str)

# combine the hours into one column
vac_new['opening_hours'] = vac_new['sunday_hours'] + vac_new['monday_hours'] + vac_new['tuesday_hours'] 
+ vac_new['wednesday_hours'] + vac_new['thursday_hours'] + vac_new['friday_hours'] + vac_new['saturday_hours']

0         wednesday_hours:  8:00 AM - 5:00 PM  |  thursd...
1         wednesday_hours:  8:30 AM - 6:30 PM  |  thursd...
2         wednesday_hours:  9:00 AM - 6:00 PM  |  thursd...
3         wednesday_hours:  nan  |  thursday_hours:  nan...
4         wednesday_hours:  9:30 AM - 7:00 PM  |  thursd...
                                ...                        
245113    wednesday_hours:  08:00AM-09:00PM  |  thursday...
245114    wednesday_hours:  08:00AM-08:00PM  |  thursday...
245115    wednesday_hours:  9:00 AM - 7:00 PM  |  thursd...
245116    wednesday_hours:  9:00 AM - 7:00 PM  |  thursd...
245117    wednesday_hours:  8:00 am - 8:00 pm  |  thursd...
Length: 245118, dtype: object

In [12]:
vac_new['opening_hours'].head()

0    sunday_hours:  nan  |  monday_hours:  8:00 AM ...
1    sunday_hours:  Closed  |  monday_hours:  8:30 ...
2    sunday_hours:  Closed  |  monday_hours:  9:00 ...
3    sunday_hours:  nan  |  monday_hours:  nan  |  ...
4    sunday_hours:  Closed  |  monday_hours:  9:30 ...
Name: opening_hours, dtype: object

In [13]:
vac_new.drop(['sunday_hours', 'monday_hours', 'tuesday_hours','wednesday_hours','thursday_hours','friday_hours','saturday_hours'], axis=1, inplace=True)
vac_new.columns

Index(['provider_location_guid', 'loc_phone', 'loc_name', 'loc_admin_street1',
       'loc_admin_street2', 'loc_admin_city', 'loc_admin_state',
       'loc_admin_zip', 'web_address', 'pre_screen', 'insurance_accepted',
       'walkins_accepted', 'provider_notes', 'searchable_name', 'in_stock',
       'quantity_last_updated', 'state_city', 'opening_hours'],
      dtype='object')

In [14]:
# check data type
vac_new.dtypes

provider_location_guid    object
loc_phone                 object
loc_name                  object
loc_admin_street1         object
loc_admin_street2         object
loc_admin_city            object
loc_admin_state           object
loc_admin_zip             object
web_address               object
pre_screen                object
insurance_accepted        object
walkins_accepted          object
provider_notes            object
searchable_name           object
in_stock                    bool
quantity_last_updated     object
state_city                object
opening_hours             object
dtype: object

In [15]:
# save the new dataframe to a csv file
vac_new.to_csv('vaccines_processed.csv', index=False)

In [16]:
# subset for NC
vac_NC = vac_new.loc[vac_new['loc_admin_state'] == 'NC',:]

# save to a csv file
vac_NC.to_csv('vaccines_NC.csv', index=False)

## Create Database Using SQLite

In [17]:
connection = sqlite3.connect("Flu_Vaccines_Provider_NC.db")
cursor = connection.cursor()
cursor.execute("DROP TABLE IF EXISTS Vaccines")

# create a table

cursor.execute("""CREATE TABLE Vaccines (
guid VARCHAR, phone_number VARCHAR, provider_name VARCHAR, street_address1 VARCHAR,
street_address2 VARCHAR, city VARCHAR, state VARCHAR, zip VARCHAR, website VARCHAR, 
pre_sreening_required VARCHAR, insurance_accepted VARCHAR, walkins_accepted VARCHAR, 
provider_notes VARCHAR, searchable_name VARCHAR, in_stock VARCHAR, 
quantity_last_updated VARCHAR, state_city VARCHAR, hours VARCHAR)""") 

# insert data into the table
with open('/Users/liuxiaoquan/Documents/GitHub/Biostats821_Final_Project/vaccines_NC.csv', 'r') as f:
    next(f) # Skip the header row.
    for line in f:
        vaccines_line = line.strip().split(',')
        cursor.execute("INSERT INTO Vaccines VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                          vaccines_line[0:18])

connection.commit()
connection.close()

                       

In [18]:
# select data from the table
connection = sqlite3.connect("Flu_Vaccines_Provider_NC.db")
cursor = connection.cursor()
cursor.execute("SELECT * FROM Vaccines LIMIT 5")
data = cursor.fetchall()
print(data[0])

('0a2bcf19-9d62-41e0-b22e-cc6b6c371b63', '910-278-6050', 'Thomas Drugs #3438365', '7917 E OAK ISLAND DR', '', 'Oak Island', 'NC', '28465', '', '', '', '', '', 'Flu Shot (Egg free)', 'False', '2023-01-09', 'NC Oak Island', 'sunday_hours:  nan  |  monday_hours:  nan  |  tuesday_hours:  nan  |')
