# Exploratory Analysis of California State Board of Pharmacy Sterile Compounding Licenses

The pharmacy data used in this analysis was retrieved from the California State Board of Pharmacy License Verification Database on 11/24/2024 at 2:53 PM CST. Please visit https://search.dca.ca.gov/results for more information.

## Notebook Setup

In [58]:
pip install matplotlib

In [60]:
import pandas as pd
import matplotlib.pyplot as plt


# Relax pandas' display limits so wide dataframes are easier to read in cell output.
display_options = {
    'display.max_columns': None,  # no limit on the number of columns shown
    'display.max_rows': 100,      # row display limit
    'display.width': 1000,        # wider output before lines wrap
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# To go back to the pandas defaults, run: pd.reset_option('all')

## Load & Clean Pharmacy Data

In [86]:
# Load the pharmacy CSV
pharmacies = pd.read_csv('attempt_19.csv')

# Drop exact duplicate rows
pharmacies = pharmacies.drop_duplicates()

# Drop the accidental column
pharmacies = pharmacies.drop(columns=['CEDARS-SINAI MEDICAL CENTER -4TH FLOOR MAIN'])

# Drop rows with missing values.
# This must run BEFORE Zip is cast to string: astype(str) turns NaN into the
# literal text 'nan', which dropna() no longer recognises as missing.
pharmacies = pharmacies.dropna()

# Cast Zip as string and strip the trailing '.0' left behind when the column
# was parsed as a float.
# NOTE(review): a Zip parsed as a number has already lost any leading zero;
# if out-of-state ZIP codes matter, consider reading the column as text.
pharmacies['Zip'] = pharmacies['Zip'].astype(str).str.replace(r'\.0$', '', regex=True)

# Preview the cleaned data (last expression in the cell, so the notebook renders it)
pharmacies.head()



## Inspect Pharmacy Data

The cells below provide summary statistics and overviews of the entire dataset.

In [87]:
# Review summary statistics for the cleaned dataframe.
# With no arguments, describe() picks which columns to summarise from their dtypes.
pharmacies.describe()

In [105]:
# Flag every row whose License Number occurs more than once.
# keep=False marks ALL occurrences of a repeated number, including the first.
is_repeated_license = pharmacies.duplicated(subset='License Number', keep=False)
duplicated_license_numbers = pharmacies[is_repeated_license]

# Last expression in the cell, so the notebook renders the matching rows
duplicated_license_numbers


In [88]:
# Print a concise overview of the dataframe: column dtypes, non-null counts and memory usage
pharmacies.info()

In [89]:
# Inspect the dtype of each column
pharmacies.dtypes

In [90]:
# Count how many rows share each Pharmacy Name (most frequent first)
pharmacies['Pharmacy Name'].value_counts()

In [91]:
# Count null values per column (the cleaning cell already called dropna(),
# so this is a sanity check that nothing missing remains)
pharmacies.isnull().sum()

## Exploring the Pharmacy Data Visually

The cells below are an initial visual analysis of the pharmacy data using matplotlib.pyplot.

In [95]:
import matplotlib.pyplot as plt

# Tally rows per License Type, largest count first
license_counts = pharmacies['License Type'].value_counts(ascending=False)

# Tall figure so the long license-type labels fit on the y-axis
license_fig, license_ax = plt.subplots(figsize=(10, 8))

# One horizontal bar per license type
bars = license_ax.barh(
    license_counts.index,
    license_counts.values,
    color='skyblue',
    edgecolor='black',
)

# Title and axis labels
license_ax.set_title('Histogram of License Type', fontsize=16)
license_ax.set_xlabel('Count', fontsize=14)
license_ax.set_ylabel('Type', fontsize=14)

# Write each bar's count just past its end, centred vertically on the bar
for rect in bars:
    bar_length = rect.get_width()
    license_ax.text(
        bar_length + 0.5,
        rect.get_y() + rect.get_height() / 2,
        f'{int(bar_length)}',
        va='center',
        fontsize=10,
    )

# Let matplotlib make room for the long labels, then render
plt.tight_layout()
plt.show()


In [99]:
import matplotlib.pyplot as plt

# Descriptive statistics, transposed so each dataframe column becomes a row
summary = pharmacies.describe()
summary_transposed = summary.T

# Keep only the `count` and `freq` statistics (columns after the transpose).
# NOTE(review): describe() only reports `freq` for non-numeric data, so this
# selection assumes the summary above is the categorical one.
summary_subset = summary_transposed.loc[:, ['count', 'freq']]

# Grouped horizontal bars: one pair (count, freq) per feature
stats_ax = summary_subset.plot.barh(
    figsize=(12, 8),
    color=['skyblue', 'orange'],
    edgecolor='black',
)

# Title and axis labels
stats_ax.set_title('Descriptive Statistics for Categorical Data', fontsize=16)
stats_ax.set_xlabel('Value', fontsize=14)
stats_ax.set_ylabel('Features', fontsize=14)

# Tidy the layout, add the legend, then render
plt.tight_layout()
stats_ax.legend(title='Statistics', fontsize=12)
plt.show()


In [104]:
import matplotlib.pyplot as plt

# Take the 25 most frequent Pharmacy Names, then order them smallest-to-largest
# for plotting
name_counts = pharmacies['Pharmacy Name'].value_counts()
top_pharmacies = name_counts.head(25).sort_values(ascending=True)

# Horizontal bar chart of the name frequencies
names_fig, names_ax = plt.subplots(figsize=(12, 8))
names_ax.barh(
    top_pharmacies.index,
    top_pharmacies.values,
    color='skyblue',
    edgecolor='black',
)

# Title and axis labels
names_ax.set_title('Top 25 Pharmacy Names by Frequency', fontsize=16)
names_ax.set_xlabel('Count of Pharmacy Name', fontsize=14)
names_ax.set_ylabel('Pharmacy Name', fontsize=14)

# Print each count just to the right of its bar
for bar_position, name_count in enumerate(top_pharmacies.values):
    names_ax.text(
        name_count + 0.5,
        bar_position,
        str(name_count),
        va='center',
        fontsize=10,
    )

# Tidy the layout and render
plt.tight_layout()
plt.show()


In [100]:
import matplotlib.pyplot as plt

# Number of distinct values in each column, largest first
unique_counts = pharmacies.nunique().sort_values(ascending=False)

# Horizontal bar chart with one bar per column
unique_fig, unique_ax = plt.subplots(figsize=(12, 8))
unique_ax.barh(
    unique_counts.index,
    unique_counts.values,
    color='skyblue',
    edgecolor='black',
)

# Title and axis labels
unique_ax.set_title('Number of Unique Values per Column', fontsize=16)
unique_ax.set_xlabel('Number of Unique Values', fontsize=14)
unique_ax.set_ylabel('Columns', fontsize=14)

# Print each distinct-value count just to the right of its bar
for bar_position, distinct_total in enumerate(unique_counts.values):
    unique_ax.text(
        distinct_total + 0.5,
        bar_position,
        str(distinct_total),
        va='center',
        fontsize=10,
    )

# Tidy the layout and render
plt.tight_layout()
plt.show()


In [96]:
# Plot the distribution of License Status using 20 bins.
# NOTE(review): if License Status holds text labels, value_counts() with a bar
# chart may read better than hist() — confirm the column's dtype.
status_ax = pharmacies['License Status'].hist(bins=20)
status_ax.set_title('Histogram of License Status')
status_ax.set_xlabel('Status')
status_ax.set_ylabel('Count')
plt.show()

In [118]:
# Parse Expiration Date; values that cannot be parsed become NaT (errors='coerce')
pharmacies['Expiration Date'] = pd.to_datetime(pharmacies['Expiration Date'], errors='coerce')

# Reduce each date to its year-month period
pharmacies['Expiration Month'] = pharmacies['Expiration Date'].dt.to_period('M')

# Number of expirations in each month, in chronological order
expiration_by_month = pharmacies['Expiration Month'].value_counts().sort_index()

# Line chart of the monthly counts
expiry_fig, expiry_ax = plt.subplots(figsize=(12, 6))
expiration_by_month.plot(ax=expiry_ax, kind='line', marker='o', color='skyblue')

# Title and axis labels
expiry_ax.set_title('Number of Expirations per Month', fontsize=16)
expiry_ax.set_xlabel('Month', fontsize=14)
expiry_ax.set_ylabel('Number of Expirations', fontsize=14)

# Tidy the layout and render
plt.tight_layout()
plt.show()


In [127]:
pip install plotly

In [128]:
# Count pharmacies per state as a two-column frame: State, Count
pharmacies_by_state = (
    pharmacies['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Count')
)

print(pharmacies_by_state.head())

import plotly.express as px

# Choropleth of the per-state counts.
# NOTE(review): locationmode='USA-states' matches two-letter state codes, while
# other cells compare State to full names ('California') — confirm this map is
# not rendering empty.
fig = px.choropleth(
    pharmacies_by_state,
    title='Pharmacies by State',
    scope='usa',                     # restrict the map to the USA
    locations='State',               # column supplying the location key
    locationmode='USA-states',       # interpret keys as US states
    color='Count',                   # value that drives the shading
    color_continuous_scale='Blues',  # shading palette
)

# Render the figure
fig.show()


In [134]:
# Lookup from full US state name to its two-letter postal abbreviation (50 states)
state_name_to_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA', 'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# Translate each State value into its abbreviation.
# Names missing from the lookup map to NaN and therefore get no location on the map.
state_abbreviations = pharmacies_by_state['State'].map(state_name_to_abbrev)
pharmacies_by_state['Abbreviation'] = state_abbreviations

import plotly.express as px

# Choropleth of the per-state counts, keyed on the two-letter abbreviation
fig = px.choropleth(
    pharmacies_by_state,
    title='Pharmacies by State',
    scope='usa',                     # restrict the map to the USA
    locations='Abbreviation',        # column supplying the state codes
    locationmode='USA-states',       # interpret keys as US state codes
    color='Count',                   # value that drives the shading
    color_continuous_scale='Blues',  # shading palette
)

# Render the figure
fig.show()



In [136]:
pip install pgeocode

In [145]:
import pgeocode
import pandas as pd
import plotly.express as px

# Keep only pharmacies located in California
pharmacies_in_california = pharmacies[pharmacies['State'] == 'California']

# Count the number of pharmacies per Zip code
pharmacies_by_zip = pharmacies_in_california['Zip'].value_counts().reset_index()
pharmacies_by_zip.columns = ['Zip', 'Count']

# Look up latitude/longitude for all Zip codes in a single batch call.
# Given a list, query_postal_code returns one row per code in input order;
# codes it does not recognise come back with NaN coordinates. NaN is truthy,
# so missing coordinates are removed with dropna() below rather than with an
# `if latitude and longitude` test.
nomi = pgeocode.Nominatim('us')
geo_info = nomi.query_postal_code(pharmacies_by_zip['Zip'].astype(str).tolist())
latitudes = geo_info['latitude'].to_numpy()
longitudes = geo_info['longitude'].to_numpy()

# Add lat/lon to the DataFrame (plain arrays, so no index alignment is involved)
pharmacies_by_zip['Latitude'] = latitudes
pharmacies_by_zip['Longitude'] = longitudes

# Drop rows where lat/lon are missing (invalid or unrecognized Zip codes)
pharmacies_by_zip.dropna(subset=['Latitude', 'Longitude'], inplace=True)

# Debug: print the first few rows to verify lat/lon
print(pharmacies_by_zip.head())

# Scatter geo plot: one point per Zip, sized and coloured by pharmacy count
fig = px.scatter_geo(
    pharmacies_by_zip,
    lat='Latitude',
    lon='Longitude',
    size='Count',  # Size of the point represents the number of pharmacies
    hover_name='Zip',  # Hover shows the Zip code
    title='Pharmacies Distribution by Zip in California',
    color='Count',  # Color points by count
    color_continuous_scale='Blues',
    scope='usa'  # Keep USA scope for state outlines
)

# Zoom the map onto California (latitude/longitude boundaries)
fig.update_geos(
    visible=False,  # Hide the default base-map layers
    resolution=50,  # Higher resolution for better detail
    projection_type="mercator",  # Mercator projection for detailed zoom
    center={"lat": 37.5, "lon": -119.5},  # Approximate geographic center of California
    lataxis_range=[32, 42],  # Latitude bounds for California
    lonaxis_range=[-125, -114],  # Longitude bounds for California
)

# Show the plot
fig.show()


In [146]:
import pgeocode
import pandas as pd
import plotly.express as px

# Keep only pharmacies located in California
pharmacies_in_california = pharmacies[pharmacies['State'] == 'California']

# Count the number of pharmacies per Zip code
pharmacies_by_zip = pharmacies_in_california['Zip'].value_counts().reset_index()
pharmacies_by_zip.columns = ['Zip', 'Count']

# Look up latitude/longitude for all Zip codes in a single batch call.
# Given a list, query_postal_code returns one row per code in input order;
# codes it does not recognise come back with NaN coordinates. NaN is truthy,
# so missing coordinates are removed with dropna() below rather than with an
# `if latitude and longitude` test.
nomi = pgeocode.Nominatim('us')
geo_info = nomi.query_postal_code(pharmacies_by_zip['Zip'].astype(str).tolist())
latitudes = geo_info['latitude'].to_numpy()
longitudes = geo_info['longitude'].to_numpy()

# Add lat/lon to the DataFrame (plain arrays, so no index alignment is involved)
pharmacies_by_zip['Latitude'] = latitudes
pharmacies_by_zip['Longitude'] = longitudes

# Drop rows where lat/lon are missing (invalid or unrecognized Zip codes)
pharmacies_by_zip.dropna(subset=['Latitude', 'Longitude'], inplace=True)

# Report how many Zip codes survived geocoding, and preview them
print(f"Data points left for plotting: {len(pharmacies_by_zip)}")
print(pharmacies_by_zip.head())  # Optional: Print first few rows to verify lat/lon

# Ensure lat/lon are numeric (float) for plotting
pharmacies_by_zip['Latitude'] = pd.to_numeric(pharmacies_by_zip['Latitude'], errors='coerce')
pharmacies_by_zip['Longitude'] = pd.to_numeric(pharmacies_by_zip['Longitude'], errors='coerce')

# Basic scatter geo plot (without specific zoom settings)
fig = px.scatter_geo(
    pharmacies_by_zip,
    lat='Latitude',
    lon='Longitude',
    size='Count',  # Size of the point represents the number of pharmacies
    hover_name='Zip',  # Hover shows the Zip code
    title='Pharmacies Distribution by Zip in California',
    color='Count',  # Color points by count
    color_continuous_scale='Blues',
    scope='usa',  # Keep USA scope for state outlines
)

# Show the plot
fig.show()


In [148]:
import pgeocode
import pandas as pd
import plotly.express as px

# Keep only pharmacies located in California
pharmacies_in_california = pharmacies[pharmacies['State'] == 'California']

# Count the number of pharmacies per Zip code
pharmacies_by_zip = pharmacies_in_california['Zip'].value_counts().reset_index()
pharmacies_by_zip.columns = ['Zip', 'Count']

# Look up latitude/longitude for all Zip codes in a single batch call.
# Given a list, query_postal_code returns one row per code in input order;
# codes it does not recognise come back with NaN coordinates. NaN is truthy,
# so missing coordinates are removed with dropna() below rather than with an
# `if latitude and longitude` test.
nomi = pgeocode.Nominatim('us')
geo_info = nomi.query_postal_code(pharmacies_by_zip['Zip'].astype(str).tolist())
latitudes = geo_info['latitude'].to_numpy()
longitudes = geo_info['longitude'].to_numpy()

# Add lat/lon to the DataFrame (plain arrays, so no index alignment is involved)
pharmacies_by_zip['Latitude'] = latitudes
pharmacies_by_zip['Longitude'] = longitudes

# Drop rows where lat/lon are missing (invalid or unrecognized Zip codes)
pharmacies_by_zip.dropna(subset=['Latitude', 'Longitude'], inplace=True)

# Report how many Zip codes survived geocoding, and preview them
print(f"Data points left for plotting: {len(pharmacies_by_zip)}")
print(pharmacies_by_zip.head())  # Optional: Print first few rows to verify lat/lon

# Ensure lat/lon are numeric (float) for plotting
pharmacies_by_zip['Latitude'] = pd.to_numeric(pharmacies_by_zip['Latitude'], errors='coerce')
pharmacies_by_zip['Longitude'] = pd.to_numeric(pharmacies_by_zip['Longitude'], errors='coerce')

# Scatter geo plot: one point per Zip, sized and coloured by pharmacy count
fig = px.scatter_geo(
    pharmacies_by_zip,
    lat='Latitude',
    lon='Longitude',
    size='Count',  # Size of the point represents the number of pharmacies
    hover_name='Zip',  # Hover shows the Zip code
    title='Pharmacies Distribution by Zip in California',
    color='Count',  # Color points by count
    color_continuous_scale='Blues'
)

# Zoom the map onto California (latitude/longitude boundaries)
fig.update_geos(
    visible=True,  # Keep the default base-map layers visible
    resolution=50,  # Higher resolution for better detail
    projection_type="mercator",  # Mercator projection for detailed zoom
    center={"lat": 37.5, "lon": -119.5},  # Approximate geographic center of California
    lataxis_range=[32, 42],  # Latitude bounds for California
    lonaxis_range=[-125, -114],  # Longitude bounds for California
)

# Show the plot
fig.show()


In [149]:
import pgeocode
import pandas as pd
import plotly.express as px

# Keep only pharmacies located in California
pharmacies_in_california = pharmacies[pharmacies['State'] == 'California']

# Count the number of pharmacies per Zip code
pharmacies_by_zip = pharmacies_in_california['Zip'].value_counts().reset_index()
pharmacies_by_zip.columns = ['Zip', 'Count']

# Look up latitude/longitude for all Zip codes in a single batch call.
# Given a list, query_postal_code returns one row per code in input order;
# codes it does not recognise come back with NaN coordinates. NaN is truthy,
# so missing coordinates are removed with dropna() below rather than with an
# `if latitude and longitude` test.
nomi = pgeocode.Nominatim('us')
geo_info = nomi.query_postal_code(pharmacies_by_zip['Zip'].astype(str).tolist())
latitudes = geo_info['latitude'].to_numpy()
longitudes = geo_info['longitude'].to_numpy()

# Add lat/lon to the DataFrame (plain arrays, so no index alignment is involved)
pharmacies_by_zip['Latitude'] = latitudes
pharmacies_by_zip['Longitude'] = longitudes

# Drop rows where lat/lon are missing (invalid or unrecognized Zip codes)
pharmacies_by_zip.dropna(subset=['Latitude', 'Longitude'], inplace=True)

# Report how many Zip codes survived geocoding, and preview them
print(f"Data points left for plotting: {len(pharmacies_by_zip)}")
print(pharmacies_by_zip.head())  # Optional: Print first few rows to verify lat/lon

# Ensure lat/lon are numeric (float) for plotting
pharmacies_by_zip['Latitude'] = pd.to_numeric(pharmacies_by_zip['Latitude'], errors='coerce')
pharmacies_by_zip['Longitude'] = pd.to_numeric(pharmacies_by_zip['Longitude'], errors='coerce')

# Scatter geo plot: one point per Zip, sized and coloured by pharmacy count
fig = px.scatter_geo(
    pharmacies_by_zip,
    lat='Latitude',
    lon='Longitude',
    size='Count',  # Size of the point represents the number of pharmacies
    hover_name='Zip',  # Hover shows the Zip code
    title='Pharmacies Distribution by Zip in California',
    color='Count',  # Color points by count
    color_continuous_scale='Blues',  # This can be modified for better contrast
)

# Zoom the map onto California and adjust the base-map colours
fig.update_geos(
    visible=True,  # Keep the default base-map layers visible
    resolution=50,  # Higher resolution for better detail
    projection_type="mercator",  # Mercator projection for detailed zoom
    center={"lat": 37.5, "lon": -119.5},  # Approximate geographic center of California
    lataxis_range=[32, 42],  # Latitude bounds for California
    lonaxis_range=[-125, -114],  # Longitude bounds for California
    landcolor="#f5e1a4",  # Set land color to light straw (beige) color
    showcoastlines=True,
    coastlinecolor="black",  # Black coastline for better contrast
)

# Show the plot
fig.show()


In [152]:
# Print the column names and the first rows of county_geo to inspect its structure.
# NOTE(review): `county_geo` is not defined in any cell shown in this notebook —
# confirm the cell that loads it still exists, otherwise this raises NameError.
print(county_geo.columns)
print(county_geo.head())