# Importing Libraries

In [None]:
import pandas as pd
# Raise pandas' display limits so wide/long frames are shown in full in cell output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
# Turn off the chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import geopandas as gpd
from scipy import stats

# Reading Data

In [None]:
# Load the immunization dataset; the path is relative to the notebook's working directory.
csv_path = 'immunization_data.csv'
data = pd.read_csv(csv_path)
# Show the first rows as the cell output to sanity-check the load.
data.head()

# Data Preprocessing

In [None]:
# Handling missing values
# Only numeric columns are zero-filled. A blanket `fillna(0)` would also write the
# integer 0 into text, category and date columns, which yields mixed str/int class
# labels and bogus dates further down the notebook.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].fillna(0)

# Converting categorical columns
# Missing labels become an explicit 'Unknown' level so the category values stay
# homogeneous. Going through `object` first keeps this cell re-runnable: filling
# an existing categorical with a value outside its categories would raise.
categorical_columns = ['Reported', 'Exemption_Type']
for col in categorical_columns:
    data[col] = data[col].astype(object).fillna('Unknown').astype('category')

# Normalizing numerical features
numerical_columns = ['Percent_complete_for_all_immunizations', 'K_12_enrollment']
# Zero-fill the model features explicitly as well, in case pandas did not infer a
# numeric dtype for them and the numeric fill above skipped them.
data[numerical_columns] = data[numerical_columns].fillna(0)
# NOTE(review): the scaler is fit on the full dataset, i.e. before the train/test
# split performed in the modeling cell, so test-set min/max influence the features.
scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Exploratory Data Analysis (EDA)

In [None]:
# Distribution of immunization completion rates
# (the column has already been min-max scaled by the preprocessing cell)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data['Percent_complete_for_all_immunizations'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Immunization Completion Rates')
ax.set_xlabel('Completion Rate')
ax.set_ylabel('Frequency')
plt.show()

# Correlation heatmap
# Correlate numeric columns only: since pandas 2.0 `DataFrame.corr()` no longer
# drops non-numeric columns silently and raises on string/category data.
numeric_corr = data.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Time-Series Analysis

In [None]:
# Ensure Date column exists
if 'Date' in data.columns:
    # The preprocessing cell may have replaced missing dates with the integer 0.
    # Mask those placeholders so they become NaT instead of a bogus timestamp.
    raw_dates = data['Date'].where(data['Date'] != 0)
    data['Date'] = pd.to_datetime(raw_dates)

    # Average completion rate per calendar year, ignoring rows without a date.
    dated = data.dropna(subset=['Date'])
    time_series_data = dated.groupby(dated['Date'].dt.year)['Percent_complete_for_all_immunizations'].mean()

    if time_series_data.empty:
        # Nothing to draw; plotting an empty series would fail.
        print('No valid dates available for the time-series plot.')
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        time_series_data.plot(marker='o', ax=ax)
        ax.set_title('Immunization Completion Rate Over Time')
        ax.set_xlabel('Year')
        ax.set_ylabel('Average Completion Rate')
        ax.grid()
        plt.show()

# Geospatial Analysis

In [None]:
# Ensure geospatial columns exist
if 'Latitude' in data.columns and 'Longitude' in data.columns:
    # Missing coordinates were zero-filled during preprocessing, so a (0, 0) pair is
    # treated as "no location" and left off the map instead of being drawn in the
    # Gulf of Guinea. NOTE(review): assumes no record is genuinely located at (0, 0).
    located = data[(data['Latitude'] != 0) | (data['Longitude'] != 0)].copy()
    geo_data = gpd.GeoDataFrame(
        located,
        geometry=gpd.points_from_xy(located.Longitude, located.Latitude),
        crs='EPSG:4326',  # plain longitude/latitude degrees
    )

    # `gpd.datasets` is deprecated in geopandas 0.14 and raises AttributeError in
    # 1.0+. Fall back to the Natural Earth download (requires network access).
    try:
        world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    except AttributeError:
        world = gpd.read_file(
            'https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip'
        )

    fig, ax = plt.subplots(figsize=(15, 10))
    world.boundary.plot(ax=ax, linewidth=1)
    geo_data.plot(ax=ax, column='Percent_complete_for_all_immunizations', legend=True, cmap='OrRd')
    ax.set_title('Geospatial Distribution of Immunization Completion Rates')
    plt.show()

# Predictive Modeling

In [None]:
# Feature matrix (the two scaled numeric columns) and target label.
feature_columns = ['Percent_complete_for_all_immunizations', 'K_12_enrollment']
X = data[feature_columns]
y = data['Reported']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Both classifiers go through the same fit -> predict -> report sequence,
# logistic regression first, then the random forest.
lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=100, random_state=42)

test_predictions = []
for report_title, model in (
    ('Logistic Regression Classification Report:', lr),
    ('Random Forest Classification Report:', rf),
):
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    test_predictions.append(predicted)
    print(report_title)
    print(classification_report(y_test, predicted))

# Keep the per-model prediction arrays available under their notebook-level names.
y_pred, y_pred_rf = test_predictions

# Final Insights

In [None]:
# Closing summary, printed one line at a time.
# NOTE(review): these statements are hard-coded text; nothing in this cell derives
# them from the results computed above, so confirm they match the actual outputs.
insight_lines = (
    'Key Insights:',
    '- Immunization completion rates show strong correlations with K-12 enrollment.',
    '- Predictive models identified critical factors influencing immunization compliance.',
    '- Geospatial mapping highlights regions with lower compliance for targeted intervention.',
)
print(*insight_lines, sep='\n')