In [17]:
!pip install torch



In [195]:
!pip install plotly



In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import calendar
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
import plotly.graph_objs as go
import plotly.offline as pyo

In [2]:
# load read data
df_hepatitis = pd.read_csv('hepatitis.csv')
df_measles = pd.read_csv('measles.csv')
df_mumps = pd.read_csv('mumps.csv')
df_pertussis = pd.read_csv('pertussis.csv')
df_rubella = pd.read_csv('rubella.csv')
df_smallpox = pd.read_csv('smallpox.csv')
df_hepatitis

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita
0,196601,AL,ALABAMA,HEPATITIS A,5,0.14
1,196601,AR,ARKANSAS,HEPATITIS A,11,0.58
2,196601,AZ,ARIZONA,HEPATITIS A,6,0.37
3,196601,CA,CALIFORNIA,HEPATITIS A,89,0.47
4,196601,CO,COLORADO,HEPATITIS A,1,0.05
...,...,...,...,...,...,...
90834,201152,VT,VERMONT,HEPATITIS A,0,0.00
90835,201152,WA,WASHINGTON,HEPATITIS A,0,0.00
90836,201152,WI,WISCONSIN,HEPATITIS A,0,0.00
90837,201152,WV,WEST VIRGINIA,HEPATITIS A,0,0.00


In [3]:
# taking care of data discrepencies 
dfs = [df_hepatitis, df_measles, df_mumps, df_pertussis, df_rubella, df_smallpox]

for i, df in enumerate(dfs):
    max_cases = df['cases'].max()  
    dfs[i] = df[df['cases'] != max_cases]  

In [4]:
class DiseasePredictor(nn.Module):
    def __init__(self):
        super(DiseasePredictor, self).__init__()
        self.fc1 = nn.Linear(in_features=2, out_features=64)  # Assuming 3 features for simplicity
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)  # Output 1 value: the number of cases
        
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [5]:
# Define a function to convert DataFrames to tensors
def df_to_tensors(df_features, df_target):
    features_tensor = torch.tensor(df_features.values, dtype=torch.float32)
    target_tensor = torch.tensor(df_target.values, dtype=torch.float32).view(-1, 1)
    return features_tensor, target_tensor

# Loop through each DataFrame
disease_dfs = {
    'Hepatitis': df_hepatitis,
    'Measles': df_measles,
    'Mumps': df_mumps,
    'Pertussis': df_pertussis,
    'Rubella': df_rubella,
    'Smallpox': df_smallpox
}

for disease, df in disease_dfs.items():
    print(f"\nProcessing {disease}")
    
    # Assume X and y are defined; you'll need to adapt this part to actually prepare X and y for each df
    X = df[['week', 'incidence_per_capita']]  # Placeholder: replace with actual features
    y = df['cases']
    
    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Convert to tensors
    X_train_tensor, y_train_tensor = df_to_tensors(X_train, y_train)
    X_val_tensor, y_val_tensor = df_to_tensors(X_val, y_val)
    
    # Initialize model and other components for each disease to avoid knowledge retention
    model = DiseasePredictor()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.MSELoss()
    
    # Training loop
    epochs = 5
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
    
    # Evaluation
    model.eval()
    with torch.no_grad():
        predictions = model(X_val_tensor)
        val_loss = criterion(predictions, y_val_tensor)
        print(f"{disease} Validation Loss: {val_loss.item()}")

    # Optionally, save each model with a disease-specific name
    torch.save(model.state_dict(), f'{disease.lower()}_model.pth')


Processing Hepatitis
Hepatitis Validation Loss: 42110420.0

Processing Measles
Measles Validation Loss: 25350924.0

Processing Mumps
Mumps Validation Loss: 5101577.0

Processing Pertussis
Pertussis Validation Loss: 126145.40625

Processing Rubella
Rubella Validation Loss: 76890.1328125

Processing Smallpox
Smallpox Validation Loss: 27919722.0


In [6]:
def predict_and_create_table(model, X_features, states):
    # Convert features to tensor
    features_tensor = torch.tensor(X_features.values, dtype=torch.float32)
    
    # Predict cases
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        predicted_cases_tensor = model(features_tensor)
    
    # Convert predictions to numpy array
    predicted_cases = predicted_cases_tensor.numpy().flatten()  # Adjust shape as necessary
    
    # Create DataFrame with state and predicted cases
    predicted_df = pd.DataFrame({
        'State': states,
        'Predicted Cases': predicted_cases
    })
    
    return predicted_df

# Assuming disease_dfs dictionary is already defined and filled with DataFrames for each disease

for disease, df in disease_dfs.items():
    print(f"\nProcessing {disease}")
    
    # Extract state information
    states = df['state']
    
    # Prepare features - ensure these match your model's expected input
    X = df[['week', 'incidence_per_capita']]
    
    # Initialize model - assuming a single model architecture for all diseases
    model = DiseasePredictor()
    # Load the trained model weights - replace 'your_model_path.pth' with the actual path
    model_path = f'{disease.lower()}_model.pth'
    model.load_state_dict(torch.load(model_path))
    
    # Predict cases and create table
    predicted_table = predict_and_create_table(model, X, states)
    
    # Print the table
    print(f"{disease} Predicted Cases by State:")
    print(predicted_table)


Processing Hepatitis
Hepatitis Predicted Cases by State:
      State  Predicted Cases
0        AL      6426.687012
1        AR      6426.750488
2        AZ      6426.720703
3        CA      6426.735352
4        CO      6426.675293
...     ...              ...
90834    VT      6575.432129
90835    WA      6575.432129
90836    WI      6575.432129
90837    WV      6575.432129
90838    WY      6575.432129

[90839 rows x 2 columns]

Processing Measles
Measles Predicted Cases by State:
       State  Predicted Cases
0         AL      5062.830566
1         AR      5062.849121
2         AZ      5062.762207
3         CA      5062.740234
4         CO      5063.017090
...      ...              ...
145162    NV      5258.346191
145163    NY      5258.346191
145164    OH      5258.346191
145165    TX      5258.346191
145166    UT      5258.346191

[145167 rows x 2 columns]

Processing Mumps
Mumps Predicted Cases by State:
      State  Predicted Cases
0        AK      2252.207764
1        AL      22

In [44]:
# # Convert X_train and y_train to PyTorch tensors
# X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

# # If y_train is a series, ensure it's reshaped into a 2D tensor for consistency
# y_train_tensor = y_train_tensor.view(-1, 1)

In [208]:
# epochs = 5  # Example epoch count
# for epoch in range(epochs):
#     optimizer.zero_grad()
#     outputs = model(X_train_tensor)  # Use the tensor version
#     loss = criterion(outputs, y_train_tensor)  # Use the tensor version
#     loss.backward()
#     optimizer.step()
#     print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [36]:
# torch.save(model.state_dict(), 'model.pth')

In [178]:
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
# X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [211]:
# X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
# y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
# y_val_tensor = y_val_tensor.view(-1, 1)

In [181]:
# model.eval()  # Set the model to evaluation mode
# with torch.no_grad():
#     predictions = model(X_val_tensor)  # Use the tensor version
#     val_loss = criterion(predictions, y_val_tensor)  # Use the tensor version
#     print(f"Validation Loss: {val_loss.item()}")

Validation Loss: 5908156.5


In [207]:
model_files = ['hepatitis_model.pth', 'measles_model.pth', 'mumps_model.pth', 
               'pertussis_model.pth', 'rubella_model.pth', 'smallpox_model.pth']
models = {}

for file_name in model_files:
    model = DiseasePredictor()
    model.load_state_dict(torch.load(file_name))
    model.eval()
    models[file_name] = model

In [199]:
def calculate_accuracy(model, X_val_tensor, y_val_tensor):
    with torch.no_grad():
        outputs = model(X_val_tensor)
        _, predicted_classes = torch.max(outputs, 1)
        correct_predictions = (predicted_classes == y_val_tensor).sum().item()
        accuracy = correct_predictions / y_val_tensor.size(0)
    return accuracy

datasets = {
    'hepatitis': df_hepatitis,
    'measles': df_measles,
    'mumps': df_mumps,
    'pertussis': df_pertussis,
    'rubella': df_rubella,
    'smallpox': df_smallpox
}

# validation_data = {
#     'hepatitis': (X_val_tensor_hepatitis, y_val_tensor_hepatitis),
#     'measles': (X_val_tensor_measles, y_val_tensor_measles),
#     'mumps': (X_val_tensor_mumps, y_val_tensor_mumps),
#     'pertussis': (X_val_tensor_pertussis, y_val_tensor_pertussis),
#     'rubella': (X_val_tensor_rubella, y_val_tensor_rubella),
#     'smallpox': (X_val_tensor_smallpox, y_val_tensor_smallpox)
# }

validation_tensors = {}

for disease, df in datasets.items():
    X = df['incidence_per_capita']  # Your actual feature columns
    y = df['cases']  # Or whatever your target column is
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Convert X_val, y_val to tensors
    X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)  # Adjust dtype if classification
    
    validation_tensors[disease] = (X_val_tensor, y_val_tensor)
    
for pth in model_files:
    disease_name = pth.split("_")[0]  # Assuming naming convention holds
    model = models[pth]
    
    X_val_tensor, y_val_tensor = validation_tensors[disease_name]  # Get the correct validation tensors
    
    # Assuming calculate_accuracy function definition remains the same
    accuracy = calculate_accuracy(model, X_val_tensor, y_val_tensor)
    print(f"{disease_name.capitalize()} Model Accuracy: {accuracy}")

# # Printing each model's accuracy
# for pth in model_files:
#     disease_name = pth.split("_")[0].capitalize()
#     model = models[pth]
#     X_val_disease, y_val_disease = validation_data[disease_name]
#     accuracy = calculate_accuracy(model, X_val_disease, y_val_disease)
#     print(f"{disease_name} Model Accuracy: {accuracy}\n")

In [206]:
all_states = [
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 
        'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 
        'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 
        'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
        'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

df_hepatitis = pd.DataFrame({'state': ['PA'], 'predicted_cases': [100]})
df_measles = pd.DataFrame({'state': [all_states], 'predicted_cases': [150]})
df_mumps = pd.DataFrame({'state': [all_states], 'predicted_cases': [20]})
df_pertussis = pd.DataFrame({'state': [all_states], 'predicted_cases': [250]})
df_rubella = pd.DataFrame({'state': [all_states], 'predicted_cases': [300]})
df_smallpox = pd.DataFrame({'state': [all_states], 'predicted_cases': [350]})

disease_dataframes = {
    'Hepatitis': df_hepatitis,
    'Measles': df_measles,
    'Mumps': df_mumps,
    'Pertussis': df_pertussis,
    'Rubella': df_rubella,
    'Smallpox': df_smallpox
}

# Ensure all DataFrames have a 'predicted_cases' column; add it with default values if missing
for disease_name, df in disease_dataframes.items():
    if 'predicted_cases' not in df.columns:
        df['predicted_cases'] = 0  # Assign a default value

# Function to create a heatmap for a given disease DataFrame
def create_heatmap(df, disease_name):
    # List of all US state abbreviations
    all_states = [
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 
        'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 
        'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 
        'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
        'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
    ]
    
    # Add missing states with default predicted_cases value
    missing_states = set(all_states) - set(df['state'])
    missing_data = [{'state': state, 'predicted_cases': 0} for state in missing_states]
    df = pd.concat([df, pd.DataFrame(missing_data)], ignore_index=True)
    
    # Create the heatmap
    fig = go.Figure(data=go.Choropleth(
        locations=df['state'],  # Spatial coordinates
        z=df['predicted_cases'].astype(float),  # Data to be color-coded
        locationmode='USA-states',  # set of locations match entries in `locations`
        colorscale='Reds',
        colorbar_title="Predicted Cases",
    ))

    fig.update_layout(
        title_text=f'Predicted {disease_name} Cases by State',
        geo_scope='usa',  # limit map scope to USA
    )

    # Save the plot as an HTML file
    filename = f'heatmap_{disease_name.lower()}.html'
    pyo.plot(fig, filename=filename)

# Generate and save a heatmap for each disease
for disease_name, df in disease_dataframes.items():
    create_heatmap(df, disease_name)

TypeError: unhashable type: 'list'

In [11]:
# df_hepatitis['year'] = df_hepatitis['week'].apply(lambda x: int(str(x)[:4]))
# df_hepatitis['week_of_year'] = df_hepatitis['week'].apply(lambda x: int(str(x)[4:]))

# # Use the 'year' and 'week_of_year' as features for now
# X = df_hepatitis[['year', 'week_of_year', 'incidence_per_capita']]
# y = df_hepatitis['cases']

# # Splitting the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# # Initialize and train the linear regression model
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Predict on the testing set
# y_pred = model.predict(X_test)

# # Evaluate the model
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# # Calculate R-squared on the training set
# r_squared_train = model.score(X_train, y_train)
# print(f"R-squared on the training set: {r_squared_train}")

# # Calculate R-squared on the testing set
# r_squared_test = model.score(X_test, y_test)
# print(f"R-squared on the testing set: {r_squared_test}")

# print(f"RMSE: {rmse}")

R-squared on the training set: 0.1599361581139458
R-squared on the testing set: 0.14466848839838797
RMSE: 20.72902526140426
