In [13]:
import pandas as pd

# Toy weekly case counts: two counties (FIPS) in one state, three weeks each.
# Rows are ordered by county, state and week right away so that the
# "following row" inside each county group is the following week.
data = pd.DataFrame(
    {
        "FIPS": [1001, 1001, 1001, 1003, 1003, 1003],
        "state": ["AL", "AL", "AL", "AL", "AL", "AL"],
        "Week": [1, 2, 3, 1, 2, 3],
        "cases": [10, 20, 30, 40, 50, 60],
    }
).sort_values(by=["FIPS", "state", "Week"])

# Pull each county's next-row value up by one position; the last week of a
# county has no successor and therefore becomes NaN.
data["next_week_cases"] = data.groupby(["FIPS", "state"])["cases"].shift(-1)

# 'cases' now means "this week's cases", so give it the explicit name.
data = data.rename(columns={"cases": "current_week_cases"})

# Final table with a fixed column order.
final_df = data.loc[
    :, ["FIPS", "state", "Week", "current_week_cases", "next_week_cases"]
]

print(final_df)


   FIPS state  Week  current_week_cases  next_week_cases
0  1001    AL     1                  10             20.0
1  1001    AL     2                  20             30.0
2  1001    AL     3                  30              NaN
3  1003    AL     1                  40             50.0
4  1003    AL     2                  50             60.0
5  1003    AL     3                  60              NaN


In [14]:
import pandas as pd

# Reshape the per-county weekly table into a wide frame for pgmpy:
# one row per Week and two columns per county, "<FIPS>_current" and "<FIPS>_next".
# Input is `final_df` from the previous cell, which must provide the columns
# FIPS, Week, current_week_cases and next_week_cases.
data = final_df

# Step 1: Build one long frame per variable kind.
# Each source row contributes two entries: one for _current and one for _next.
data_current = data[['FIPS', 'Week', 'current_week_cases']].copy()
data_next = data[['FIPS', 'Week', 'next_week_cases']].copy()

# Give both frames the same value column name so they can be stacked
data_current.columns = ['FIPS', 'Week', 'cases']
data_next.columns = ['FIPS', 'Week', 'cases']

# Label every row with the wide-format column it belongs to
data_current['variable'] = data_current['FIPS'].astype(str) + '_current'
data_next['variable'] = data_next['FIPS'].astype(str) + '_next'

# Concatenate both DataFrames vertically
pgmpy_data = pd.concat([data_current, data_next], ignore_index=True)

# Pivot to wide format.
# dropna=False keeps a column even when all of its values are NaN (for example the
# "<FIPS>_next" column of a county that has only a single week of data). With the
# default dropna=True that column would silently disappear, and any later step that
# expects both "<FIPS>_current" and "<FIPS>_next" would fail with a KeyError.
# With a single scalar `values` column the result already has flat column labels,
# so no extra flattening step is required.
pgmpy_data = pgmpy_data.pivot_table(
    index='Week',
    columns='variable',
    values='cases',
    aggfunc='first',
    dropna=False,
)

# The resulting DataFrame
print(pgmpy_data)


variable  1001_current  1001_next  1003_current  1003_next
Week                                                      
1                 10.0       20.0          40.0       50.0
2                 20.0       30.0          50.0       60.0
3                 30.0        NaN          60.0        NaN


In [15]:
# Show the column labels of the wide frame ("<FIPS>_current" / "<FIPS>_next")
print(pgmpy_data.columns)

Index(['1001_current', '1001_next', '1003_current', '1003_next'], dtype='object', name='variable')


In [16]:
import numpy as np

# Bin edges per discretized column, filled in by discretize_columns()
bin_edges_dict = {}


def discretize_columns(df, fip, n_bins=5):
    """Discretize one county's current/next columns into shared quantile bins.

    The columns ``f"{fip}_current"`` and ``f"{fip}_next"`` of ``df`` are replaced
    in place by ordered categoricals with labels ``"bin_1"``, ``"bin_2"``, ...
    Both columns use the same bin edges, computed from the pooled non-missing
    values of the two columns. The edges are also recorded in the module-level
    ``bin_edges_dict`` under both column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns; it is modified in place.
    fip : str or int
        Prefix that identifies the column pair.
    n_bins : int, default 5
        Number of quantile bins requested. Fewer bins result when several
        quantiles coincide, because duplicate edges are removed.

    Returns
    -------
    None
    """
    current_col = f"{fip}_current"
    next_col = f"{fip}_next"

    # Pool both columns so current and next week share one binning scheme
    combined_series = pd.concat([df[current_col], df[next_col]]).dropna()

    # Quantile edges; with no observed values every quantile is NaN, so drop
    # NaN before de-duplicating (NaN values do not collapse inside a set)
    bin_edges = combined_series.quantile(np.linspace(0, 1, n_bins + 1))
    unique_edges = sorted(bin_edges.dropna().unique())

    # Store bin edges in dictionary
    bin_edges_dict[current_col] = unique_edges
    bin_edges_dict[next_col] = unique_edges

    if len(unique_edges) < 2:
        # Degenerate case: all observed values are identical (one edge) or there
        # are no observed values at all (no edges). A single edge defines no
        # interval for pd.cut, so label every observed value "bin_1" directly
        # and keep missing values missing.
        labels = ["bin_1"] if unique_edges else []
        for col in (current_col, next_col):
            marks = np.where(df[col].notna().to_numpy(), "bin_1", None)
            df[col] = pd.Categorical(marks, categories=labels, ordered=True)
        return

    # Discretize both columns using the same bin edges; include_lowest=True
    # puts the minimum value into the first bin instead of leaving it unlabeled
    labels = [f"bin_{i+1}" for i in range(len(unique_edges) - 1)]
    for col in (current_col, next_col):
        df[col] = pd.cut(df[col], bins=unique_edges, labels=labels, include_lowest=True)

# Apply the discretization function to each FIP group.
# Column labels have the form "<FIPS>_current" / "<FIPS>_next": strip only the
# trailing suffix (rsplit keeps ids that themselves contain an underscore intact)
# and de-duplicate in column order. Iterating a set of strings can yield a
# different order on every kernel start, which made the column order of
# bin_edges_df vary between runs.
fips = list(dict.fromkeys(col.rsplit('_', 1)[0] for col in pgmpy_data.columns))
for fip in fips:
    discretize_columns(pgmpy_data, fip)

# Create a DataFrame from the bin edges dictionary: one column per discretized
# variable, one row per edge (shorter edge lists are padded with NaN)
bin_edges_df = pd.DataFrame.from_dict(bin_edges_dict, orient='index').T

# The resulting DataFrames
print(pgmpy_data)
print(bin_edges_df)

variable 1001_current 1001_next 1003_current 1003_next
Week                                                  
1               bin_1     bin_2        bin_1     bin_2
2               bin_2     bin_4        bin_2     bin_4
3               bin_4       NaN        bin_4       NaN
   1003_current  1003_next  1001_current  1001_next
0          40.0       40.0          10.0       10.0
1          48.0       48.0          18.0       18.0
2          50.0       50.0          20.0       20.0
3          54.0       54.0          24.0       24.0
4          60.0       60.0          30.0       30.0
