# Part 1 Data Analysis:
## Understanding the Data

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
# Every CSV export that makes up the dataset
csv_files = [
    '/content/supplies.csv',
    '/content/providers.csv',
    '/content/procedures.csv',
    '/content/payers.csv',
    '/content/payer_transitions.csv',
    '/content/patients.csv',
    '/content/organizations.csv',
    '/content/medications.csv',
    '/content/observations.csv',
    '/content/immunizations.csv',
    '/content/imaging_studies.csv',
    '/content/encounters.csv',
    '/content/conditions.csv',
    '/content/devices.csv',
    '/content/careplans.csv',
    '/content/allergies.csv'
]
#-------------------------------------------------------------------------------
# Load each CSV into a DataFrame. The dictionary key is the bare file name
# (directory and 4-character '.csv' suffix removed), e.g. 'patients'.
dataframes = {}
for file in csv_files:
    key = file.rpartition('/')[2][:-4]
    dataframes[key] = pd.read_csv(file)
#-------------------------------------------------------------------------------
# Preview the first rows of the patients table
display(dataframes['patients'].head())


In [None]:
# Profile every loaded DataFrame: preview, shape, dtypes, summary stats, nulls
for key, df in dataframes.items():
    print(f"--- {key.upper()} DataFrame ---")

    # First 10 rows
    print("\nFirst 10 rows:")
    display(df.head(10))

    # Shape (rows, columns). print() is used because display() of a str
    # renders its repr, i.e. with quotes and a literal "\n".
    print(f"\nShape: {df.shape}")

    # Column names and their data types
    print("\nData types:")
    display(df.dtypes)

    # Summary statistics as produced by DataFrame.describe()
    print("\nBasic statistics:")
    display(df.describe())

    # Number of missing values in each column
    print("\nMissing values:")
    display(df.isnull().sum())

    print("\n")

## Analysis: 
### Summary of each DataFrame:

1- SUPPLIES DataFrame:
* The DataFrame is empty with 0 rows and 6 columns (DATE, PATIENT, ENCOUNTER, CODE, DESCRIPTION, QUANTITY).
* All columns are of the 'object' data type.
* There are no missing values.

2- PROVIDERS DataFrame:
* Contains 5855 rows and 12 columns (Id, ORGANIZATION, NAME, GENDER, SPECIALITY, ADDRESS, CITY, STATE, ZIP, LAT, LON, UTILIZATION).
* Data types are mostly 'object', except for 'LAT', 'LON', and 'UTILIZATION' which are 'float64' and 'int64', respectively.
* No missing values.

3- PROCEDURES DataFrame:
* Contains 34,981 rows and 8 columns (DATE, PATIENT, ENCOUNTER, CODE, DESCRIPTION, BASE_COST, REASONCODE, REASONDESCRIPTION).
* Data types include 'object', 'int64', 'float64'.
* Missing values are present in 'REASONCODE' and 'REASONDESCRIPTION' columns.

4- PAYERS DataFrame:
* Contains 10 rows and 21 columns.
* Data types are mostly 'object', with some 'float64' and 'int64'.
* Missing values are present in the columns 'ADDRESS', 'CITY', 'STATE_HEADQUARTERED', 'ZIP', and 'PHONE'.

5- PAYER_TRANSITIONS DataFrame:
* Contains 3,801 rows and 5 columns (PATIENT, START_YEAR, END_YEAR, PAYER, OWNERSHIP).
* All columns are of the 'object' data type, except for 'START_YEAR' and 'END_YEAR' which are 'int64'.
* No missing values.

6- ORGANIZATIONS DataFrame:
* Shape: (1119, 11) with a mix of 'object', 'float64', and 'int64' data types.
* Missing values: PHONE - 184, suggesting incomplete contact information for some organizations.
* Key numerical features: LAT (latitude), LON (longitude), REVENUE, UTILIZATION (number of services provided).

7- MEDICATIONS DataFrame:
* Shape: (42989, 13) with a combination of 'object', 'float64', and 'int64' data types.
* Missing values: STOP - 1895 (end date of medication), REASONCODE - 11117, REASONDESCRIPTION - 11117 (both indicating reasons for medication).
* Key numerical features: CODE (medication identifier), BASE_COST, PAYER_COVERAGE, DISPENSES, TOTALCOST, REASONCODE.

8- OBSERVATIONS DataFrame:
* Shape: (299697, 8) with mostly 'object' data types and some 'float64' and 'int64'.
* Missing values: ENCOUNTER - 30363, UNITS - 12735 (units of measurement for values).
* Key categorical features: CODE, DESCRIPTION, VALUE, UNITS, TYPE (categorizing different types of observations).

9- IMMUNIZATIONS DataFrame:
* Shape: (15478, 6) with 'object', 'float64', and 'int64' data types.
* Missing values: None
* Key numerical features: CODE (unique identifier for immunizations), BASE_COST (cost of each immunization).

10- IMAGING_STUDIES DataFrame:
* Shape: (855, 10) with a mix of 'object', 'float64', and 'int64' data types.
* Missing values: None, suggesting complete data for all imaging studies.
* Key numerical features: BODYSITE_CODE (unique identifier for the body site where the imaging study was performed).

11- ENCOUNTERS DataFrame:

* Shape: (53346, 15) with a combination of 'object', 'float64', and 'int64' data types.
* Missing values: REASONCODE (39569), REASONDESCRIPTION (39569) - both columns indicating reasons for encounters.
* Key features include DATE, PATIENT, PROVIDER, PAYER, ENCOUNTERCLASS, and TYPE.

12- CONDITIONS DataFrame:

* Shape: (8376, 6) with mostly 'object' data types and some 'float64' and 'int64'.
* Missing values: STOP (3811) - indicating the end date for a condition.
* Key features include DATE, PATIENT, ENCOUNTER, CODE, and DESCRIPTION.

13- DEVICES DataFrame:

* Shape: (78, 7) with 'object', 'float64', and 'int64' data types.
* Missing values: STOP (78) - indicating the end date for a device.
* Key features include DATE, PATIENT, ENCOUNTER, CODE, and DESCRIPTION.

14- CAREPLANS DataFrame:

* Shape: (3483, 9) with a mix of 'object', 'float64', and 'int64' data types.
* Missing values: STOP (1532) - indicating the end date for a care plan, REASONCODE (327), REASONDESCRIPTION (327) - both columns indicating reasons for care plans.
* Key features include DATE, PATIENT, ENCOUNTER, CODE, DESCRIPTION, and TYPE.

15- ALLERGIES DataFrame:

* Shape: (597, 6) with mostly 'object' data types and some 'float64' and 'int64'.
* Missing values: STOP (533) - indicating the end date for an allergy.
* Key features include DATE, PATIENT, ENCOUNTER, CODE, and DESCRIPTION.

## Initial Machine Learning Model Ideas:

## 1- Clustering model or recommendation system for personalized treatments

## Only using these dataframes? Maybe? 

## Algorithms: 
* K-Means (Baseline)
* K-Means++ 
* Hierarchical Clustering (Maybe this one is the most appropriate for our project due to the size of the dataframes)

## Dataframes:
* Allergies
* Conditions 
* Encounters
* Immunizations
* Medications
* Observations
* Patients
* Providers

## 2- Predictive modeling for medication adherence

## Algorithms:
* Logistic Regression (Baseline)
* Decision Trees (Baseline) 
* Random Forests
* Neural Networks 
  * Specifically: RNN or LSTM (although a feedforward network is already suitable for a binary outcome like adherence or non-adherence)

## Dataframes:
* Conditions
* Encounters
* Medications
* Observations
* Patients
* Providers 


In [None]:
# Load each table into its own module-level DataFrame (alphabetical order;
# the reads are independent of one another)
allergies = pd.read_csv("allergies.csv")
careplans = pd.read_csv("careplans.csv")
conditions = pd.read_csv("conditions.csv")
devices = pd.read_csv("devices.csv")
encounters = pd.read_csv("encounters.csv")
imaging_studies = pd.read_csv("imaging_studies.csv")
immunizations = pd.read_csv("immunizations.csv")
medications = pd.read_csv("medications.csv")
observations = pd.read_csv("observations.csv")
organizations = pd.read_csv("organizations.csv")
patients = pd.read_csv("patients.csv")
payer_transitions = pd.read_csv("payer_transitions.csv")
payers = pd.read_csv("payers.csv")
procedures = pd.read_csv("procedures.csv")
providers = pd.read_csv("providers.csv")
supplies = pd.read_csv("supplies.csv")

## Predictive Modeling for Medication Adherence
### Attempt 1 using Logistic Regression 

In [None]:
# Tables considered for the medication-adherence model:
# Conditions, Encounters, Medications, Observations, Patients, Providers
datasets = ['conditions.csv', 'encounters.csv', 'medications.csv', 'observations.csv', 'patients.csv', 'providers.csv']

# Load each table and show its summary statistics, shape, head and null counts
for dataset in datasets:
    df = pd.read_csv(dataset)

    # Summary statistics as produced by DataFrame.describe()
    print(f"Statistics for {dataset}:")
    display(df.describe())

    # Shape. print() is used because display() of a str renders its repr
    # (with surrounding quotes).
    print(f"Shape for {dataset}: {df.shape}")

    # First rows
    print(f"Head for {dataset}:")
    display(df.head())

    # Missing values per column
    print(f"{dataset} null values")
    print(df.isnull().sum())

In [None]:
# Compact check of shape, null counts and dtypes for the candidate tables.
# Each output gets its own header; previously the single "null values" header
# was printed before the shape and the dtypes as well.
datasets = ['conditions.csv', 'encounters.csv', 'medications.csv', 'observations.csv', 'patients.csv']
for dataset in datasets:
    df = pd.read_csv(dataset)
    print(f"{dataset} shape")
    print(df.shape)
    print(f"{dataset} null values")
    print(df.isnull().sum())
    print(f"{dataset} data types")
    print(df.dtypes)

- Patient Names, Conditions

# Part 2 Significant Revision & Modeling
# Where to Start the Final Project (Without Privacy Preserving)

Following an in-depth analysis, we have concluded that the synthetic medical data is not suitable for our project due to the time constraints associated with cleaning the data. Instead, we have identified a dataset of medical costs and have decided to incorporate Social Security information from the Patients dataset. This will enable us to focus on Privacy-Preserving Machine Learning.

https://www.kaggle.com/datasets/mirichoi0218/insurance

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


In [None]:
def data_tester(df):
    """Show a quick profile of ``df`` and return its pieces.

    Displays the frame itself, its shape, the per-column count of missing
    values and the per-column dtypes, each followed by a blank line.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        tuple: ``(df, shape, missing_values, data_types)``.
    """
    display(df)
    print()

    shape = df.shape
    # Headings use print(): display() of a str shows its quoted repr.
    print(f'Shape: {shape}')
    print()

    print('Missing Values:')
    missing_values = df.isnull().sum()
    display(missing_values)
    print()

    print('Data Types:')
    data_types = df.dtypes
    display(data_types)
    print()

    return df, shape, missing_values, data_types

In [None]:
# Load the Synthea patients table and the Kaggle insurance table
patients, insurance = map(
    pd.read_csv,
    ('/content/patients.csv', '/content/insurance.csv'),
)

In [None]:
# Profile both tables. The trailing semicolon stops the notebook from echoing
# data_tester's return tuple, which would dump the whole frame a second time.
data_tester(patients)
print()
data_tester(insurance);

In [None]:
# Draw as many insurance rows as there are patients. The count is capped at
# the size of the insurance table because sampling without replacement cannot
# return more rows than exist.
n_rows = min(len(patients), len(insurance))
sub_insurance = insurance.sample(n=n_rows, random_state=42)

# Reset the index so the sampled rows are numbered 0..n_rows-1
sub_insurance = sub_insurance.reset_index(drop=True)

# Attach the SSNs positionally, so the pairing does not depend on the index
# labels of the patients table
sub_insurance['SSN'] = patients['SSN'].to_numpy()[:n_rows]

# Save the combined dataset
sub_insurance.to_csv("insurance_ssn.csv", index=False)

In [None]:
# Trailing semicolon: do not echo the returned tuple as cell output
data_tester(sub_insurance);

## Extra Steps Before Modeling (Not Necessary) 

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features in place with a freshly fitted scaler
numerical_columns = ['age', 'bmi', 'children']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(sub_insurance[numerical_columns])
sub_insurance[numerical_columns] = scaled_values


In [None]:
import seaborn as sns

# Correlation matrix over the numeric columns only. The frame also holds
# string columns (sex, smoker, region, SSN), and DataFrame.corr() raises on
# those in recent pandas versions.
corr_matrix = sub_insurance.select_dtypes(include='number').corr()

# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix Heatmap")
plt.show()


Because the pairwise correlations shown in the heatmap (between charges, age, bmi and children) are weak, multicollinearity is not a concern in this dataset.

In [None]:
# One boxplot per numerical column. The grid width follows the column list
# instead of a hard-coded 3; squeeze=False keeps `axes` an array even when
# there is a single column.
fig, axes = plt.subplots(1, len(numerical_columns), figsize=(15, 5), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_columns):
    sns.boxplot(x=sub_insurance[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")

plt.tight_layout()
plt.show()


In [None]:
# describe() summary for the three numeric feature columns
numerical_columns = ['age', 'bmi', 'children']
summary_stats = sub_insurance.loc[:, numerical_columns].describe()
print(summary_stats)


In [None]:
# Quartiles per column, taken from the describe() table
q1 = summary_stats.loc['25%']
q3 = summary_stats.loc['75%']
spread = q3 - q1

# One row per column: quartiles, IQR and the 1.5*IQR outlier fences
iqr_stats = pd.DataFrame({
    '25%': q1,
    '75%': q3,
    'IQR': spread,
    'lower_bound': q1 - 1.5 * spread,
    'upper_bound': q3 + 1.5 * spread,
})
print(iqr_stats)


In [None]:
from scipy.stats.mstats import winsorize

# Clip the lowest and highest 5% of 'age' to the nearest retained value
age_limits = [0.05, 0.05]
sub_insurance['age'] = winsorize(sub_insurance['age'], limits=age_limits)

# Summary statistics of 'age' after winsorization
age_summary = sub_insurance['age'].describe()
print(age_summary)

In [None]:
# Quartiles of bmi in one call, then the IQR and the 1.5*IQR outlier fences
bmi_q1, bmi_q3 = np.percentile(sub_insurance['bmi'], [25, 75])
bmi_iqr = bmi_q3 - bmi_q1
bmi_lower_bound = bmi_q1 - 1.5 * bmi_iqr
bmi_upper_bound = bmi_q3 + 1.5 * bmi_iqr


In [None]:
# Winsorize 'children' at the 5% / 95% limits and store the result
# back into the column
children_limits = (0.05, 0.05)
winsorized_children = winsorize(sub_insurance['children'], limits=children_limits)
sub_insurance['children'] = winsorized_children


## Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# SSN is dropped before modelling; the categorical columns are
# one-hot encoded with the first level of each dropped.
insurance_features = sub_insurance.drop(columns='SSN')
df_encoded = pd.get_dummies(insurance_features, columns=['sex', 'smoker', 'region'], drop_first=True)

# Target (y) and features (X)
y = df_encoded['charges']
X = df_encoded.drop(columns='charges')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares and predict the held-out charges
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for metric_label, metric_value in (
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
    ("R-squared", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")



**Explanation:**
The model's R-squared value is 0.76, which suggests that it can explain about 76% of the variance in the insurance charges data. This indicates a relatively good fit, as the model is able to capture a significant portion of the relationship between the input features and the target variable.

However, the model's Root Mean Squared Error (RMSE) is 5,620.58. This means that a typical prediction is roughly $5,620.58 away from the actual insurance charge (RMSE weights large errors more heavily than a plain average of the errors would). Although the model is able to explain a substantial proportion of the variance in the data, there is still room for improvement, as the error in the predictions can be quite large in some cases.

In [None]:
!pip install lazypredict

In [None]:
import lazypredict
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split

# SSN is dropped before modelling; the categorical columns are
# one-hot encoded with the first level of each dropped.
insurance_features = sub_insurance.drop(columns='SSN')
df_encoded = pd.get_dummies(insurance_features, columns=['sex', 'smoker', 'region'], drop_first=True)

# Target / features and an 80/20 split with a fixed seed
y = df_encoded['charges']
X = df_encoded.drop(columns='charges')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and score the LazyRegressor model collection on the same split
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# Performance table, one row per model
display(models)


**Without the Extra Steps:**
GradientBoostingRegressor has an R-squared value of 0.86 and an RMSE of 4224.49, which indicates a stronger fit and smaller prediction errors compared to the LinearRegression model.
<br> <br>
**With the Extra Steps:**
GradientBoostingRegressor has an R-squared value of 0.87 and an RMSE of 4193.79

## Side Note: 
**(Without the Extra Steps)** This is without any tuning hyperparameters, or performing feature engineering. (Our Baseline Model)

**(With the Extra Steps)** Improved Model, but not Perfect!
<br> <br>
We can now start the Privacy Preserving Techniques.

# Part 3
# Testing Privacy Preserving Methods

## Where to Start the Final Project (With Privacy Preserving)

In [None]:
!pip install diffprivlib

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from diffprivlib.models import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from numpy.linalg import norm
import hashlib

In [None]:
# Load the insurance table with the SSN column attached.
# NOTE(review): the cell that creates this file writes "insurance_ssn.csv" with
# a relative path; this absolute path presumably assumes the working
# directory is /content — confirm.
df = pd.read_csv('/content/insurance_ssn.csv')

## Simulating External Hard Disk (Without Encryption)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def data_tester(df):
    """Show a quick profile of ``df`` and return its pieces.

    Displays the frame itself, its shape, the per-column count of missing
    values and the per-column dtypes, each followed by a blank line.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        tuple: ``(df, shape, missing_values, data_types)``.
    """
    display(df)
    print()

    shape = df.shape
    # Headings use print(): display() of a str shows its quoted repr.
    print(f'Shape: {shape}')
    print()

    print('Missing Values:')
    missing_values = df.isnull().sum()
    display(missing_values)
    print()

    print('Data Types:')
    data_types = df.dtypes
    display(data_types)
    print()

    return df, shape, missing_values, data_types

In [None]:
# Load the insurance table with the SSN column attached.
# NOTE(review): the cell that creates this file writes "insurance_ssn.csv" with
# a relative path; this absolute path presumably assumes the working
# directory is /content — confirm.
df = pd.read_csv('/content/insurance_ssn.csv')

In [None]:
# Trailing semicolon: do not echo the returned tuple as cell output
data_tester(df);

In [None]:
# Simulated external drive: each access key points at the file it unlocks
external_storage = {
    'admin_key': '/content/insurance_ssn.csv',
    'insurance_key': '/content/insurance_ssn.csv',
    'doctor_key': '/content/insurance_ssn.csv',
}


def read_external_storage(key):
    """Load the dataset behind ``key`` and return the view that role may see.

    * ``admin_key``: the full DataFrame.
    * ``insurance_key``: only the ``SSN`` and ``charges`` columns.
    * ``doctor_key``: every column except ``SSN`` and ``charges``.

    Raises:
        ValueError: if ``key`` is not a recognised access key.
    """
    if key not in external_storage:
        raise ValueError(f"Invalid key: {key}")

    full_table = pd.read_csv(external_storage[key])

    if key == 'admin_key':
        return full_table
    if key == 'insurance_key':
        return full_table[['SSN', 'charges']]
    if key == 'doctor_key':
        return full_table.drop(['SSN', 'charges'], axis=1)

    # The key is registered in external_storage but has no role mapping
    raise ValueError(f"Invalid key: {key}")

In [None]:
# Testing the admin key: read_external_storage returns the full DataFrame
# (no columns removed) for 'admin_key'
admin = read_external_storage('admin_key')
display(admin)

In [None]:
# Testing the insurance key: only the SSN and charges columns should come back.
# Display the restricted view that was just read, not the full `df`.
insurance = read_external_storage('insurance_key')
display(insurance)

In [None]:
# Testing the doctor key: read_external_storage drops the SSN and charges
# columns and returns everything else for 'doctor_key'
doctor = read_external_storage('doctor_key')
display(doctor)

In [None]:
# Testing a key that is not registered in external_storage:
# read_external_storage raises ValueError("Invalid key: security_key"),
# so the display call below is never reached
security = read_external_storage('security_key')
display(security)

## Simulating External Hard Disk (with Differential Privacy "Noise")

In [None]:
# Simulated external drive: all three access keys unlock the same CSV file
external_storage = dict.fromkeys(
    ('admin_key', 'insurance_key', 'doctor_key'),
    '/content/insurance_ssn.csv',
)

def add_noise(data, epsilon, sensitivity=1.0):
    """Return a copy of ``data`` with Laplace noise added to its numeric columns.

    Each value receives independent noise drawn from
    ``Laplace(0, sensitivity / epsilon)``. Text columns are first converted
    to integers; a column that cannot be converted (for example a formatted
    SSN) is copied through unchanged.

    Args:
        data: DataFrame to perturb. It is not modified.
        epsilon: Privacy budget; must be positive. Smaller values give
            more noise.
        sensitivity: Numerator of the noise scale. The default of 1.0
            reproduces the original ``1 / epsilon`` scale.

    Returns:
        A new DataFrame with the same shape and column order as ``data``.

    Raises:
        ValueError: if ``epsilon`` is not strictly positive.
    """
    if epsilon <= 0:
        raise ValueError(f"epsilon must be positive, got {epsilon}")
    scale = sensitivity / epsilon

    noisy_data = data.copy()
    for column in noisy_data.columns:
        values = noisy_data[column]
        is_text = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
        if is_text:
            try:
                values = values.astype(int)
            except (ValueError, TypeError):
                # Not a numeric column in disguise: leave it untouched
                continue

        # One vectorised draw per column instead of one draw per cell
        noisy_data[column] = values + np.random.laplace(0, scale, size=len(values))

    return noisy_data

def read_external_storage_with_noise(key, epsilon=0.1):
    """Load the dataset behind ``key`` and return that role's view of it.

    * ``admin_key``: the full DataFrame, returned as stored (no noise).
    * ``insurance_key``: the ``SSN`` and ``charges`` columns, passed
      through ``add_noise``.
    * ``doctor_key``: every column except ``SSN`` and ``charges``, passed
      through ``add_noise``.

    Args:
        key: Access key to look up in ``external_storage``.
        epsilon: Value forwarded to ``add_noise``.

    Raises:
        ValueError: if ``key`` is not a recognised access key.
    """
    if key not in external_storage:
        raise ValueError(f"Invalid key: {key}")

    full_table = pd.read_csv(external_storage[key])

    # Admin key has full access to the dataset
    if key == 'admin_key':
        return full_table

    # Insurance providers have access to SSN and medical charges only
    if key == 'insurance_key':
        return add_noise(full_table[['SSN', 'charges']], epsilon)

    # Doctors don't require access to SSN and medical charges, but need everything else
    if key == 'doctor_key':
        return add_noise(full_table.drop(['SSN', 'charges'], axis=1), epsilon)

    # The key is registered in external_storage but has no role mapping
    raise ValueError(f"Invalid key: {key}")


In [None]:
# Read the insurance view twice: as stored, and with Laplace noise (epsilon=0.0001)
insurance = read_external_storage('insurance_key')
insurance_with_noise = read_external_storage_with_noise('insurance_key', epsilon=0.0001)

# Place the two versions side by side under an extra column level that
# says which one each column belongs to
side_by_side = [insurance, insurance_with_noise]
column_labels = pd.MultiIndex.from_product([['Original', 'Noisy'], insurance.columns])
insurance_con = pd.concat(side_by_side, axis=1).set_axis(column_labels, axis=1)

display(insurance_con)

In [None]:
# Charges column of the original and of the noisy insurance view
original_charges = insurance['charges']
noisy_charges = insurance_with_noise['charges']

plt.figure(figsize=(12, 8))

# One scatter layer per series, plotted against the row position:
# (values, colour, marker, legend label)
series_styles = (
    (original_charges, 'blue', 'o', 'Original Charges'),
    (noisy_charges, 'red', 'x', 'Noisy Charges'),
)
for charges, colour, marker_style, legend_label in series_styles:
    plt.scatter(range(len(charges)), charges, alpha=0.5, color=colour, marker=marker_style, label=legend_label)

plt.xlabel('Data Point Index')
plt.ylabel('Charges')
plt.title('Comparison of Original and Noisy Charges')

plt.legend()
plt.show()


#### **Question:**
Should we add noise to the SSN? We might have to split the SSN into its three separate parts and then add noise to each part.

In [None]:
# Read the doctor view twice: as stored, and with Laplace noise (epsilon=0.01)
doctor = read_external_storage('doctor_key')
doctor_with_noise = read_external_storage_with_noise('doctor_key', epsilon=0.01)

# Place the two versions side by side under an extra column level that
# says which one each column belongs to
side_by_side = [doctor, doctor_with_noise]
column_labels = pd.MultiIndex.from_product([['Original', 'Noisy'], doctor.columns])
doctor_con = pd.concat(side_by_side, axis=1).set_axis(column_labels, axis=1)

display(doctor_con)

In [None]:
# Read the doctor view as stored and with noise, tagging every row with the
# version it came from
original_data = read_external_storage('doctor_key').assign(Dataset='Original')
noisy_data = read_external_storage_with_noise('doctor_key', epsilon=0.01).assign(Dataset='Noisy')

# Stack both versions into a single frame for plotting
combined_data = pd.concat([original_data, noisy_data])

# Scatter plot matrix, coloured by version
sns.pairplot(
    combined_data,
    hue='Dataset',
    diag_kind='hist',
    markers=['o', 's'],
    plot_kws={'alpha': 0.5},
)
plt.show()


## Simulating External Hard Disk (with Privacy Preserving ML "Encryption") Not Done

## Pseudonymization: Replacing the SSN with pseudonyms using a secure hash function

For example, to obtain the digest of the byte string b"Nobody inspects the spammish repetition":

>>>


```
import hashlib
m = hashlib.sha256()
m.update(b"Nobody inspects")
m.update(b" the spammish repetition")
m.digest()
b'\x03\x1e\xdd}Ae\x15\x93\xc5\xfe\\\x00o\xa5u+7\xfd\xdf\xf7\xbcN\x84:\xa6\xaf\x0c\x95\x0fK\x94\x06'
m.hexdigest()
'031edd7d41651593c5fe5c006fa5752b37fddff7bc4e843aa6af0c950f4b9406'
```
More condensed:
```
hashlib.sha256(b"Nobody inspects the spammish repetition").hexdigest()
'031edd7d41651593c5fe5c006fa5752b37fddff7bc4e843aa6af0c950f4b9406'
```

Reference Link: https://docs.python.org/3/library/hashlib.html

In [None]:
import hashlib

def pseudonymize_ssn(data, salt=""):
    """Replace every value in ``data['SSN']`` with its SHA-256 hex digest.

    The column is overwritten in place and the same DataFrame is returned,
    so callers holding a reference to ``data`` see the pseudonyms too.
    Missing SSNs are left missing. The whole column is mapped at once, so
    the result does not depend on the index labels of ``data``.

    Args:
        data: DataFrame with an ``SSN`` column.
        salt: Optional string prepended to each SSN before hashing. The
            default (empty) reproduces a plain SHA-256 of the SSN.

    Returns:
        ``data`` itself, with the ``SSN`` column pseudonymized.
    """
    # NOTE(review): SSNs have a small value space, so an unsalted digest can
    # be reversed by hashing every candidate; pass a secret `salt` when the
    # pseudonyms must resist that.
    def _digest(ssn):
        return hashlib.sha256(f"{salt}{ssn}".encode()).hexdigest()

    data['SSN'] = data['SSN'].map(_digest, na_action='ignore')
    return data

# pseudonymize_ssn writes the digests into the frame it is given and returns
# that same frame, so `df` is modified too and `data` refers to the same object
data = pseudonymize_ssn(df)

In [None]:
# Show the first 20 rows of the frame after the SSN column was pseudonymized
data.head(20)

## Applying Differential Privacy & Pseudonymization to a Linear Regression Model

In [None]:
!pip install diffprivlib

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from diffprivlib.models import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from numpy.linalg import norm

In [None]:
# Reload the insurance table with SSNs and profile it. The trailing semicolon
# stops the notebook from echoing data_tester's return tuple.
data = pd.read_csv('insurance_ssn.csv')
data_tester(data);

In [None]:
import hashlib

def pseudonymize_ssn(data, salt=""):
    """Replace every value in ``data['SSN']`` with its SHA-256 hex digest.

    The column is overwritten in place and the same DataFrame is returned,
    so callers holding a reference to ``data`` see the pseudonyms too.
    Missing SSNs are left missing. The whole column is mapped at once, so
    the result does not depend on the index labels of ``data``.

    Args:
        data: DataFrame with an ``SSN`` column.
        salt: Optional string prepended to each SSN before hashing. The
            default (empty) reproduces a plain SHA-256 of the SSN.

    Returns:
        ``data`` itself, with the ``SSN`` column pseudonymized.
    """
    # NOTE(review): SSNs have a small value space, so an unsalted digest can
    # be reversed by hashing every candidate; pass a secret `salt` when the
    # pseudonyms must resist that.
    def _digest(ssn):
        return hashlib.sha256(f"{salt}{ssn}".encode()).hexdigest()

    data['SSN'] = data['SSN'].map(_digest, na_action='ignore')
    return data
# Overwrite the SSN column of `data` with SHA-256 digests (done in place;
# the function returns the same frame)
data = pseudonymize_ssn(data)

In [None]:
# Show the first rows of `data` after the SSN column was pseudonymized
data.head()

In [None]:
# Build the feature matrix from `data` in this cell instead of reading a
# global `X`, which at this point is either left over from an earlier model
# or undefined after a restart. The steps mirror the modelling cell below:
# standardise the numeric features, one-hot encode the categorical ones,
# leave out the target and the SSN.
_numeric_features = ['age', 'bmi', 'children']
_features = data.drop(columns=['charges', 'SSN'])
_features[_numeric_features] = StandardScaler().fit_transform(_features[_numeric_features])
_features = pd.get_dummies(_features, columns=['sex', 'smoker', 'region'], drop_first=True).astype(float)

# L2 norm of each row of the feature matrix
l2_norms = norm(_features.to_numpy(), axis=1)

# Largest row norm
max_l2_norm = np.max(l2_norms)

print('Max L2 Norm:', max_l2_norm)


In [None]:
# Initializing the scaler
scaler = StandardScaler()

# Standardise the numerical columns (target included) in place
numerical_columns = ['age', 'bmi', 'children', 'charges']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# One-hot encode the categorical columns, dropping the first level of each
df_encoded = pd.get_dummies(data, columns=['sex', 'smoker', 'region'], drop_first=True)

# Features (X) and target (y). SSN is an identifier, not a predictor.
# The cast makes every feature column float, whatever dtype get_dummies used.
X = df_encoded.drop(['charges', 'SSN'], axis=1).astype(float)
y = df_encoded['charges']

# 80/20 train/test split with a fixed seed
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Largest row L2 norm of the training features, computed here instead of
# pasting a constant from an earlier run (which goes stale when the data or
# the preprocessing changes).
# NOTE(review): a bound derived from the data is itself not differentially
# private; for a real deployment choose it from domain knowledge.
data_norm = float(norm(train_X.to_numpy(), axis=1).max())

# Differentially private linear regression from diffprivlib.models
model = LinearRegression(epsilon=1.0, data_norm=data_norm)
model.fit(train_X, train_y)

# Predicting on the test set
pred_y = model.predict(test_X)

# Evaluating the model
mse = mean_squared_error(test_y, pred_y)
rmse = np.sqrt(mse)
r2 = r2_score(test_y, pred_y)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")


## Applying Federated Learning with Diagram


### Using a Feed Forward Neural Network Model
### A Centralized Model: 
that consists of a single coordinating organization, called the federation owner or orchestrator, and a set of participant organizations or data owners.

An Overview of Federated Learning: https://www.youtube.com/watch?v=1YbPmkChcbo <br>
Initial thoughts are:
- Perfect for TinyML
- Great for multiple clients with multiple devices
- Outdated 2019  
<br>

An Image Classification Tutorial:
https://github.com/tensorflow/federated/blob/main/docs/tutorials/federated_learning_for_image_classification.ipynb <br>
A link to understand this Module: tff.learning.algorithms.build_weighted_fed_avg
https://www.tensorflow.org/federated/api_docs/python/tff/learning
<br>
A link to showcase the building block in building your own federated learning simulation/ real scenario:
https://www.tensorflow.org/federated/tutorials/building_your_own_federated_learning_algorithm


In [None]:
import pandas as pd
# `plt` must be the pyplot module: binding it to the top-level matplotlib
# package would break every later plt.figure()/plt.show() call
import matplotlib.pyplot as plt

In [None]:
# Load the insurance table with the SSN column attached.
# NOTE(review): the cell that creates this file writes "insurance_ssn.csv" with
# a relative path; this absolute path presumably assumes the working
# directory is /content — confirm.
df = pd.read_csv('/content/insurance_ssn.csv')

In [None]:
# Trailing semicolon: do not echo the returned tuple as cell output
data_tester(df);

In [None]:
# Had to uninstall and re-install tensorflow_federated for it to work
!pip uninstall -y tensorflow_federated
!pip install tensorflow_federated

In [None]:
# Restarted the runtime and re-installed tensorflow_federated
!pip install tensorflow_federated

In [None]:
# Checking the version of Python
!python --version

In [None]:
import tensorflow as tf
import tensorflow_federated as tff

In [None]:
# Printing the version of the "tff" package
# Report which tensorflow_federated release was imported as `tff`
print(tff.__version__)

In [None]:
# NOTE: first federated-learning attempt, kept as a record of what failed.
# The code is intentionally left unchanged; the issues visible in this cell
# are marked with NOTE(review) comments where they occur.

# Encode the categorical columns as integers
df['sex'] = df['sex'].map({'female': 0, 'male': 1})
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})
df['region'] = df['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})
# Client id = last character of the SSN, converted to int
df['client_id'] = df['SSN'].apply(lambda x: int(x[-1]))

# One-hot encode the 'region' column (Categorical Data)
df = pd.get_dummies(df, columns=['region'])

# Partition the data into client datasets to simulate multiple devices 
client_datasets = []
for client_id in df['client_id'].unique():
    client_df = df[df['client_id'] == client_id]
    features = client_df.drop(columns=['charges', 'SSN', 'client_id']).values
    labels = client_df['charges'].values
    client_tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(1)
    client_datasets.append(client_tf_dataset)

# NOTE(review): this second loop discards the list built above. `client_ids`
# has not been assigned anywhere earlier in the visible notebook, the filter
# compares the SSN column (not client_id) with each id, and 'client_id' is
# not dropped from the features.
client_datasets = []
for client_id in client_ids:
    client_df = df[df['SSN'] == client_id]
    
    # Ensure that there is more than one record for each client
    if len(client_df) > 1:
        features = client_df.drop(columns=['charges', 'SSN']).values
        labels = client_df['charges'].values
        client_tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
        
        # Batch size; adjustable if needed
        client_tf_dataset = client_tf_dataset.batch(1) 
        client_datasets.append(client_tf_dataset)


# A keras model: one hidden ReLU layer with 10 units and a single linear output
# NOTE(review): the author's note after this cell says input_shape should be
# (9,) because get_dummies widened the feature matrix.
def create_keras_model():
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(10, input_shape=(6,), activation='relu'),  
        tf.keras.layers.Dense(1)
    ])

# Wrap the Keras model as a TFF learning model (MSE loss and MSE metric);
# the input spec is taken from the first client dataset
def model_fn():
    keras_model = create_keras_model()
    return tff.learning.models.from_keras_model(
        keras_model,
        input_spec=client_datasets[0].element_spec,
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.MeanSquaredError()]
    )

# Defining the federated averaging process (Averaging Client to Server):
# SGD with learning rate 0.02 on the clients and 1.0 on the server

federated_averaging = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0)
)

# Running the federated averaging process for 10 rounds, printing the metrics
# returned by each round
state = federated_averaging.initialize()
for _ in range(10):
    state, metrics = federated_averaging.next(state, client_datasets)
    print('metrics:', metrics)

The reason why the code did not work: `input_shape=(6,)` was used instead of `input_shape=(9,)`, which is needed because `pd.get_dummies` adds extra columns.

In [None]:
# Start from the raw CSV so this cell can be run (and re-run) on its own:
# the mappings below only work on the original string columns, and 'region'
# no longer exists once it has been one-hot encoded.
df = pd.read_csv('/content/insurance_ssn.csv')

# The model needs numeric inputs, so encode the categorical columns first
df['sex'] = df['sex'].map({'female': 0, 'male': 1})
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})
df['region'] = df['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})

# Create client IDs from the last digit of the SSN to keep the process simple
# (instead of splitting the SSN into its three parts)
df['client_id'] = df['SSN'].apply(lambda x: int(x[-1]))

# One-hot encode the 'region' column (Categorical Data)
df = pd.get_dummies(df, columns=['region'])

# The unique client IDs
client_ids = df['client_id'].unique()

# Partition the data into client datasets to simulate multiple devices
client_datasets = []
for client_id in client_ids:
    client_df = df[df['client_id'] == client_id]

    # Ensure that there is more than one record for each client
    if len(client_df) > 1:
        # Explicit float64 arrays: mixed numeric/boolean columns would
        # otherwise produce an object array that TensorFlow cannot convert
        features = client_df.drop(columns=['charges', 'SSN', 'client_id']).to_numpy(dtype='float64')
        labels = client_df['charges'].to_numpy(dtype='float64')
        client_tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))

        # Batch size; adjustable if needed
        client_tf_dataset = client_tf_dataset.batch(1)
        client_datasets.append(client_tf_dataset)


# A keras model
def create_keras_model(num_features=9):
    """Build the regression network: one hidden ReLU layer and a linear output.

    Args:
        num_features: Width of the input vector. The default of 9 is the
            width the earlier attempt was missing (it used 6) after
            pd.get_dummies expanded the 'region' column.

    Returns:
        An uncompiled ``tf.keras`` Sequential model with a 10-unit hidden
        layer and a single output unit.
    """
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(10, input_shape=(num_features,), activation='relu'),
        tf.keras.layers.Dense(1)
    ])

# Wrap the Keras model as a TFF learning model
def model_fn():
    """Build a fresh Keras model and wrap it for TFF.

    The input spec is taken from the first client dataset; mean squared
    error is used both as the loss and as the reported metric.
    """
    network = create_keras_model()
    element_spec = client_datasets[0].element_spec
    mse_loss = tf.keras.losses.MeanSquaredError()
    tracked_metrics = [tf.keras.metrics.MeanSquaredError()]
    return tff.learning.models.from_keras_model(
        network,
        input_spec=element_spec,
        loss=mse_loss,
        metrics=tracked_metrics,
    )

# Federated averaging: every client trains locally with SGD and the server
# applies the aggregated client update to the global model.
federated_averaging = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn,
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0),
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02),
)

# Start from the initial server state and run ten training rounds,
# printing the metrics reported by each round.
state = federated_averaging.initialize()
completed_rounds = 0
while completed_rounds < 10:
    state, metrics = federated_averaging.next(state, client_datasets)
    print('metrics:', metrics)
    completed_rounds += 1

### Applying the correct client format for a true federated simulation:

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
scaler = StandardScaler()

# Applying scaling to the numerical columns (the target 'charges' included)
numeric_columns = ['age', 'bmi', 'children', 'charges']
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# One-hot encoding categorical columns.
# dtype=float is required: pandas >= 2.0 emits bool dummy columns by default,
# which turns `.values` into an object array that TensorFlow cannot convert.
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], dtype=float)

# Dropping SSN column
df = df.drop(columns=['SSN'])

# Splitting data into three parts for three clients. The row positions are
# split instead of the frame itself because np.array_split on a DataFrame is
# deprecated; the resulting partitions are the same contiguous row ranges.
df_naif, df_han, df_tamara = (
    df.iloc[rows] for rows in np.array_split(np.arange(len(df)), 3)
)


def _to_client_dataset(client_df):
    """Turn one client's frame into a tf.data.Dataset of (features, charges), batch size 1."""
    features = client_df.drop(columns=['charges']).values
    labels = client_df['charges'].values
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(1)


# Converting to TensorFlow datasets for each client: https://www.tensorflow.org/guide/data
tf_dataset_naif = _to_client_dataset(df_naif)
tf_dataset_han = _to_client_dataset(df_han)
tf_dataset_tamara = _to_client_dataset(df_tamara)

# Creating a federated dataset from the individual client datasets: https://www.tensorflow.org/federated/tutorials/federated_learning_for_image_classification
federated_data = [tf_dataset_naif, tf_dataset_han, tf_dataset_tamara]

# Storing the datasets in a dictionary for easy access
client_datasets = {
    'Naif': tf_dataset_naif,
    'Han': tf_dataset_han,
    'Tamara': tf_dataset_tamara,
}

Added or Changed:
*   Feature scaling
*   Learning rate
*   Model Complexity
*   Loss Function

In [None]:
# A deeper Keras model than the earlier single-hidden-layer version
def create_keras_model():
    """Return a Sequential model: 11 inputs -> Dense(64, relu) -> Dense(32, relu) -> Dense(1)."""
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(11,)))
    model.add(tf.keras.layers.Dense(32, activation='relu'))
    model.add(tf.keras.layers.Dense(1))
    return model


# Training loss: mean absolute error (changed from mean squared error)
loss_fn = tf.keras.losses.MeanAbsoluteError()

# Element spec of one client dataset; it tells TFF the input types and shapes
sample_spec = tf_dataset_naif.element_spec


# Defining a TFF model
def model_fn():
    """Build a fresh Keras model and wrap it for TFF with MAE as loss and metric."""
    network = create_keras_model()
    tracked_metrics = [tf.keras.metrics.MeanAbsoluteError()]
    return tff.learning.models.from_keras_model(
        network,
        input_spec=sample_spec,
        loss=loss_fn,
        metrics=tracked_metrics,
    )
# Building the federated averaging process with
# tff.learning.algorithms.build_weighted_fed_avg. The client learning rate was
# lowered from 0.1 to 0.01 for better metrics (slower convergence).
fed_avg = tff.learning.algorithms.build_weighted_fed_avg(model_fn, client_optimizer_fn=lambda: tf.keras.optimizers.SGD(0.01))


# Initializing state
state = fed_avg.initialize()

# Defining total number of rounds
TOTAL_ROUNDS = 100

# Initializing the lists to collect metrics
losses = []
mae = []

# Looping for all rounds
for round_num in range(1, TOTAL_ROUNDS+1):
    state, metrics = fed_avg.next(state, federated_data)
    print('round {:2d}, metrics={}'.format(round_num, metrics))
    train_metrics = metrics['client_work']['train']
    losses.append(train_metrics['loss'])
    # Read the MAE metric itself rather than the loss entry. The key is the
    # default name of tf.keras.metrics.MeanAbsoluteError.
    mae.append(train_metrics['mean_absolute_error'])

# One x value per federated round, shared by both plots
rounds = range(1, TOTAL_ROUNDS+1)

# Plotting the loss over each round
plt.figure()
plt.plot(rounds, losses)
plt.title('Loss over epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')

# Plotting the mean absolute error over each round
plt.figure()
plt.plot(rounds, mae)  # Mean Squared Error couldn't work due to outliers
plt.title('Mean Absolute Error over epochs')
plt.xlabel('Epochs')
plt.ylabel('MAE')

plt.show()

### What worked:
Creation of a federated dataset: We have prepared our data in a federated manner, by splitting it across multiple clients and converting these splits into tf.data.Dataset instances.

Creation of a federated learning model: We defined a function to create a Keras model and then converted this into a TFF model using the tff.learning.models.from_keras_model function.

Running a federated learning algorithm: We used tff.learning.algorithms.build_weighted_fed_avg to create a federated learning algorithm, which we then run for multiple rounds of training.

### What could be improved:

Client Selection: The current implementation assumes that all clients are available for each round of training, which may be different in a realistic federated learning scenario. We should implement client selection strategies to select a subset of clients for each training round. 

Client Weighting: `build_weighted_fed_avg` weights each client's update by its number of training examples; because the three splits are almost the same size, all clients contribute nearly equally in the current code. In practice, we could also assign different weights to different clients based on factors like the quality of local data or the client's contribution to the global model's performance. 

Secure Aggregation: Secure aggregation is an essential aspect of federated learning which allows the server to aggregate model updates from clients without being able to inspect individual updates, thereby enhancing privacy. This needs to be covered in our current implementation. 

Differential Privacy: Differential Privacy is another important concept in Federated Learning that helps preserve the privacy of the client's data. We may include mechanisms for adding noise to model updates to ensure differential privacy, which was already done as a standalone. 

Handling Non-IID data: In a realistic federated learning scenario, data across different clients may be non-IID, i.e. not independent and identically distributed. This poses several challenges, and there are various strategies to handle such systems which we should consider. 

Communication Efficiency: Communication over the network can be a bottleneck in Federated Learning. Techniques to reduce the communication cost, such as model compression (for example with TensorFlow Lite), could be considered. 

Model Personalization: The current implementation aims for a global model that performs well on all clients. However, in some scenarios, allowing for local model adaptations for personalization could be beneficial.

In [None]:
       (1) Initialize
             |
             v
   +---------+---------+     +---------+---------+     +---------+---------+
   |    Client 1       |     |    Client 2       |     |    Client N       |
   | - Local dataset   |     | - Local dataset   |     | - Local dataset   |
   | - Copy of model   |     | - Copy of model   |     | - Copy of model   |
   +---------+---------+     +---------+---------+     +---------+---------+
             | (2) Broadcast  | (2) Broadcast  | (2) Broadcast
             | Model Params   | Model Params   | Model Params
             v                v                v
             | (3) Local      | (3) Local      | (3) Local
             | Computation    | Computation    | Computation
             |                |                |
+------------+-------+     +--+------------+   |  +---------+---------+
| Updated Model Params |     | Updated Model Params |  | Updated Model Params |
+------------+-------+     +--+------------+   |  +---------+---------+
             | (4) Aggregation | (4) Aggregation | (4) Aggregation
             +-----------------+-----------------+-----------------+
                                       |
                                       v
                              (5) Global Model Update
                                       |
                                       v
                                  (6) Iterate


"Data Scaling" represents the step of scaling the numerical features in the dataset.

"One-Hot Encoding" indicates the process of converting categorical features into a binary vector representation.

"Data Splitting" represents the division of data into different parts for each client in the federated learning setting.

"Federated Data" indicates the dataset used for each client in the federated learning process.

"Keras Model" represents the machine learning model built using the Keras library.

"Loss Function" represents the function used to compute the loss or error of the model.

"TFF Model" represents the TensorFlow Federated (TFF) model, which adapts the Keras model for federated learning.

"Federated Averaging" represents the federated averaging algorithm used to train the model collaboratively across clients.

"Training Loop" represents the iterative training process of the federated averaging algorithm.

"Losses" indicates the loss values during the training process.

"Mean Absolute Error" represents the mean absolute error metric used to evaluate the model's performance.

In [None]:
# Create nodes: (label, (x, y)) pairs laid out left to right
nodes = [
    ("Initialize", (0, 4)),
    ("Naif", (2, 2)),
    ("Han", (2, 4)),
    ("Tamara", (2, 6)),
    ("Broadcast\nModel Params", (4, 4)),
    ("Local\nComputation", (6, 2)),
    ("Local\nComputation", (6, 4)),
    ("Local\nComputation", (6, 6)),
    ("Updated Model\nParams", (8, 2)),
    ("Updated Model\nParams", (8, 4)),
    ("Updated Model\nParams", (8, 6)),
    ("Aggregation", (10, 4)),
    ("Global Model\nUpdate", (12, 4)),
    ("Iterate", (14, 4))
]

# Directed edges as (start position, end position) pairs
edges = [
    ((0, 4), (2, 2)), ((0, 4), (2, 4)), ((0, 4), (2, 6)),
    ((2, 2), (4, 4)), ((2, 4), (4, 4)), ((2, 6), (4, 4)),
    ((4, 4), (6, 2)), ((4, 4), (6, 4)), ((4, 4), (6, 6)),
    ((6, 2), (8, 2)), ((6, 4), (8, 4)), ((6, 6), (8, 6)),
    ((8, 2), (10, 4)), ((8, 4), (10, 4)), ((8, 6), (10, 4)),
    ((10, 4), (12, 4)), ((12, 4), (14, 4))
]


fig, ax = plt.subplots(figsize=(16, 6))
ax.set_xlim([-1, 15])
ax.set_ylim([-1, 7])

# Each node is a boxed text label centred on its position
for node, pos in nodes:
    ax.text(pos[0], pos[1], node, ha="center", va="center", fontsize=12, fontweight="bold",
            bbox=dict(boxstyle="round", facecolor="white"))

# annotate() draws the arrow from `xytext` to `xy`, so the edge's start must
# be `xytext` and its end `xy` for the arrowhead to point along the flow.
for start, end in edges:
    ax.annotate("", xy=end, xytext=start, arrowprops=dict(arrowstyle="->", linewidth=1, color="gray"))


ax.set_title("Federated Learning Process", fontsize=16, fontweight="bold")
ax.set_xticks([])
ax.set_yticks([])
ax.axis("off")
plt.show()

## Federated Learning Process

[Federated Learning](https://ai.googleblog.com/2017/04/federated-learning-collaborative.html) is a machine learning setting where multiple devices (clients) collaboratively learn a shared model while keeping all the training data on the original device, decoupling the ability to do machine learning from the need to store the data in the cloud. This has the advantage of privacy by design: The raw data is never exposed to the server, and sensitive information remains on the device. Here is the general flow of the Federated Learning process:

1. **Initialization**: The server initializes the global model parameters $w$.

2. **Broadcast**: The server sends these parameters $w$ to all participating devices (clients).

3. **Local Computation**: Each client computes an update to the model parameters based on its local data. Specifically, the client computes a gradient of the loss function with respect to the model parameters. Let $x_i$ represent the local data for client $i$ and $y_i$ represent the labels. The client computes a local update $\delta_i$ as follows:
$$\delta_i = -\eta \nabla_w L(w;x_i, y_i)$$
where $L$ is the loss function, $\eta$ is the learning rate, and $\nabla$ denotes the gradient.

4. **Send Model Updates**: Each client sends its computed update $\delta_i$ back to the server.

5. **Aggregation**: The server aggregates the updates from each client to compute an overall update. The simplest way to do this is to compute an average:
$$\Delta = \frac{1}{n}\sum_{i=1}^{n}\delta_i$$
where $n$ is the total number of clients (3 in our case).

6. **Global Model Update**: The server updates the global model parameters based on the aggregated update:
$$w = w + \Delta$$

7. **Iterate**: The process repeats from step 2 until convergence, i.e., until the change in the global model parameters is smaller than a specified threshold.

For more detailed information, you can refer to [Communication-Efficient Learning of Deep Networks from Decentralized Data](https://arxiv.org/abs/1602.05629) and [Advances and Open Problems in Federated Learning](https://arxiv.org/abs/1912.04977).


In [None]:
!apt-get install -y graphviz libgraphviz-dev
!pip install pygraphviz
!pip install pydot pydotplus

In [None]:
import networkx as nx

G = nx.DiGraph()

# Node labels contain LaTeX, so every label is a raw string: in a normal
# string '\f' (as in '\frac') is a form feed and '\D', '\d', '\e', '\s' are
# invalid escape sequences.
nodes = {
'start': r'Initialize $w_i = w_{global}$',
'local_comp': r'Local Computation $grad = \nabla w_i L(w_i, x_i, y_i)$',
'update': r'Update $\delta_i = -\eta * grad$',
'updated_model_params': r'Updated Model Params $w_i = w_i + \delta_i$',
'aggregation': r'Aggregation $\Delta = \frac{1}{n}\sum_{i=1}^{n}\delta_i$',
'global_update': r'Global Model Update $w_{global} = w_{global} + \Delta$',
'iterate': r'Iterate for epochs'
}

for node, label in nodes.items():
  G.add_node(node, label=label)

# The steps form a single cycle: the last step loops back to the first
edges = [
('start', 'local_comp'),
('local_comp', 'update'),
('update', 'updated_model_params'),
('updated_model_params', 'aggregation'),
('aggregation', 'global_update'),
('global_update', 'iterate'),
('iterate', 'start')
]
G.add_edges_from(edges)

edge_labels = {
('start', 'local_comp'): r'$1...n$',
('local_comp', 'update'): '',
('update', 'updated_model_params'): '',
('updated_model_params', 'aggregation'): r'$1...n$',
('aggregation', 'global_update'): '',
('global_update', 'iterate'): '',
('iterate', 'start'): r'$epoch < N$'
}

# Node area grows with label length so longer labels still fit in the circle
node_labels = nx.get_node_attributes(G, 'label')
node_sizes = [60000 + len(label) * 250 for label in node_labels.values()]

color_palette = ['#84C2F2', '#4F81BD', '#C0504D', '#9BBB59', '#8064A2', '#4BACC6', '#F79646']
colors = [color_palette[i % len(color_palette)] for i in range(len(G))]

plt.figure(figsize=(24, 20))
pos = nx.circular_layout(G) # Use circular layout for the graph
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=node_sizes, node_shape='o', edgecolors='k', linewidths=2.0)
nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=12, font_weight='bold', font_family='serif', verticalalignment='center')
# node_size must match the drawn nodes; otherwise the arrows are trimmed for
# the default node size and their heads end up hidden under the large circles.
nx.draw_networkx_edges(G, pos, node_size=node_sizes, connectionstyle='arc3,rad=0.1', edge_color='black', width=4, arrowstyle='-|>', alpha=0.9)
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=18, font_family='serif')
plt.axis('off')
plt.margins(0.1)
plt.tight_layout()
plt.show()

In [None]:
!pip install dot2tex

### Explanation
1.   Data Preparation: The script begins by manipulating the dataset. Numerical columns ('age', 'bmi', 'children', and 'charges') are standardized using sklearn's StandardScaler. This ensures that these features have a mean value of 0 and a standard deviation of 1, reducing the sensitivity of the model to varying scales. The script also converts categorical columns ('sex', 'smoker', and 'region') into binary vectors via one-hot encoding. An unneeded column, 'SSN', is subsequently discarded.

2.   Data Segregation: The processed data is then divided into three segments, each intended for a different participant ('Naif', 'Han', 'Tamara'). This aligns with the concept of federated learning, where each participant has a unique subset of the overall data.

3.   TensorFlow Dataset Creation: The script transforms each client's data into a TensorFlow dataset, which is then batched. The model's inputs are all columns excluding 'charges', while 'charges' is set as the target variable.

4.   Formation of Federated Dataset: All the individual client datasets are gathered into a list to form a federated dataset.

5.   Model Architecture: Using Keras, the script defines a neural network model with two dense layers (using ReLU activation) and a final output layer (without any activation function), typically used in regression problems.

6.   TFF Model Initialization: The Keras model is wrapped into a TFF model with the help of the tff.learning.models.from_keras_model function. The model uses mean absolute error (MAE) as both its loss function and its performance metric.

7.   Setting up Federated Learning Process: The script sets up the federated learning process by calling the tff.learning.algorithms.build_weighted_fed_avg function. This function establishes a federated averaging process, enabling the model to learn from the federated data. The client optimizer chosen for this process is the stochastic gradient descent (SGD) with a learning rate of 0.01.

8.   Model Training: The model training phase consists of a certain number of rounds. In each round, the script calls the 'next' method of the federated averaging process. This method executes one federated averaging step, which involves training the model on each client's data, transmitting the model updates to the server, and averaging these updates.

9.   Model Evaluation: After each round of training, the script logs the model's loss and MAE. These values are subsequently visualized to demonstrate how the model's performance evolves over the training rounds.

In [1]:
!pip install pydot pydotplus

Collecting pydotplus
  Downloading pydotplus-2.0.2.tar.gz (278 kB)
     ---------------------------------------- 0.0/278.7 kB ? eta -:--:--
     ---- -------------------------------- 30.7/278.7 kB 445.2 kB/s eta 0:00:01
     ---- -------------------------------- 30.7/278.7 kB 445.2 kB/s eta 0:00:01
     ---- -------------------------------- 30.7/278.7 kB 445.2 kB/s eta 0:00:01
     ---- -------------------------------- 30.7/278.7 kB 445.2 kB/s eta 0:00:01
     --------- --------------------------- 71.7/278.7 kB 302.7 kB/s eta 0:00:01
     -------------- --------------------- 112.6/278.7 kB 409.6 kB/s eta 0:00:01
     -------------- --------------------- 112.6/278.7 kB 409.6 kB/s eta 0:00:01
     --------------- -------------------- 122.9/278.7 kB 343.4 kB/s eta 0:00:01
     ------------------------- ---------- 194.6/278.7 kB 491.5 kB/s eta 0:00:01
     ------------------------------ ----- 235.5/278.7 kB 535.1 kB/s eta 0:00:01
     ------------------------------ ----- 235.5/278.7 kB 535

* !apt-get install -y graphviz libgraphviz-dev
* !pip install pygraphviz
* !pip install pydot pydotplus

In [None]:
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import textwrap

# Defining the nodes and their attributes
nodes = {
    "Data Scaling": {"color": "pink", "shape": "box", "size": 2000},
    "One-Hot Encoding": {"color": "pink", "shape": "box", "size": 2000},
    "Data Splitting": {"color": "pink", "shape": "box", "size": 2000},
    "Federated Data": {"color": "pink", "shape": "box", "size": 2000},
    "Keras Model": {"color": "lightblue", "shape": "ellipse", "size": 2000},
    "Loss Function": {"color": "lightblue", "shape": "ellipse", "size": 2000},
    "TFF Model": {"color": "lightblue", "shape": "ellipse", "size": 2000},
    "Federated Averaging": {"color": "lightgreen", "shape": "diamond", "size": 2000},
    "Training Loop": {"color": "lightgreen", "shape": "diamond", "size": 2000},
    "Losses": {"color": "orange", "shape": "hexagon", "size": 2000},
    "Mean Absolute Error": {"color": "orange", "shape": "hexagon", "size": 2000},
}

# Creating a new graph
graph = nx.DiGraph()

# Adding nodes to the graph
for node, attributes in nodes.items():
    graph.add_node(node, **attributes)

# Defining the edges and their attributes
edges = {
    ("Data Scaling", "One-Hot Encoding"): {"label": "Transforms"},
    ("One-Hot Encoding", "Data Splitting"): {"label": "Encodes"},
    ("Data Splitting", "Federated Data"): {"label": "Splits"},
    ("Federated Data", "TFF Model"): {"label": "Feeds"},
    ("Keras Model", "TFF Model"): {"label": "Defines"},
    ("Loss Function", "TFF Model"): {"label": "Determines"},
    ("Federated Averaging", "Training Loop"): {"label": "Averages"},
    ("TFF Model", "Training Loop"): {"label": "Trains"},
    ("Training Loop", "Losses"): {"label": "Produces"},
    ("Training Loop", "Mean Absolute Error"): {"label": "Calculates"},
}

# Adding edges to the graph
for edge, attributes in edges.items():
    graph.add_edge(*edge, **attributes)

# Fixed node coordinates. They are used for nodes, node labels, edges and edge
# labels alike, so all four layers line up and no Graphviz layout call is
# needed at draw time.
# NOTE(review): the values look like a saved Graphviz "dot" layout - confirm.
node_positions = {
    "Data Scaling": (59.5, 540.0),
    "One-Hot Encoding": (59.5, 453.0),
    "Data Splitting": (59.5, 366.0),
    "Federated Data": (59.5, 279.0),
    "Keras Model": (184.5, 279.0),
    "Loss Function": (320.5, 279.0),
    "TFF Model": (184.5, 192.0),
    "Federated Averaging": (373.5, 192.0),
    "Training Loop": (279.5, 105.0),
    "Losses": (201.5, 18.0),
    "Mean Absolute Error": (358.5, 18.0),
}

# Creating a new figure with the desired size
plt.figure(figsize=(20, 18))

# Increasing the size of the node circles
node_sizes = [attrs["size"] * 3 for _, attrs in graph.nodes(data=True)]

# Drawing the nodes with specified positions
nx.draw_networkx_nodes(
    graph,
    pos=node_positions,
    node_color=[attrs["color"] for _, attrs in graph.nodes(data=True)],
    node_size=node_sizes,
    alpha=0.9,
)

# Drawing the node labels (wrapped so they fit inside the circles)
labels = {node: textwrap.fill(node, width=10) for node in graph.nodes()}
nx.draw_networkx_labels(graph, node_positions, labels=labels, font_size=12)

# Drawing the edges at the same positions as the nodes; node_size lets
# networkx stop each arrow at the border of the enlarged node circles.
nx.draw_networkx_edges(
    graph,
    node_positions,
    node_size=node_sizes,
    arrows=True,
    arrowstyle="->",
    arrowsize=10,
)

# Drawing the edge labels
edge_labels = nx.get_edge_attributes(graph, "label")
nx.draw_networkx_edge_labels(graph, node_positions, edge_labels=edge_labels, font_size=12)
plt.title('Flowchart of the Federated Learning Architecture', fontsize=32)
plt.axis('off')
plt.tight_layout()
plt.show()

## EHR Architecture with Diagram

In [None]:
import pandas as pd
import numpy as np

In [None]:
def data_tester(df):
    """Display a DataFrame together with its shape, missing-value counts and dtypes.

    Returns the tuple ``(df, shape, missing_values, data_types)``.
    """
    shape = df.shape
    missing_values = df.isnull().sum()
    data_types = df.dtypes

    # Each section is displayed item by item and followed by a blank line
    sections = (
        (df,),
        (shape,),
        ('Missing Values:', missing_values),
        ('Data Types:', data_types),
    )
    for section in sections:
        for item in section:
            display(item)
        print()

    return df, shape, missing_values, data_types

In [None]:
df = pd.read_csv('/content/insurance_ssn.csv')
data_tester(df)

In [None]:
import uuid
import datetime
from pandas import DataFrame
import csv

# Read the insurance dataset from disk
df: DataFrame = pd.read_csv('/content/insurance_ssn.csv')

# Anonymize the data: every row's SSN is replaced by a fresh random UUID
df['SSN'] = [uuid.uuid4() for _ in df.index]

class Logger:
    """Append timestamped messages to a CSV file and echo them to stdout.

    The file is opened in append mode when the logger is created. A
    ``Timestamp,Message`` header row is written before the first message if
    the file was empty at that point. The logger can be used as a context
    manager, in which case the file is closed on exit.
    """

    def __init__(self, csv_filename="log.csv"):
        """Open ``csv_filename`` for appending and prepare the CSV writer."""
        self.csv_filename = csv_filename
        self.log_file = open(self.csv_filename, 'a', newline='')
        self.writer = csv.writer(self.log_file)
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs a header.
        self.is_first_entry = self.log_file.tell() == 0

    def log(self, message):
        """Print ``message`` with a timestamp and append it to the CSV file."""
        timestamp = datetime.datetime.now()
        print(f'[{timestamp}]: {message}')
        if self.is_first_entry:
            self.writer.writerow(["Timestamp", "Message"])
            self.is_first_entry = False
        self.writer.writerow([str(timestamp), message])
        # Flush after every entry so the row reaches the file even if close()
        # is never called on this logger.
        self.log_file.flush()

    def close(self):
        """Close the underlying log file."""
        self.log_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

class Role:
    """A named role together with the permissions attached to it."""

    def __init__(self, name, permissions):
        self.name, self.permissions = name, permissions

class User:
    """A user identified by name and holding a single role."""

    def __init__(self, name, role):
        self.name, self.role = name, role

class Record:
    """A data record with its own logger and a list of third-party permissions."""

    def __init__(self, data, logger):
        self.data, self.logger = data, logger
        # Starts empty; each record gets its own list
        self.third_party_permissions = list()

class Patient:
    """A patient with a name, an age and the logger used for their records."""

    def __init__(self, name, age, logger):
        self.name, self.age, self.logger = name, age, logger

class PatientRecord:
    """The data held for one patient; logs through the patient's logger."""

    def __init__(self, patient, data):
        self.patient, self.data = patient, data
        # Reuse the patient's logger for this record
        self.logger = patient.logger

    def add_data(self, new_data):
        """Merge ``new_data`` into the record and log what was added."""
        self.data.update(new_data)
        entry = f'New data added for patient {self.patient.name}: {new_data}'
        self.logger.log(entry)

class Doctor:
    """A doctor who receives notifications through a logger."""

    def __init__(self, name, logger):
        self.name, self.logger = name, logger

    def notify(self, message):
        """Log ``message`` addressed to this doctor."""
        entry = f"Doctor {self.name}, {message}"
        self.logger.log(entry)

class HealthRiskAssessment:
    """BMI-based risk check on a patient record."""

    def __init__(self, record):
        self.record = record

    def assess_bmi_and_update_record(self, new_bmi, doctor):
        """Store ``new_bmi`` on the record, then alert ``doctor`` if it is out of range."""
        bmi_entry = {'bmi': new_bmi}
        self.record.add_data(bmi_entry)
        self.alert_if_at_risk(doctor)

    def alert_if_at_risk(self, doctor):
        """Notify ``doctor`` when the recorded BMI is below 18.5 or above 25."""
        bmi = self.record.data.get('bmi')
        # Nothing to assess when no BMI has been recorded
        if bmi is None:
            return
        if not (bmi < 18.5 or bmi > 25):
            return
        doctor.notify(f"Patient {self.record.patient.name} is at risk due to BMI of {bmi}")

class Hospital:
    """A hospital that registers doctors and patients and logs every change."""

    def __init__(self, name, logger):
        self.name = name
        self.doctors = []
        self.patients = []
        self.logger = logger

    def add_doctor(self, doctor):
        """Register ``doctor`` (ignored if already registered) and log it."""
        if doctor not in self.doctors:
            self.doctors.append(doctor)
            self.logger.log(f'Doctor {doctor.name} has joined the hospital {self.name}.')

    def add_patient(self, patient):
        """Register ``patient`` (ignored if already registered) and log it."""
        if patient not in self.patients:
            self.patients.append(patient)
            self.logger.log(f'Patient {patient.name} has joined the hospital {self.name}.')

    def assign_doctor_to_patient(self, doctor, patient):
        """Create a PatientRecord linking ``patient`` to ``doctor``.

        Returns the new record when both belong to this hospital. Otherwise
        an error is logged and ``None`` is returned.
        """
        if doctor in self.doctors and patient in self.patients:
            patient_record = PatientRecord(patient, {'doctor': doctor.name})
            # The call and its argument must be one expression; when split
            # across two lines the message was built but never logged.
            self.logger.log(f'Doctor {doctor.name} has been assigned to patient {patient.name}.')
            return patient_record

        self.logger.log(f'Error: Doctor {doctor.name} and/or patient {patient.name} are not part of the hospital {self.name}.')
        return None

class Pharmacy:
    """A pharmacy that tracks medication stock and logs stock movements."""

    def __init__(self, name, logger=None):
        """Create an empty pharmacy.

        ``logger`` is optional; when omitted the pharmacy creates its own
        ``Logger()`` writing to the default log file, as before.
        """
        self.name = name
        self.medication_stock = {}
        self.logger = Logger() if logger is None else logger

    def add_medication(self, medication, quantity):
        """Add ``quantity`` units of ``medication`` to the stock and log it."""
        self.medication_stock[medication] = self.medication_stock.get(medication, 0) + quantity
        # Log every addition, including restocks of a medication that is
        # already in stock (previously only the first addition was logged).
        self.logger.log(f'{quantity} units of {medication} added to the pharmacy {self.name}.')

    def dispense_medication(self, medication, quantity):
        """Remove ``quantity`` units of ``medication`` if enough is in stock.

        Logs an error and leaves the stock unchanged when the medication is
        unknown or the stock is insufficient.
        """
        if medication in self.medication_stock and self.medication_stock[medication] >= quantity:
            self.medication_stock[medication] -= quantity
            self.logger.log(f'{quantity} units of {medication} dispensed from the pharmacy {self.name}.')
        else:
            self.logger.log(f'Error: Insufficient stock of {medication} in the pharmacy {self.name}.')

In [None]:
# Testing the Classes Part 1: 
# ------------------------------------------------------------------------------

# Initializing the Logger object before creating the rest of the data
logger = Logger('hospital_1_log.csv')

# try/finally guarantees the hospital log file is closed even if a step fails
try:
    # Creating a role with rules and responsibilities and a user
    admin_role = Role('admin', ['read', 'write', 'delete'])
    admin_user = User('Prof. Tamara', admin_role)

    # Printing user name and the role
    print(f'User: {admin_user.name}, Role: {admin_user.role.name}')

    # Creating a Doctor
    doctor_1 = Doctor('Dr. Han', logger)

    # Creating a hospital and adding the patient and the doctor to it
    hospital_1 = Hospital('UW Medical Center',logger)
    hospital_1.add_doctor(doctor_1)

    # Creating a Patient and assigning the doctor to the patient
    # Pass the logger object explicitly
    patient_1 = Patient('Naif', 25, logger)

    hospital_1.add_patient(patient_1)
    patient_record_1 = hospital_1.assign_doctor_to_patient(doctor_1, patient_1)

    patient_record_1.add_data({'weight': 85, 'height': 1.85})

    # Creating a health risk assessment for the patient based on BMI
    hra_1 = HealthRiskAssessment(patient_record_1)

    # Calculating BMI (weight in kg / (height in m)^2) and updating patient's record
    bmi = patient_record_1.data.get('weight') / (patient_record_1.data.get('height') ** 2)
    hra_1.assess_bmi_and_update_record(bmi, doctor_1)

    # Work in progress with Pharmacy
    pharmacy_1 = Pharmacy('UW Medical Center Pharmacy')
    try:
        pharmacy_1.add_medication('Iburprofen', 500)
        pharmacy_1.dispense_medication('Aspirin', 50)
    finally:
        # The pharmacy opens its own log file; close it so its entries are
        # written out and the file handle is released.
        pharmacy_1.logger.close()
finally:
    # Closing the Logger
    logger.close()


In [None]:
# Testing the Classes Part 2: 
# ------------------------------------------------------------------------------

# Initializing the Logger object before creating the rest of the data
logger = Logger('hospital_2_log.csv')

# try/finally guarantees the hospital log file is closed even if a step fails
try:
    # Creating a role with rules and responsibilities and a user
    patient_role = Role('patient', ['read'])
    patient_user = User('Zack', patient_role)

    # Printing user name and the role
    print(f'User: {patient_user.name}, Role: {patient_user.role.name}')

    # Creating a Doctor
    doctor_2 = Doctor('Dr. Steve', logger)

    # Creating a hospital and adding the patient and the doctor to it
    hospital_2 = Hospital('Seattle Hospital', logger)
    hospital_2.add_doctor(doctor_2)

    # Creating a Patient and assigning the doctor to the patient
    # Pass the logger object explicitly
    patient_2 = Patient('Zack', 35, logger)

    hospital_2.add_patient(patient_2)
    patient_record_2 = hospital_2.assign_doctor_to_patient(doctor_2, patient_2)

    patient_record_2.add_data({'weight': 60, 'height': 1.55})

    # Creating a health risk assessment for the patient based on BMI
    hra_2 = HealthRiskAssessment(patient_record_2)

    # Calculating BMI (weight in kg / (height in m)^2) and updating patient's record
    bmi = patient_record_2.data.get('weight') / (patient_record_2.data.get('height') ** 2)
    hra_2.assess_bmi_and_update_record(bmi, doctor_2)

    # Work in progress with Pharmacy
    pharmacy_2 = Pharmacy('Seattle Pharmacy')
    try:
        pharmacy_2.add_medication('XYZ', 500)
        pharmacy_2.dispense_medication('XYZ', 50)
    finally:
        # The pharmacy opens its own log file; close it so its entries are
        # written out and the file handle is released.
        pharmacy_2.logger.close()
finally:
    # Closing the Logger
    logger.close()

In [None]:
# Load the two hospital logs written by the test cells above
hs_1_log, hs_2_log = map(
    pd.read_csv,
    ('/content/hospital_1_log.csv', '/content/hospital_2_log.csv'),
)

In [None]:
# Remove the column-width limit so full log messages are shown
pd.options.display.max_colwidth = None
display(hs_1_log)
print()
display(hs_2_log)

In [None]:
# log.csv is the default file name of Logger, which Pharmacy uses for its own logger
log = pd.read_csv('/content/log.csv')

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Build a directed graph of the EHR classes and their relationships, then
# draw it with matplotlib.
graph = nx.DiGraph()

# Adding one node per class (insertion order fixes the colour assignment
# and the position of each node in the shell layout below)
graph.add_nodes_from([
    "Logger",
    "Role",
    "User",
    "Record",
    "Patient",
    "PatientRecord",
    "Doctor",
    "HealthRiskAssessment",
    "Hospital",
    "Pharmacy",
])

# Adding edges as (source, target, label); the label is stored on the edge
# and drawn next to it later
relationships = [
    ("Logger", "Record", "Logs"),
    ("Logger", "Doctor", "Logs"),
    ("Logger", "Patient", "Logs"),
    ("Logger", "PatientRecord", "Logs"),
    ("Role", "User", "Manages"),
    ("User", "Logger", "Accesses"),
    ("Record", "Patient", "Belongs to"),
    ("PatientRecord", "Patient", "Includes"),
    ("PatientRecord", "Record", "Includes"),
    ("Doctor", "Logger", "Logs"),
    ("Doctor", "Hospital", "Works at"),
    ("Doctor", "HealthRiskAssessment", "Performs"),
    ("HealthRiskAssessment", "Record", "Generates"),
    ("Hospital", "Doctor", "Has doctors"),
    ("Hospital", "Patient", "Treats"),
    ("Hospital", "PatientRecord", "Has records"),
    ("Pharmacy", "Logger", "Logs"),
]
graph.add_edges_from(
    (source, target, {"label": label}) for source, target, label in relationships
)

node_attributes = {
    "shape": "box",
    "style": "filled",
    "fillcolor": "white",
}
# With name=None, set_node_attributes expects {node: {attr: value}} and
# silently skips keys that are not nodes, so passing this dict directly set
# nothing. Applying each attribute by name stores it on every node.
# These attributes are only stored on the nodes; the matplotlib drawing calls
# below take colours and sizes from their explicit arguments.
for attribute_name, attribute_value in node_attributes.items():
    nx.set_node_attributes(graph, attribute_value, name=attribute_name)

plt.figure(figsize=(24, 18))

# Setting the node colors: cycle through the palette in node order
node_colors = ["pink", "lightblue", "lightgreen", "orange", "yellow", "cyan", "violet", "lightgray", "salmon", "lightpink"]
node_color_values = [
    node_colors[index % len(node_colors)]
    for index in range(graph.number_of_nodes())
]

# Node size grows with the node's degree (in-edges plus out-edges)
node_sizes = [5000 * degree for _, degree in graph.degree()]

edge_colors = ["gray"] * graph.number_of_edges()
edge_styles = ["solid"] * graph.number_of_edges()

pos = nx.shell_layout(graph)

# Drawing the nodes
nx.draw_networkx_nodes(graph, pos, node_color=node_color_values, node_size=node_sizes, alpha=1.0)  # Set alpha to 1.0

# Drawing the labels
nx.draw_networkx_labels(graph, pos, font_size=14, font_weight="bold")

# Drawing the edges. node_size is passed so the arrows stop at the border of
# the enlarged nodes; without it the default node size is assumed and the
# arrowheads end up hidden underneath the nodes.
nx.draw_networkx_edges(
    graph,
    pos,
    edge_color=edge_colors,
    width=2.0,
    alpha=0.5,
    style=edge_styles,
    arrowsize=20,
    node_size=node_sizes,
)

# Drawing the edge labels
edge_labels = nx.get_edge_attributes(graph, "label")
nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels, font_size=12)

# Title
plt.title("EHR Architecture", fontsize=24)
plt.axis("off")
plt.show()