In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
def read_excel(file):
    """Load one Excel workbook into a DataFrame.

    Parameters
    ----------
    file : str or path-like
        Location of the workbook; forwarded unchanged to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_excel`` returns for ``file`` with default options.
    """
    return pd.read_excel(file)

def read_files_and_coordinates(files_directory, coordinates_filepath):
    """Load every ``.xlsx`` workbook in a directory plus the coordinates workbook.

    Parameters
    ----------
    files_directory : str
        Directory scanned (non-recursively) for files whose name ends in
        ``.xlsx``. A trailing path separator is optional.
    coordinates_filepath : str
        Path of a workbook whose second column holds comma-separated numbers
        as text (e.g. ``"23.7, 90.4"``).

    Returns
    -------
    dfs : list of pandas.DataFrame
        One frame per workbook, in the order ``os.listdir`` yielded them.
    coordinates : list of list of float
        One list of parsed numbers per row of the coordinates workbook.
    filenames : list of str
        Bare file names, parallel to ``dfs``.

    Raises
    ------
    AttributeError
        If a cell in the coordinates column is not a string.
    ValueError
        If a comma-separated piece cannot be parsed as a float.
    """
    dfs = []
    filenames = []
    # NOTE(review): os.listdir returns entries in arbitrary, platform-dependent
    # order. If the rows of the coordinates workbook are meant to line up with
    # the files by position, that alignment depends on this order -- confirm.
    for file in os.listdir(files_directory):
        if not file.endswith(".xlsx"):
            continue
        # os.path.join (instead of string concatenation) keeps the path valid
        # whether or not files_directory ends with a separator.
        df = read_excel(os.path.join(files_directory, file))
        print("Reading file: ", file, " at index: ", len(dfs))
        dfs.append(df)
        filenames.append(file)

    coordinates_df = pd.read_excel(coordinates_filepath)

    # Parse the second column (position 1) of every row into a list of floats.
    # float() tolerates surrounding whitespace, so "23.7, 90.4" parses fine.
    coordinates = [
        [float(piece) for piece in cell.split(',')]
        for cell in coordinates_df.iloc[:, 1]
    ]

    return dfs, coordinates, filenames


In [3]:
# files_directory = "../../Data/Processed data/"
# coordinates_filepath = "../../Data/coordinates.xlsx"
# dfs, coordinates, filenames = read_files_and_coordinates(files_directory, coordinates_filepath)
# print("Data read successfully")
# print("Number of files read: ", len(dfs))
# print("Number of coordinates read: ", len(coordinates))

In [4]:
def make_dfs_equal_length(dfs):
    """Pad each frame with zero rows for the dates it is missing.

    The union of all ``'Date'`` values across ``dfs`` is collected, values
    whose string form contains a space are discarded from that union, and every
    frame then receives one extra row (``0`` in every non-date column) for each
    remaining date it does not already contain. Each frame is sorted by
    ``'Date'`` and its ``'Affected'`` column is coerced to float, with
    unparseable or missing entries replaced by ``0``.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each have a ``'Date'`` and an ``'Affected'`` column. The
        list is updated in place (each slot is replaced by a new frame); the
        original DataFrame objects themselves are not modified.

    Returns
    -------
    list of pandas.DataFrame
        The same list object as ``dfs``.

    Notes
    -----
    * Only the *union* is filtered: a row that already carries a space-bearing
      date stays in its own frame, and duplicate dates are not collapsed, so
      frames with such rows can still differ in length.
      NOTE(review): confirm whether those rows should be dropped as well.
    * NOTE(review): ``str(pandas.Timestamp)`` always contains a space, so a
      datetime64 ``'Date'`` column would have every date filtered out. This
      presumably expects dates stored as plain date strings/objects -- confirm.
    """
    # Union of every date seen in any frame.
    dates = set()
    for frame in dfs:
        dates.update(frame['Date'])
    print("Number of unique dates: ", len(dates))

    # Keep only dates without a time-of-day part. Building a new list avoids
    # removing from a list while iterating over it, which skips the element
    # that follows each removal and lets some unwanted dates through.
    dates = sorted(date for date in dates if ' ' not in str(date))

    for i, frame in enumerate(dfs):
        # One set lookup per date and a single concat per frame, instead of a
        # linear scan and a concat for every missing date.
        present = set(frame['Date'])
        missing = [date for date in dates if date not in present]
        if missing:
            # Zero-filled rows for whatever columns the frame has, not just a
            # fixed two-column [Date, value] layout.
            filler = pd.DataFrame(0, index=range(len(missing)), columns=frame.columns)
            filler['Date'] = missing
            frame = pd.concat([frame, filler], ignore_index=True)
        print("Shape of dataframe ", i, " is: ", frame.shape)

        frame = frame.sort_values(by='Date')
        # Non-numeric / null 'Affected' entries become 0.0.
        frame['Affected'] = pd.to_numeric(frame['Affected'], errors='coerce').fillna(0).astype(float)
        dfs[i] = frame

    return dfs
    

In [5]:
def write_to_excel(dfs, filenames, files_directory):
    """Write each frame to ``files_directory`` under its matching file name.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames to write.
    filenames : list of str
        Target file names, parallel to ``dfs`` (``filenames[i]`` is used for
        ``dfs[i]``).
    files_directory : str
        Destination directory. A trailing path separator is optional.

    Raises
    ------
    IndexError
        If ``filenames`` is shorter than ``dfs``.
    """
    for i, frame in enumerate(dfs):
        # os.path.join keeps the path valid with or without a trailing
        # separator on files_directory; plain concatenation would fuse the
        # directory name and file name together.
        target = os.path.join(files_directory, filenames[i])
        frame.to_excel(target, index=False)

In [6]:
def data_preprocessing(data_read_directory, coordinates_filepath, data_write_directory):
    """Run the load -> pad -> write pipeline and hand back the results.

    Parameters
    ----------
    data_read_directory : str
        Directory passed to ``read_files_and_coordinates`` as the source of
        the ``.xlsx`` workbooks.
    coordinates_filepath : str
        Path of the coordinates workbook, passed to
        ``read_files_and_coordinates``.
    data_write_directory : str
        Directory passed to ``write_to_excel`` as the destination for the
        processed frames.

    Returns
    -------
    tuple
        ``(dfs, coordinates, filenames)`` where ``dfs`` is the output of
        ``make_dfs_equal_length`` and the other two come straight from
        ``read_files_and_coordinates``.
    """
    loaded = read_files_and_coordinates(data_read_directory, coordinates_filepath)
    frames, coords, names = loaded

    print("Data read successfully")
    print("Number of files read: ", len(frames))
    print("Number of coordinates read: ", len(coords))

    # Pad the frames, then persist them under their original file names.
    frames = make_dfs_equal_length(frames)
    write_to_excel(frames, names, data_write_directory)
    print("Data written successfully")

    return frames, coords, names

In [7]:
# Input and output locations, relative to the notebook's working directory.
data_read_directory = "../../Data/Processed data/"
coordinates_filepath = "../../Data/coordinates.xlsx"
data_write_directory = "../../Data/Hospital Dataset/"

# Read the workbooks, pad them, write the results, and keep everything in
# notebook-level variables for the cells below.
dfs, coordinates, filenames = data_preprocessing(
    data_read_directory=data_read_directory,
    coordinates_filepath=coordinates_filepath,
    data_write_directory=data_write_directory,
)

Reading file:  250 shojja.xlsx  at index:  0
Reading file:  31 shojja.xlsx  at index:  1
Reading file:  ad-din.xlsx  at index:  2
Reading file:  aichi.xlsx  at index:  3
Reading file:  Al manar.xlsx  at index:  4
Reading file:  amz hospital.xlsx  at index:  5
Reading file:  Anowar khan.xlsx  at index:  6
Reading file:  Apollo.xlsx  at index:  7
Reading file:  azgor ali.xlsx  at index:  8
Reading file:  Bangladesh Medical College Hospital.xlsx  at index:  9
Reading file:  BD Specialized hospital.xlsx  at index:  10
Reading file:  BGB Hospital.xlsx  at index:  11
Reading file:  BIRDEM.xlsx  at index:  12
Reading file:  brb hospital.xlsx  at index:  13
Reading file:  BSSMU.xlsx  at index:  14
Reading file:  Comfort Nursing.xlsx  at index:  15
Reading file:  Dedicated Covid-19 Hospital.xlsx  at index:  16
Reading file:  Delta medical.xlsx  at index:  17
Reading file:  Dhaka central.xlsx  at index:  18
Reading file:  Dhaka healthcare.xlsx  at index:  19
Reading file:  Dhaka Mahanagar Medica

In [12]:
# Stack the 'Affected' column of every frame into one array of shape
# (timestep, hospitals, 1): axis 0 is the row position within a frame,
# axis 1 is the frame's index in `dfs`.
timestep = dfs[0].shape[0]
hospitals = len(dfs)

# Rows are aligned purely by position, so every frame must have exactly
# `timestep` rows. Check up front so a mismatch is reported with the offending
# frame's index rather than as a numpy broadcasting error.
# NOTE(review): this also presumes every frame lists the same dates in the
# same order -- confirm against the preprocessing step.
for i in range(hospitals):
    if dfs[i].shape[0] != timestep:
        raise ValueError(
            "dataframe {} has {} rows but dataframe 0 has {}; all dataframes "
            "must have the same length".format(i, dfs[i].shape[0], timestep)
        )

affected_array = np.zeros((timestep, hospitals, 1))

# Populate the numpy array with the 'Affected' values from each dataframe
for i in range(hospitals):
    affected_array[:, i, 0] = dfs[i]['Affected'].values

# Save the numpy array to a .npy file
np.save('../../Data/Hospitals_of_Dhaka.npy', affected_array)


In [13]:
# Split affected_array along axis 0 into consecutive train / val / test
# pieces (nominally 75% / 15% / 10%).
n_timesteps = affected_array.shape[0]
train_size = int(n_timesteps * 0.75)
val_size = int(n_timesteps * 0.15)
# Nominal size only: the test piece below is everything after train + val,
# so it can be longer than test_size because of the int() truncation above.
test_size = int(n_timesteps * 0.10)

train_data, val_data, test_data = np.split(
    affected_array, [train_size, train_size + val_size]
)

for split_path, split_data in (
    ('../../Data/Hospitals_of_Dhaka_train.npy', train_data),
    ('../../Data/Hospitals_of_Dhaka_val.npy', val_data),
    ('../../Data/Hospitals_of_Dhaka_test.npy', test_data),
):
    np.save(split_path, split_data)

In [14]:
# Export the 'Date' column of the first dataframe as a one-column workbook.
# The values are reshaped to (n, 1) so they form a single 'Date' column.
dates = np.array(dfs[0]['Date'].values).reshape((-1, 1))
dates_df = pd.DataFrame(dates, columns=['Date'])
dates_df.to_excel('../../dates.xlsx', index=False)