In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [67]:
def read_excel(file):
    """Load an Excel workbook into a DataFrame.

    Parameters
    ----------
    file : str or path-like or file-like
        Passed unchanged to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_excel`` returns for ``file`` with default options.
    """
    return pd.read_excel(file)

def read_files_and_coordinates(files_directory, coordinates_filepath):
    """Load every ``.xlsx`` workbook in a directory plus a coordinates workbook.

    Parameters
    ----------
    files_directory : str
        Directory scanned (non-recursively) for entries whose name ends in
        ``.xlsx``. A trailing path separator is optional.
    coordinates_filepath : str
        Path of a workbook whose second column holds comma-separated numeric
        strings, one row per entry.

    Returns
    -------
    dfs : list of pandas.DataFrame
        One frame per workbook, in the same order as ``filenames``.
    coordinates : list of list of float
        One list per row of the coordinates workbook, in sheet order.
    filenames : list of str
        Bare file names (no directory part), sorted alphabetically.

    Raises
    ------
    AttributeError, ValueError
        If a cell in the coordinates column is not a string, or one of its
        comma-separated parts cannot be converted with ``float``.
    """
    # os.listdir gives entries in arbitrary order; sorting makes the returned
    # order reproducible across runs and machines.
    # NOTE(review): nothing here matches coordinate rows to filenames -- if
    # callers pair them by position, confirm the sheet uses the same order.
    filenames = sorted(
        entry for entry in os.listdir(files_directory) if entry.endswith(".xlsx")
    )

    # os.path.join (rather than string concatenation) so a directory given
    # without a trailing separator still resolves to the right file.
    dfs = [read_excel(os.path.join(files_directory, name)) for name in filenames]

    coordinates_df = pd.read_excel(coordinates_filepath)

    # Second column of each row -> list of floats ("12.5, 77.1" -> [12.5, 77.1]).
    coordinates = [
        [float(part) for part in row[1].split(',')]
        for row in coordinates_df.itertuples(index=False, name=None)
    ]

    return dfs, coordinates, filenames


In [68]:
# files_directory = "../../Data/Processed data/"
# coordinates_filepath = "../../Data/coordinates.xlsx"
# dfs, coordinates, filenames = read_files_and_coordinates(files_directory, coordinates_filepath)
# print("Data read successfully")
# print("Number of files read: ", len(dfs))
# print("Number of coordinates read: ", len(coordinates))

In [69]:
def make_dfs_equal_length(dfs):
    """Pad every frame so that all frames cover the same set of dates.

    The union of all ``'Date'`` values is collected, values whose string form
    contains a space (i.e. carries a time part) are discarded, and each frame
    receives one extra row per remaining date it lacks. Added rows hold ``0``
    in every non-``'Date'`` column. Each frame is then sorted by ``'Date'`` and
    its ``'Affected'`` column is coerced to float, with non-numeric or missing
    entries becoming ``0.0``.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that each have a ``'Date'`` and an ``'Affected'`` column.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in; its entries are replaced by
        the padded, sorted frames. Row indexes are not reset after sorting.

    Raises
    ------
    KeyError
        If a frame lacks the ``'Date'`` or ``'Affected'`` column.
    """
    # Union of every date that appears in any frame.
    dates = set()
    for frame in dfs:
        dates.update(frame['Date'])
    print("Number of unique dates: ", len(dates))

    # Drop values that carry a time part. A new list is built because removing
    # from a list while iterating over it skips the element that follows each
    # removal, which let some timed values slip through.
    # NOTE(review): str() of a pandas Timestamp always contains a space, so
    # this filter presumably expects date-only strings/objects -- confirm.
    dates = sorted(date for date in dates if ' ' not in str(date))

    for i in range(len(dfs)):
        # Set lookup + a single concat replaces the per-date array scan and
        # per-date concat, which were quadratic in the number of dates.
        present = set(dfs[i]['Date'])
        missing = [date for date in dates if date not in present]
        if missing:
            # Zero-fill every non-Date column, whatever the column count/order.
            filler = pd.DataFrame(
                {column: (missing if column == 'Date' else 0) for column in dfs[i].columns}
            )
            dfs[i] = pd.concat([dfs[i], filler], ignore_index=True)
        print("Shape of dataframe ", i, " is: ", dfs[i].shape)
        # sort_values returns a new frame, so the caller's original frames
        # are never modified -- only the list entries are swapped.
        dfs[i] = dfs[i].sort_values(by='Date')

    # Coerce 'Affected' to float; anything unparseable or null becomes 0.
    for frame in dfs:
        frame['Affected'] = pd.to_numeric(frame['Affected'], errors='coerce').fillna(0).astype(float)

    return dfs
    

In [70]:
def write_to_excel(dfs, filenames, files_directory):
    """Write each frame to ``files_directory`` under its matching file name.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames to write; ``dfs[i]`` is saved as ``filenames[i]``.
    filenames : list of str
        Target file names, at least as many as there are frames.
    files_directory : str
        Destination directory. A trailing path separator is optional.

    Raises
    ------
    IndexError
        If ``filenames`` has fewer entries than ``dfs``.
    """
    for i, frame in enumerate(dfs):
        # os.path.join so a directory without a trailing separator does not
        # get glued onto the file name ("outa.xlsx" instead of "out/a.xlsx").
        frame.to_excel(os.path.join(files_directory, filenames[i]), index=False)

In [71]:
def data_preprocessing(data_read_directory, coordinates_filepath, data_write_directory):
    """Run the read -> equalise -> write pipeline and return its results.

    Parameters
    ----------
    data_read_directory : str
        Directory handed to ``read_files_and_coordinates`` as the input location.
    coordinates_filepath : str
        Coordinates workbook path, also handed to ``read_files_and_coordinates``.
    data_write_directory : str
        Directory handed to ``write_to_excel`` as the output location.

    Returns
    -------
    tuple
        ``(frames, coordinates, filenames)`` where ``frames`` is the value
        returned by ``make_dfs_equal_length`` and the other two items come
        straight from ``read_files_and_coordinates``.
    """
    loaded = read_files_and_coordinates(data_read_directory, coordinates_filepath)
    frames, coords, names = loaded

    # Progress report on what was loaded.
    print("Data read successfully")
    print("Number of files read: ", len(frames))
    print("Number of coordinates read: ", len(coords))

    # Equalise the frames, then persist them under their original file names.
    frames = make_dfs_equal_length(frames)
    write_to_excel(frames, names, data_write_directory)
    print("Data written successfully")

    return frames, coords, names

In [72]:
# Pipeline configuration (paths are relative to the notebook's working directory).
# Directory scanned for the input .xlsx workbooks.
data_read_directory = "../../Data/Processed data/"
# Workbook whose second column holds the comma-separated coordinates.
coordinates_filepath = "../../Data/coordinates.xlsx"
# Directory that receives the padded workbooks, saved under their original file names.
data_write_directory = "../../Data/Hospital Dataset/"

# Run the full read -> pad -> write pipeline; the results stay in notebook-global names.
dfs, coordinates, filenames = data_preprocessing(data_read_directory, coordinates_filepath, data_write_directory)

Data read successfully
Number of files read:  39
Number of coordinates read:  39
Number of unique dates:  720
Shape of dataframe  0  is:  (720, 2)
Shape of dataframe  1  is:  (720, 2)
Shape of dataframe  2  is:  (720, 2)
Shape of dataframe  3  is:  (720, 2)
Shape of dataframe  4  is:  (720, 2)
Shape of dataframe  5  is:  (720, 2)
Shape of dataframe  6  is:  (720, 2)
Shape of dataframe  7  is:  (720, 2)
Shape of dataframe  8  is:  (720, 2)
Shape of dataframe  9  is:  (720, 2)
Shape of dataframe  10  is:  (720, 2)
Shape of dataframe  11  is:  (720, 2)
Shape of dataframe  12  is:  (720, 2)
Shape of dataframe  13  is:  (720, 2)
Shape of dataframe  14  is:  (720, 2)
Shape of dataframe  15  is:  (720, 2)
Shape of dataframe  16  is:  (720, 2)
Shape of dataframe  17  is:  (720, 2)
Shape of dataframe  18  is:  (720, 2)
Shape of dataframe  19  is:  (720, 2)
Shape of dataframe  20  is:  (720, 2)
Shape of dataframe  21  is:  (720, 2)
Shape of dataframe  22  is:  (720, 2)
Shape of dataframe  23  is