In [None]:
# read the sheet names from openpyxl import load_workbook
from openpyxl import load_workbook
def get_sheet_names(file_path):
    workbook = load_workbook(filename=file_path, read_only=True)
    return workbook.sheetnames
file_path = 'malaria.xlsx'
sheet_names = get_sheet_names(file_path)
print("Sheet names in the workbook:" + str(sheet_names))
# Output the sheet names

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Specify the file and sheet to work with
file_path = 'malaria.xlsx'  # Make sure this file is in your current directory

df_new = pd.DataFrame()

for i in range(len(sheet_names)):
    print(f"Sheet {i}: {sheet_names[i]}")
    sheet_name = sheet_names[i]  # Use the first sheet name from the list

    print(f"Reading data from sheet: {sheet_name}")
    # Read Excel using openpyxl engine
    df_raw = pd.read_excel(file_path, sheet_name=sheet_name, header=None, engine='openpyxl')
    
    # get the years from the first row
    years = df_raw.iloc[0, 1:].tolist()

    # print("Years extracted from the first row:", years)

    df_raw 
    
    # invert the table
    df = df_raw.T

    # make the first row as the header
    df.columns = df.iloc[0]
    df = df[1:]
    # reset the index
    df.reset_index(drop=True, inplace=True)

    # rename the columns
    df.columns = ['Year', 'Month'] + list(df.columns[2:])
    # Display the first few rows of the DataFrame

    # copy the years to the first column for the months Jan to Dec
    df['Year'] = df['Year'].ffill()
    print("DataFrame after processing: \n")

    # fill nan with 0
    df.fillna(-1, inplace=True)
    # df.fillna(-1, inplace=True)
    df = df.infer_objects(copy=False)   
    
    df['District'] = sheet_name  # Add the sheet name as a new column

    df_new = pd.concat([df_new, df], ignore_index=True)

print("Final DataFrame after processing all sheets:")

In [None]:
df_new

In [None]:
df_new_ = df_new.copy()
# from year reomove last three characters
df_new_['Year'] = df_new_['Year'].str[:4]

# change the value in month 'octuber' to 'october'
df_new_['Month'] = df_new_['Month'].replace('octuber', 'october')

# change the value in month 'jan' to 'january'
df_new_['Month'] = df_new_['Month'].replace('jan ', 'january')
# change the value in month 'feb' to 'february'
df_new_['Month'] = df_new_['Month'].replace('feb ', 'february')
# change the value in month 'mar' to 'march'
df_new_['Month'] = df_new_['Month'].replace('nov', 'november')
# change the value in month 'apr' to 'april'
df_new_['Month'] = df_new_['Month'].replace('dec', 'december')

# Month and Year columns are now in the correct format
df_new_['Month'] = df_new_['Month'].str.lower()
df_new_['Month'] = df_new_['Month'].str.strip()
df_new_['Year'] = df_new_['Year'].astype(int)

#Capitalize the first letter of each month
df_new_['Month'] = df_new_['Month'].str.capitalize()


df_new_ = df_new_.copy()


In [100]:
# df_new_[(df_new_['Year'] == years[0]) & (df_new_['District'] == district)]
year = 2010  # Specify the year you want to filter
district = 'Uttarkashi'  # Specify the district you want to filter

print(f"Filtering data for Year: {year} and District: {district}")

df_new_ = df_new_.copy()  # Create a copy of the DataFrame for filtering

subset = df_new_[(df_new_['Year'] == year) & (df_new_['District'] == district)]


# Ensure months are ordered correctly and first letter is capitalized
months_order = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december'
]
# Make sure Month column is lowercase and stripped for matching
subset.loc[:, 'Month'] = subset['Month'].str.lower().str.strip()
subset = subset.set_index('Month').reindex(months_order).reset_index()
# Capitalize the first letter of each month
subset.loc[:, 'Month'] = subset['Month'].str.capitalize()
# Move 'Year' column to the first position
cols = subset.columns.tolist()
if 'Year' in cols:
    cols.insert(0, cols.pop(cols.index('Year')))
    subset = subset[cols]
# Display the subset DataFrame
print("Subset DataFrame for the specified year and district:")
subset

Filtering data for Year: 2010 and District: Uttarkashi
Subset DataFrame for the specified year and district:


Unnamed: 0,Year,Month,Total Blood Smears Examined for Malaria,Malaria (Microscopy Tests ) - Plasmodium Vivax test positive,Malaria (Microscopy Tests ) - Plasmodium Falciparum test positive,Inpatient - Malaria,Inpatient - Dengue,Inpatient - Typhoid,"Inpatient - Asthma, Chronic Obstructive Pulmonary Disease (COPD), Respiratory infections",Inpatient - Tuberculosis,Inpatient - Diarrhea with dehydration,Inpatient - Hepatitis,District
0,2010,January,392,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
1,2010,February,142,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
2,2010,March,127,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
3,2010,April,94,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
4,2010,May,255,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
5,2010,June,274,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
6,2010,July,426,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
7,2010,August,458,-1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
8,2010,September,670,1,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi
9,2010,October,326,2,-1,-1,-1,-1,-1,-1,-1,-1,Uttarkashi


In [None]:
#Save the data from all years and months for each district in different sheets
output_file = 'malaria_processed.xlsx'
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for district in df_new_['District'].unique():
        district_data = df_new_[df_new_['District'] == district].copy()
        months_order = [
            'january', 'february', 'march', 'april', 'may', 'june',
            'july', 'august', 'september', 'october', 'november', 'december'
        ]
        # For each year in the district, keep all months in order
        result = []
        for year in sorted(district_data['Year'].unique()):
            year_data = district_data[district_data['Year'] == year].copy()
            year_data.loc[:, 'Month'] = year_data['Month'].str.lower().str.strip()
            year_data = year_data.set_index('Month').reindex(months_order).reset_index()
            year_data.loc[:, 'Month'] = year_data['Month'].str.capitalize()
            # Move 'Year' column to the first position
            cols = year_data.columns.tolist()
            if 'Year' in cols:
                cols.insert(0, cols.pop(cols.index('Year')))
                year_data = year_data[cols]
            result.append(year_data)
        district_final = pd.concat(result, ignore_index=True)
        district_final.to_excel(writer, sheet_name=district, index=False)
print(f"Data saved to {output_file} with separate sheets for each district.")

# Save the final DataFrame to an Excel file (all districts together, same format)
df_final = []
for district in df_new_['District'].unique():
    district_data = df_new_[df_new_['District'] == district].copy()
    for year in sorted(district_data['Year'].unique()):
        year_data = district_data[district_data['Year'] == year].copy()
        year_data.loc[:, 'Month'] = year_data['Month'].str.lower().str.strip()
        year_data = year_data.set_index('Month').reindex(months_order).reset_index()
        year_data.loc[:, 'Month'] = year_data['Month'].str.capitalize()
        cols = year_data.columns.tolist()
        if 'Year' in cols:
            cols.insert(0, cols.pop(cols.index('Year')))
            year_data = year_data[cols]
        df_final.append(year_data)
df_final = pd.concat(df_final, ignore_index=True)
df_final.to_excel('malaria_final.xlsx', index=False)
print("Final DataFrame saved to 'malaria_final.xlsx'.")
print("Final DataFrame after processing all sheets:")
print(df_final.head())

IndexError: At least one sheet must be visible

In [None]:
import matplotlib.pyplot as plt

# Get unique years
years = df_new_['Year'].unique()

for year in years:
    plt.figure(figsize=(14, 6))
    for district in df_new_[df_new_['Year'] == year]['District'].unique():
        subset = df_new_[(df_new_['Year'] == year) & (df_new_['District'] == district)]
        # Ensure months are ordered correctly
        months_order = [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December'
        ]
        #subset['Month'] = subset['Month'].str.capitalize()
        subset = subset.set_index('Month').reindex(months_order).reset_index()
        subset['Month'] = subset['Month'].str.capitalize()
        plt.plot(subset['Month'], subset['Total Blood Smears Examined for Malaria'], label=district, 
                    marker='o', linestyle='-', markersize=5, linewidth=2, )
    plt.xlabel('Month')
    plt.ylabel('Total Blood Smears Examined for Malaria')
    plt.title(f'Malaria Blood Smears Examined in {year} by District')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    # make y-axis log scale
    plt.yscale('log')
    plt.grid()
    plt.tight_layout()
    #plt.savefig(f'figs/malaria_blood_smears_examined_{year}.png')
    #plt.show()

In [None]:
import matplotlib.pyplot as plt

# Get unique years
years = df_new_['Year'].unique()

for year in years:
    plt.figure(figsize=(14, 6))
    for district in df_new_[df_new_['Year'] == year]['District'].unique():
        subset = df_new_[(df_new_['Year'] == year) & (df_new_['District'] == district)]
        # Ensure months are ordered correctly
        months_order = [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December'
        ]
        subset = subset.set_index('Month').reindex(months_order).reset_index()
        subset['Month'] = subset['Month'].str.capitalize()
        # Plotting the data
        plt.plot(subset['Month'], subset['Malaria (Microscopy Tests ) - Plasmodium Vivax test positive'], label=district, 
                    marker='o', linestyle='-', markersize=5, linewidth=2, )
    plt.xlabel('Month')
    plt.ylabel('Malaria (Microscopy Tests ) - Plasmodium Vivax test positive')
    plt.title(f'Malaria (Microscopy Tests ) - Plasmodium Vivax test positive in {year} by District')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    # make y-axis log scale
    #plt.yscale('log')
    plt.grid()
    plt.tight_layout()
    #plt.savefig(f'figs/malaria_blood_smears_examined_{year}.png')
    #plt.show()