## pips and includes

In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
import os
from tqdm import tqdm

## Constants

In [6]:
DATA_DIRECTORY = "../data/"
ANALYSIS_DIRECTORY = "analysis/"

## Analysis

### missing data analysis 

In [3]:
def calculate_missing_percentage_and_earliest_year(csv_path):
    df = pd.read_csv(csv_path, na_values=['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN'], low_memory=False)
    columns_to_check = ['Rain', 'RH (%)', 'TD (degC)', 'TDmax (degC)', 'TDmin (degC)', 'WD (deg)', 'WDmax (deg)', 'WS (m/s)', 'Ws1mm (m/s)', 'Ws10mm (m/s)', 'WSmax (m/s)', 'STDwd (deg)']
    # Only check for columns that exist in the dataframe
    existing_columns = [col for col in columns_to_check if col in df.columns]
    missing_percentages = df[existing_columns].isnull().mean() * 100
    earliest_year = df['Year'].min() if 'Year' in df.columns else None
    return missing_percentages, earliest_year

def process_missing_data(data_directory):
    missing_data = {}
    earliest_years = {}
    csv_files = [f for f in os.listdir(data_directory) if f.endswith(".csv")]
    
    for filename in tqdm(csv_files, desc="Processing files", unit="file"):
        file_path = os.path.join(data_directory, filename)
        station_name = filename.replace('_data_2000_2023.csv', '')
        missing_percentages, earliest_year = calculate_missing_percentage_and_earliest_year(file_path)
        missing_data[station_name] = missing_percentages
        earliest_years[station_name] = earliest_year
    
    missing_data_df = pd.DataFrame(missing_data).transpose()
    missing_data_df.fillna(100, inplace=True)
    missing_data_df['Average Percentage'] = missing_data_df.mean(axis=1)
    
    missing_data_df['Earliest Year'] = pd.Series(earliest_years)
    missing_data_df = missing_data_df.sort_values(by='Average Percentage')
    missing_data_df = missing_data_df.round(1)
    
    return missing_data_df

missing_data_df = process_missing_data(DATA_DIRECTORY)

Processing files: 100%|██████████| 70/70 [03:08<00:00,  2.69s/file]


In [4]:
missing_data_df.to_csv(f'{ANALYSIS_DIRECTORY}data_analysis_after_imputation.csv', index=True)
missing_data_df

Unnamed: 0,Rain,RH (%),TD (degC),TDmax (degC),TDmin (degC),WD (deg),WDmax (deg),WS (m/s),WSmax (m/s),STDwd (deg),Average Percentage,Earliest Year
Dafna.csv,0.0,0.1,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.1,2018
Newe Yaar.csv,0.2,0.6,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,2005
Avdat.csv,0.0,0.2,0.3,0.3,0.3,0.0,0.0,0.0,0.0,0.0,0.1,2008
Mizpe Ramon.csv,0.0,0.9,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.1,2012
Maale Adummim.csv,0.0,0.3,0.1,0.1,0.1,0.2,0.3,0.2,0.3,0.2,0.2,2006
...,...,...,...,...,...,...,...,...,...,...,...,...
Paran.csv,8.7,9.2,9.3,9.3,9.3,8.8,8.8,8.8,8.8,8.8,9.0,2004
Haifa Technion.csv,29.9,12.3,0.4,0.4,0.4,25.5,25.6,25.5,25.8,25.6,17.1,2000
Deir Hanna.csv,1.1,2.0,1.9,1.9,1.9,35.9,35.9,36.2,36.6,35.9,18.9,2002
Ammiad.csv,8.0,1.3,1.3,1.6,1.6,45.2,45.2,45.3,45.3,45.2,24.0,2004


### calc the impact of the data imputation: 

In [7]:
def calculate_difference(before_file, after_file, output_file):
    before_df = pd.read_csv(before_file)
    after_df = pd.read_csv(after_file)
    
    # Ensure both dataframes have the same columns
    if not before_df.columns.equals(after_df.columns):
        raise ValueError("The columns of the before and after dataframes do not match.")
    
    # Calculate the difference
    difference_df = after_df.set_index(before_df.columns[0]) - before_df.set_index(before_df.columns[0])
    
    # Save the result to a new CSV file
    difference_df.to_csv(output_file)

before_file = f'{ANALYSIS_DIRECTORY}data_analysis_before_imputation.csv'
after_file = f'{ANALYSIS_DIRECTORY}data_analysis_after_imputation.csv'
output_file = f'{ANALYSIS_DIRECTORY}data_imputation_improvments.csv'

calculate_difference(before_file, after_file, output_file)
