## Imports

In [3]:
import pandas as pd
import requests
import json
from datetime import datetime
import os
from tqdm import tqdm

## Constants

In [4]:
# Directory scanned for the per-station ``*.pkl`` files (passed to process_missing_data below).
DATA_DIRECTORY = "../data/"
# Directory for the summary CSVs this notebook writes and reads.
# The trailing slash matters: CSV paths are built by plain f-string concatenation.
ANALYSIS_DIRECTORY = "analysis/"

## Analysis

### Missing data analysis

In [7]:
def calculate_missing_percentage_and_earliest_year(pickle_path, columns_to_check=None, min_year=None):
    """Measure how much of each variable is missing in one pickled DataFrame.

    Args:
        pickle_path: Path of a pickle file that ``pd.read_pickle`` loads as a DataFrame.
        columns_to_check: Optional iterable of column names to inspect. When omitted,
            the notebook's default set of rain / humidity / temperature / wind / gust
            columns is used.
        min_year: Optional lower bound on the ``'Year'`` column. When given (and the
            frame has a ``'Year'`` column), rows with ``Year < min_year`` are dropped
            before anything is measured. ``None`` (the default) keeps every row.

    Returns:
        A ``(missing_percentages, earliest_year)`` tuple:
        ``missing_percentages`` is a Series indexed by the requested columns that are
        actually present in the frame, holding the percentage (0-100) of null values;
        ``earliest_year`` is the minimum of the ``'Year'`` column, or ``None`` when
        the frame has no such column.
    """
    if columns_to_check is None:
        columns_to_check = ['Rain', 'RH (%)', 'TD (degC)', 'TDmax (degC)', 'TDmin (degC)',
                            'Wind_x', 'Wind_y', 'Gust_x', 'Gust_y', 'STDwd (deg)']

    # NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
    df = pd.read_pickle(pickle_path)

    has_year = 'Year' in df.columns
    if min_year is not None and has_year:
        # Optional cutoff (e.g. min_year=2006) so older records do not skew the statistics.
        df = df[df['Year'] >= min_year]

    # Only check for columns that exist in the dataframe
    existing_columns = [col for col in columns_to_check if col in df.columns]
    # isnull() gives booleans; their mean is the null fraction, scaled here to a percentage.
    missing_percentages = df[existing_columns].isnull().mean() * 100
    earliest_year = df['Year'].min() if has_year else None
    return missing_percentages, earliest_year

def process_missing_data(pickle_directory):
    """Build a per-station summary of missing data for every pickle in a directory.

    Each ``*.pkl`` file in ``pickle_directory`` is treated as one station; the
    station name is the file name without its ``.pkl`` suffix.

    Args:
        pickle_directory: Directory containing the station pickle files.

    Returns:
        A DataFrame with one row per station, sorted ascending by
        ``'Average Percentage'`` and rounded to one decimal. Columns are the
        per-variable missing percentages, then ``'Average Percentage'`` (row mean
        of those percentages) and ``'Earliest Year'``. Any gap in the percentage
        table (e.g. a variable absent from a station's file) counts as 100% missing.
    """
    suffix = ".pkl"
    # os.listdir returns entries in arbitrary order; sort so runs are reproducible.
    pickle_files = sorted(f for f in os.listdir(pickle_directory) if f.endswith(suffix))

    missing_data = {}
    earliest_years = {}
    for filename in tqdm(pickle_files, desc="Processing files", unit="file"):
        file_path = os.path.join(pickle_directory, filename)
        # Strip only the trailing suffix; str.replace would also remove any
        # ".pkl" occurring in the middle of the name.
        station_name = filename[:-len(suffix)]
        missing_percentages, earliest_year = calculate_missing_percentage_and_earliest_year(file_path)
        missing_data[station_name] = missing_percentages
        earliest_years[station_name] = earliest_year

    # Dict of Series -> columns are stations; transpose so each station is a row.
    missing_data_df = pd.DataFrame(missing_data).transpose()
    # A variable a station does not report at all is treated as fully missing.
    missing_data_df = missing_data_df.fillna(100)
    # Computed before 'Earliest Year' is attached so the year is not averaged in.
    missing_data_df['Average Percentage'] = missing_data_df.mean(axis=1)

    missing_data_df['Earliest Year'] = pd.Series(earliest_years)
    # Stable sort: stations with equal averages keep their (sorted-by-name) order.
    missing_data_df = missing_data_df.sort_values(by='Average Percentage', kind='mergesort')
    missing_data_df = missing_data_df.round(1)

    return missing_data_df

# Summarise missing data for every station pickle found in DATA_DIRECTORY
# (the recorded run below processed 70 files in roughly 40 s).
missing_data_df = process_missing_data(DATA_DIRECTORY)

Processing files: 100%|██████████| 70/70 [00:40<00:00,  1.74file/s]


In [8]:
# Persist the summary; index=True keeps the station names as the first CSV column.
# ANALYSIS_DIRECTORY must already exist — to_csv does not create missing directories.
missing_data_df.to_csv(f'{ANALYSIS_DIRECTORY}data_analysis.csv', index=True)
# Bare trailing expression so the notebook renders the frame as the cell output.
missing_data_df

Unnamed: 0,Rain,RH (%),TD (degC),TDmax (degC),TDmin (degC),Wind_x,Wind_y,Gust_x,Gust_y,STDwd (deg),Average Percentage,Earliest Year
Dafna,0.0,0.1,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.1,2018
Bet Dagan,0.0,0.1,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,2006
Newe Yaar,0.2,0.6,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,2006
Avdat,0.0,0.2,0.3,0.3,0.3,0.0,0.0,0.0,0.0,0.0,0.1,2008
Mizpe Ramon,0.0,0.9,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.1,2012
...,...,...,...,...,...,...,...,...,...,...,...,...
Harashim,1.0,3.7,0.3,0.3,0.3,16.1,16.1,15.8,15.8,15.9,8.5,2008
Haifa Technion,6.6,2.3,0.3,0.3,0.3,33.8,33.8,34.3,34.3,33.9,18.0,2006
Deir Hanna,1.3,2.4,2.3,2.3,2.3,43.1,43.1,43.6,43.6,42.8,22.7,2006
Ammiad,1.3,1.4,1.4,1.8,1.8,49.5,49.5,49.5,49.5,49.5,25.5,2006


### Calculate the impact of the data imputation

In [5]:
def calculate_difference(before_file, after_file, output_file):
    """Write the element-wise change (after minus before) between two CSV tables.

    Both CSVs are loaded, their first column is used as the row key, and the
    "before" values are subtracted from the "after" values for rows sharing a
    key. The result is saved to ``output_file`` with the key as first column.

    Args:
        before_file: Path of the CSV holding the baseline values.
        after_file: Path of the CSV holding the updated values.
        output_file: Path the difference table is written to.

    Raises:
        ValueError: If the two CSVs do not have identical column labels.
    """
    baseline = pd.read_csv(before_file)
    updated = pd.read_csv(after_file)

    # Refuse to compare tables whose headers differ (names or order).
    headers_match = baseline.columns.equals(updated.columns)
    if not headers_match:
        raise ValueError("The columns of the before and after dataframes do not match.")

    # The first column identifies each row; index on it so the subtraction
    # pairs rows by key rather than by position.
    key_column = baseline.columns[0]
    baseline_by_key = baseline.set_index(key_column)
    updated_by_key = updated.set_index(key_column)

    # after - before, written straight to disk.
    updated_by_key.sub(baseline_by_key).to_csv(output_file)

# Input/output CSV paths for the before-vs-after imputation comparison.
# NOTE(review): nothing in the visible part of this notebook writes the two input
# files (the cell above saves `data_analysis.csv`) — presumably that file is
# copied/renamed by hand before and after imputation; confirm.
before_file = f'{ANALYSIS_DIRECTORY}data_analysis_before_imputation.csv'
after_file = f'{ANALYSIS_DIRECTORY}data_analysis_after_imputation.csv'
output_file = f'{ANALYSIS_DIRECTORY}data_imputation_improvments.csv'

# Call left disabled in the notebook; enable it once both input CSVs exist.
# calculate_difference(before_file, after_file, output_file)
