## Installs and imports

In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
import os

## Constants

In [2]:
# Directory (relative path) that is scanned for the station CSV files
# analysed in the cells below.
DATA_DIRECTORY = "../data/"

## Analysis

### Missing data analysis

In [6]:
def calculate_missing_percentage_and_earliest_year(csv_path):
    """Compute per-column missing-value percentages and the earliest year of one CSV.

    Args:
        csv_path: Path of the CSV file to read. It must contain a
            'Date Time' column whose non-missing values follow the
            '%Y-%m-%d %H:%M:%S' format.

    Returns:
        A ``(missing_percentages, earliest_year)`` tuple:

        * ``missing_percentages`` -- ``pd.Series`` indexed by the checked
          column names, holding the percentage (0-100) of missing values.
          A checked column that does not exist in the file is reported as
          100 % missing.
        * ``earliest_year`` -- the smallest year found in 'Date Time' as an
          ``int``, or ``nan`` when the file has no usable timestamp.

    Raises:
        KeyError: If the file has no 'Date Time' column.
        ValueError: If a non-missing 'Date Time' value does not match the
            expected format.
    """
    df = pd.read_csv(csv_path, na_values=['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN'], low_memory=False)
    columns_to_check = ['Rain', 'RH (%)', 'TD (degC)', 'TDmax (degC)', 'TDmin (degC)',
                        'WD (deg)', 'WDmax (deg)', 'WS (m/s)', 'Ws1mm (m/s)',
                        'Ws10mm (m/s)', 'WSmax (m/s)', 'STDwd (deg)']

    # reindex (instead of df[columns_to_check]) turns a column that is absent
    # from the file into an all-NaN column, i.e. 100 % missing, rather than
    # aborting the whole analysis with a KeyError.
    missing_percentages = df.reindex(columns=columns_to_check).isnull().mean() * 100

    # Blank / placeholder timestamps are read as NaN (see na_values above);
    # they carry no year information, so skip them before parsing.
    timestamps = df['Date Time'].dropna()
    if timestamps.empty:
        return missing_percentages, float('nan')

    # Vectorised parse; still strict about the format, so malformed values
    # raise ValueError instead of being silently ignored.
    earliest_timestamp = pd.to_datetime(timestamps, format='%Y-%m-%d %H:%M:%S').min()
    return missing_percentages, earliest_timestamp.year

def process_missing_data(data_directory):
    """Build a per-station missing-data summary for every CSV in a directory.

    Each file ending in ".csv" is passed to
    ``calculate_missing_percentage_and_earliest_year``. The station name is
    the file name without the '_data_2000_2023.csv' suffix (or, for files
    that do not carry that suffix, without the '.csv' extension).

    Args:
        data_directory: Directory whose CSV files are analysed.

    Returns:
        ``pd.DataFrame`` indexed by station name with one column per checked
        measurement (percentage of missing values), plus
        'Average Percentage' (row mean of those percentages) and
        'Earliest Year'. Rows are sorted by 'Average Percentage' ascending
        and all values are rounded to one decimal place.
    """
    station_suffix = '_data_2000_2023.csv'
    missing_data = {}
    earliest_years = {}

    # os.listdir returns entries in arbitrary order; sorting the names makes
    # the result reproducible from run to run.
    for filename in sorted(os.listdir(data_directory)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(data_directory, filename)
        # Strip the known suffix only when it really is a suffix; otherwise
        # fall back to dropping the extension so the station name never
        # keeps a trailing ".csv".
        if filename.endswith(station_suffix):
            station_name = filename[:-len(station_suffix)]
        else:
            station_name = os.path.splitext(filename)[0]
        missing_percentages, earliest_year = calculate_missing_percentage_and_earliest_year(file_path)
        missing_data[station_name] = missing_percentages
        earliest_years[station_name] = earliest_year

    # One row per station, one column per checked measurement.
    missing_data_df = pd.DataFrame(missing_data).transpose()
    # A percentage that could not be computed is treated as fully missing.
    missing_data_df = missing_data_df.fillna(100)
    # Compute the average before 'Earliest Year' is added so that the year
    # does not leak into the mean.
    missing_data_df['Average Percentage'] = missing_data_df.mean(axis=1)

    missing_data_df['Earliest Year'] = pd.Series(earliest_years)
    # Stable sort: stations with equal averages keep their alphabetical order.
    missing_data_df = missing_data_df.sort_values(by='Average Percentage', kind='mergesort')
    missing_data_df = missing_data_df.round(1)

    return missing_data_df

# Run the analysis over every CSV file found in DATA_DIRECTORY.
missing_data_df = process_missing_data(DATA_DIRECTORY)


In [7]:
# Write the summary to a CSV file (relative path); index=True keeps the
# row labels in the file. The bare expression on the last line displays
# the table as the cell output.
missing_data_df.to_csv('missing_data_analysis.csv', index=True)
missing_data_df

Unnamed: 0,Date Time,RH (%),Rain,STDwd (deg),TD (degC),TDmax (degC),TDmin (degC),WD (deg),WDmax (deg),WS (m/s),WSmax (m/s),Ws10mm (m/s),Ws1mm (m/s),Average Percentage,Row Count
Dafna,0.0,0.1,0.0,0.0,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.1
Avdat,0.0,0.2,0.0,0.0,0.3,0.3,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.1,15.2
Mizpe Ramon,0.0,0.9,0.0,0.0,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,6.9
Kefar Blum,0.0,1.0,0.0,0.1,0.2,0.2,0.2,0.2,0.2,0.1,0.1,0.1,0.1,0.2,18.5
Eshhar,0.0,0.5,0.7,0.1,0.3,0.3,0.3,0.2,0.2,0.2,0.2,0.1,0.2,0.2,18.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Beit Jimal,0.0,6.7,1.5,100.0,0.2,0.2,0.2,100.0,100.0,100.0,100.0,100.0,100.0,54.5,24.0
Haifa Refineries,0.0,6.7,3.2,100.0,0.2,0.2,0.2,100.0,100.0,100.0,100.0,100.0,100.0,54.6,24.0
En Hahoresh,0.0,0.7,9.9,100.0,0.2,0.2,0.2,100.0,100.0,100.0,100.0,100.0,100.0,54.7,20.3
Qevuzat Yavne,0.0,9.9,0.9,100.0,0.4,0.4,0.4,100.0,100.0,100.0,100.0,100.0,100.0,54.8,23.9
