## pips and includes

In [2]:
import pandas as pd
import requests
import json
from datetime import datetime
import os

## Constants

In [3]:
DATA_DIRECTORY = "../data/"

## Analysis

### missing data analysis 

In [7]:
def calculate_missing_percentage_and_row_count(csv_path):
    df = pd.read_csv(csv_path, na_values=['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN'], low_memory=False)
    missing_percentages = df.isnull().mean() * 100
    row_count = len(df)
    return missing_percentages, row_count

def process_missing_data(data_directory):
    missing_data = {}
    row_counts = {}
    for filename in os.listdir(data_directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(data_directory, filename)
            station_name = filename.replace('_data_2000_2023.csv', '')
            missing_percentages, row_count = calculate_missing_percentage_and_row_count(file_path)
            missing_data[station_name] = missing_percentages
            row_counts[station_name] = row_count
    
    missing_data_df = pd.DataFrame(missing_data).transpose()
    missing_data_df.fillna(100, inplace=True)
    missing_data_df['Average Percentage'] = missing_data_df.mean(axis=1)
    
    columns_to_exclude = ['BP (hPa)', 'Date Time', 'DiffR (w/m^2)', 'Grad (w/m^2)', 'NIP']
    columns_to_include = [col for col in missing_data_df.columns if col not in columns_to_exclude]
    
    missing_data_df['Average Percentage Excluding Specific Columns'] = missing_data_df[columns_to_include].mean(axis=1)
    missing_data_df['Row Count'] = pd.Series(row_counts)
    missing_data_df['Row Count'] = missing_data_df['Row Count'] / (365 * 24 * 6)
    missing_data_df = missing_data_df.sort_values(by='Average Percentage Excluding Specific Columns')
    missing_data_df = missing_data_df.round(1)
    
    return missing_data_df

missing_data_df = process_missing_data(DATA_DIRECTORY)


In [8]:
missing_data_df.to_csv('missing_data_analysis.csv', index=True)
missing_data_df

Unnamed: 0,BP (hPa),Date Time,DiffR (w/m^2),Grad (w/m^2),NIP (w/m^2),RH (%),Rain,STDwd (deg),TD (degC),TDmax (degC),...,Time,WD (deg),WDmax (deg),WS (m/s),WSmax (m/s),Ws10mm (m/s),Ws1mm (m/s),Average Percentage,Average Percentage Excluding Specific Columns,Row Count
Ashalim,0.5,0.0,1.8,1.6,10.1,0.8,0.7,5.6,0.1,0.2,...,3.0,0.1,0.1,0.0,0.0,0.0,0.4,1.4,1.5,365933
Besor Farm,100.0,0.0,9.9,4.7,23.1,9.6,1.4,0.9,1.4,1.4,...,6.9,0.7,0.7,0.7,0.7,4.9,4.9,9.6,4.6,1252593
Dafna,100.0,0.0,100.0,100.0,100.0,0.1,0.0,0.0,0.1,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.3,8.2,267649
Avdat,100.0,0.0,100.0,100.0,100.0,0.2,0.0,0.0,0.3,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.3,8.2,801520
Mizpe Ramon,100.0,0.0,100.0,100.0,100.0,0.9,0.0,0.0,0.1,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.3,8.2,360090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Beit Jimal,100.0,0.0,100.0,100.0,100.0,6.7,1.5,100.0,0.2,0.2,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,67.2,65.1,1259781
Haifa Refineries,100.0,0.0,100.0,100.0,100.0,6.7,3.2,100.0,0.2,0.2,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,67.2,65.2,1261165
En Hahoresh,100.0,0.0,100.0,100.0,100.0,0.7,9.9,100.0,0.2,0.2,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,67.3,65.2,1068196
Qevuzat Yavne,100.0,0.0,100.0,100.0,100.0,9.9,0.9,100.0,0.4,0.4,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,67.3,65.3,1257232
