## pips and includes:

In [2]:
import pandas as pd
import requests
import json
from datetime import datetime
import os

## constants:

In [5]:
BEGINING_OF_YEAR = "01010000"
ENDING_OF_YEAR = "12312350"
START_YEAR = 2000
END_YEAR = 2023

DATA_DIRECTORY = "../data/"

columns = [
    "Date Time", "BP (hPa)", "DiffR (w/m^2)", "Grad (w/m^2)", "NIP (w/m^2)", "RH (%)",
    "TD (degC)", "TDmax (degC)", "TDmin (degC)", "WD (deg)", "WDmax (deg)",
    "WS (m/s)", "Ws1mm (m/s)", "Ws10mm (m/s)", "WSmax (m/s)", "STDwd (deg)"
]

column_pairs = [
    ("date", "Date Time"),
    ("BP", "BP (hPa)"),
    ("DiffR", "DiffR (w/m^2)"),
    ("Grad", "Grad (w/m^2)"),
    ("NIP", "NIP (w/m^2)"),
    ("RH", "RH (%)"),
    ("TD", "TD (degC)"),
    ("TDmax", "TDmax (degC)"),
    ("TDmin", "TDmin (degC)"),
    ("WD", "WD (deg)"),
    ("WDmax", "WDmax (deg)"),
    ("WS", "WS (m/s)"),
    ("WS1mm", "Ws1mm (m/s)"),
    ("Ws10mm", "Ws10mm (m/s)"),
    ("WSmax", "WSmax (m/s)"),
    ("STDwd", "STDwd (deg)")
]


## utills functions:

In [6]:
def fetch_weather_data(station_id, start_date, end_date):
    url = f"https://ims.gov.il/he/envista_station_all_data_time_range/{station_id}/BP%26DiffR%26Grad%26NIP%26RH%26TD%26TDmax%26TDmin%26TW%26WD%26WDmax%26WS%26WS1mm%26Ws10mm%26Ws10maxEnd%26WSmax%26STDwd%26Rain/{start_date}/{end_date}/1/S"
    response = requests.get(url)
    data = json.loads(response.content)
    return data

def remove_unwanted_keys(data):
    # Remove 'sid', 'sname', and 'date_for_sort' from each record in data
    for record in data['data']['records']:
        if 'date_for_sort' in record:
            del record['date_for_sort']
        if 'sid' in record:
            del record['sid']
        if 'TW' in record:
            del record['TW']
        if 'sname' in record:
            del record['sname']

def replace_column_names(data):
    # Replace the names of the columns by the pairs in column_pairs
    for record in data['data']['records']:
        for new_name, old_name in column_pairs:
            if new_name in record:
                record[old_name] = record.pop(new_name)

def process_data(data):
    remove_unwanted_keys(data)
    replace_column_names(data)

def save_to_csv(data, filename):
    import csv

    # Extract the column names from the first record
    column_names = data['data']['records'][0].keys()

    # Open the file in write mode
    with open(DATA_DIRECTORY+filename, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=column_names)

        # Write the header
        writer.writeheader()

        # Write the data
        for record in data['data']['records']:
            writer.writerow(record)

def fetch_data_for_station(station_id, start_year, end_year):
    all_data = []

    for year in range(start_year, end_year + 1):
        today_fore0 = f"{year}" + BEGINING_OF_YEAR
        today_fore23 = f"{year}" + ENDING_OF_YEAR

        data = fetch_weather_data(station_id, today_fore0, today_fore23)
        process_data(data)
        
        # Convert the data to a DataFrame and append to the list
        df = pd.DataFrame(data['data']['records'])
        all_data.append(df)

    # Concatenate all DataFrames
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

def get_and_save_station_data(station_name, station_id, start_year, end_year):
    # Get all data for the station
    combined_df = fetch_data_for_station(station_id, start_year, end_year)
    
    # Convert the DataFrame back to the dictionary format expected by process_data
    data = {'data': {'records': combined_df.to_dict(orient='records')}}
    
    # Process the data
    process_data(data)
    
    # Save the data to a CSV file
    filename = f"{station_name}_data_{start_year}_{end_year}.csv"
    save_to_csv(data, filename)



### load all data

In [6]:
# Load stations_ids.json
with open('stations_ids.json', 'r', encoding='utf-8') as f:
    stations_ids = json.load(f)

# Iterate over all stations and use the function get_and_save_station_data
for station_name, station_id in stations_ids.items():
    filename = f"{station_name}_data_{START_YEAR}_{END_YEAR}.csv"
    if not os.path.exists(filename):
        get_and_save_station_data(station_name, station_id, START_YEAR, END_YEAR)


KeyboardInterrupt: 

### getting out the stations ids:

In [51]:
# Initialize the station map
station_map = {}

# Load stations.json
with open('stations.json', 'r', encoding='utf-8') as f:
    stations_data = json.load(f)

# Iterate through all categories and areas to populate station_map
for category, areas in stations_data['data']['area_stations'].items():
    for area, stations in areas.items():
        for station in stations.values():
            station_name = station.get('name')
            sid = station.get('envista_id')
            if station_name and sid:
                station_map[station_name] = sid

print(station_map)

with open('stations_ids.json', 'w', encoding='utf-8') as f:
    json.dump(station_map, f, ensure_ascii=False, indent=4)

{'Kefar Giladi': '241', 'Dafna': '499', 'Kefar Blum': '202', 'Merom Golan Picman': '10', 'Rosh Haniqra': '106', 'Elon': '73', 'Ayyelet Hashahar': '353', 'Shave Ziyyon': '343', 'Zefat Har Kenaan': '62', 'Harashim': '269', 'Ammiad': '123', 'Gamla': '227', 'Eshhar': '205', 'Kefar Nahum': '233', 'Bet Zayda': '6', 'Deir Hanna': '99', 'Afeq': '78', 'Avne Etan': '2', 'Haifa Refineries': '41', 'Haifa Technion': '43', 'Tiberias': '502', 'Haifa University': '42', 'Nazareth': '666', 'Newe Yaar': '186', 'Tavor Kadoorie': '13', 'Zemah': '8', 'Yavneel': '11', 'Massada': '355', 'En Karmel': '44', 'En Hashofet': '67', 'Afula Nir Haemeq': '16', 'Zikhron Yaaqov': '45', 'Tel Yosef': '380', 'Maale Gilboa': '224', 'Hadera Port': '46', 'Eden Farm': '206', 'Sede Eliyyahu': '366', 'En Hahoresh': '107', 'Qarne Shomeron': '20', 'Itamar': '90', 'Hakfar Hayarok': '275', 'Ariel': '21', 'Tel Aviv Coast': '178', 'Bet Dagan': '54', 'Gilgal': '30', 'Har Harasha': '24', 'Ashdod Port': '124', 'Nahshon': '259', 'Qevuzat 

### calc the missing 

In [25]:
def calculate_missing_percentage(csv_path):
    df = pd.read_csv(csv_path, na_values=['None', 'null', '-', '', ' ', 'NaN', 'nan', 'NAN'], low_memory=False)
    missing_percentages = df.isnull().mean() * 100
    return missing_percentages

missing_data = {}

for filename in os.listdir(DATA_DIRECTORY):
    if filename.endswith(".csv"):
        file_path = os.path.join(DATA_DIRECTORY, filename)
        missing_data[filename] = calculate_missing_percentage(file_path)

missing_data_df = pd.DataFrame(missing_data).transpose()
# Replace NaN values with 100%
missing_data_df.fillna(100, inplace=True)
# Add a column to missing_data_df with the average percentage for each station
missing_data_df['Average Percentage'] = missing_data_df.mean(axis=1)
# Calculate the average percentage for each row excluding specific columns
columns_to_exclude = ['BP (hPa)', 'Date Time', 'DiffR (w/m^2)', 'Grad (w/m^2)', 'NIP']
columns_to_include = [col for col in missing_data_df.columns if col not in columns_to_exclude]

# sort the dataframe by the average percentage excluding specific columns
missing_data_df = missing_data_df.sort_values(by='Average Percentage Excluding Specific Columns')

missing_data_df['Average Percentage Excluding Specific Columns'] = missing_data_df[columns_to_include].mean(axis=1)



missing_data_df = missing_data_df.round(1)


In [30]:
missing_data_df

Unnamed: 0,BP (hPa),Date Time,DiffR (w/m^2),Grad (w/m^2),NIP (w/m^2),RH (%),Rain,STDwd (deg),TD (degC),TDmax (degC),TDmin (degC),Time,WD (deg),WDmax (deg),WS (m/s),WSmax (m/s),Ws10mm (m/s),Ws1mm (m/s),Average Percentage,Average Percentage Excluding Specific Columns
Afeq_data_2000_2023.csv,37.8,0.0,100.0,100.0,100.0,11.0,4.3,12.1,1.9,1.9,1.9,13.1,12.2,12.2,11.6,11.6,20.9,11.6,25.8,16.8
Ammiad_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,1.3,8.0,45.2,1.3,1.6,1.6,45.1,45.2,45.2,45.3,45.3,45.2,45.2,43.1,34.6
Avne Etan_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,3.0,0.3,0.6,0.1,0.1,0.1,2.3,0.9,0.9,0.6,0.7,3.1,3.1,23.1,9.2
Ayyelet Hashahar_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,0.4,1.1,100.0,0.1,0.1,0.1,100.0,100.0,100.0,100.0,100.0,100.0,100.0,66.8,64.6
Bet Zayda_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,5.6,2.0,0.5,0.4,0.4,0.4,4.4,4.8,4.8,0.9,0.9,6.1,6.1,24.3,10.8
Dafna_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,0.1,0.0,0.0,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.3,8.2
Deir Hanna_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,2.0,1.1,35.9,1.9,1.9,1.9,35.9,35.9,35.9,36.2,36.6,36.6,36.6,38.8,29.1
Elon_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,10.1,7.9,0.4,0.4,0.4,0.4,0.3,0.4,0.5,0.4,0.4,0.4,0.4,23.5,9.7
Eshhar_data_2000_2023.csv,100.0,0.0,100.0,100.0,100.0,0.5,0.7,0.1,0.3,0.3,0.3,0.0,0.2,0.2,0.2,0.2,0.1,0.2,22.4,8.4
Gamla_data_2000_2023.csv,100.0,0.0,100.0,74.7,100.0,1.0,0.4,0.3,0.1,0.1,0.1,5.7,0.5,0.5,0.5,0.5,5.9,0.2,21.7,9.1


In [53]:
import json

# Read the stations.json file
with open('stations.json', 'r', encoding='utf-8') as f:
    stations_data = json.load(f)

# Count the number of stations
number_of_stations = len(stations_data["data"]["area_stations"]["auto"])

# Print the number of stations
print(f'The number of stations in stations.json is: {number_of_stations}')


FileNotFoundError: [Errno 2] No such file or directory: 'data code files/stations.json'