In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import glob

# Glob pattern (not a bare directory path) selecting the raw CSV files;
# it is handed straight to glob.glob by load_all_csvs.
RAW_DATA_DIR = "raw_data/*.csv"
# Number of leading metadata rows skipped (via skiprows) when reading each CSV.
METADATA_ROWS = 91

def load_and_preprocess_csv(file):
    """Read one CSV, skipping its leading metadata rows, and parse timestamps.

    The first ``METADATA_ROWS`` lines of ``file`` are skipped by
    ``pd.read_csv``. The ``ob_end_time`` column is then converted with
    ``pd.to_datetime``; values that cannot be parsed become ``NaT`` rather
    than raising.

    Returns the loaded DataFrame with ``ob_end_time`` as datetimes.
    """
    raw_frame = pd.read_csv(file, skiprows=METADATA_ROWS, parse_dates=False)

    # errors='coerce': unparseable timestamps turn into NaT instead of failing.
    parsed_end_times = pd.to_datetime(raw_frame['ob_end_time'], errors='coerce')
    raw_frame['ob_end_time'] = parsed_end_times

    return raw_frame

def filter_july_data_and_drop_irrelevant_columns(dataframe):
    """Keep only July observations and the three temperature-related columns.

    Rows whose ``ob_end_time`` falls in month 7 are selected, ordered by
    ``ob_end_time``, reduced to ``ob_end_time``, ``min_air_temp`` and
    ``max_air_temp``, and re-indexed from zero. The input frame is left
    unmodified; a new DataFrame is returned.
    """
    kept_columns = ['ob_end_time', 'min_air_temp', 'max_air_temp']
    is_july = dataframe['ob_end_time'].dt.month == 7

    july_rows_sorted = dataframe[is_july].sort_values(by='ob_end_time')
    return july_rows_sorted[kept_columns].reset_index(drop=True)

def add_column_for_average_air_temp(dataframe):
    """Return a copy of ``dataframe`` with an ``avg_air_temp`` column added.

    ``avg_air_temp`` is the midpoint of ``max_air_temp`` and ``min_air_temp``
    for each row.

    Args:
        dataframe: DataFrame containing ``max_air_temp`` and ``min_air_temp``
            columns.

    Returns:
        A new DataFrame with all original columns plus ``avg_air_temp``.
        The input frame is not modified, so re-running the calling cell (or
        inspecting the input afterwards) gives the same result every time.

    Raises:
        KeyError: If either temperature column is missing.
    """
    average_air_temp = (
        (dataframe['max_air_temp'] + dataframe['min_air_temp']) / 2
    )
    # .assign builds a new frame instead of writing into the caller's object.
    return dataframe.assign(avg_air_temp=average_air_temp)

def load_all_csvs(data_directory):
    """Load every file matching a glob pattern into one combined DataFrame.

    Args:
        data_directory: Glob pattern passed to ``glob.glob`` (for example
            ``"raw_data/*.csv"``). Despite the name, this is a pattern and
            not a bare directory path.

    Returns:
        A single DataFrame holding the rows of every matched file, each
        loaded through ``load_and_preprocess_csv``, concatenated in sorted
        path order with a fresh 0..n-1 index.

    Raises:
        ValueError: If the pattern matches no files.
    """
    # glob.glob gives no ordering guarantee; sorting keeps the concatenated
    # row order identical from run to run and machine to machine.
    files = sorted(glob.glob(data_directory))
    if not files:
        # pd.concat would otherwise fail with a bare "No objects to
        # concatenate" that hides which pattern was at fault.
        raise ValueError(f"No files matched pattern: {data_directory!r}")

    return pd.concat(
        [load_and_preprocess_csv(file) for file in files],
        ignore_index=True,
    )

# Load every raw CSV matched by RAW_DATA_DIR into one combined frame.
whole_dataset = load_all_csvs(RAW_DATA_DIR)
# Keep only July rows and the ob_end_time / min_air_temp / max_air_temp columns.
july_data_filtered = filter_july_data_and_drop_irrelevant_columns(whole_dataset)
# Add avg_air_temp, the midpoint of min_air_temp and max_air_temp.
july_data_with_averages = add_column_for_average_air_temp(july_data_filtered)
# Last expression of the cell, so the first rows are displayed as a sanity check.
july_data_with_averages.head()




Unnamed: 0,ob_end_time,min_air_temp,max_air_temp,avg_air_temp
0,1950-07-01 09:00:00,11.1,22.8,16.95
1,1950-07-02 09:00:00,11.7,22.8,17.25
2,1950-07-03 09:00:00,14.4,24.4,19.4
3,1950-07-04 09:00:00,12.8,16.7,14.75
4,1950-07-05 09:00:00,13.3,17.2,15.25


In [2]:


# all_july_data['max_air_temp'].hist(bins=50, figsize=(12,8))
# plt.show()

# july_averages = (all_july_data['max_air_temp'] + all_july_data['min_air_temp']) / 2






# csv_files = glob.glob("raw_data/*.csv")

# def load_file_and_drop_metadata_rows(csv_file):
#     return pd.read_csv(
#         csv_file,
#         skiprows=91, # 91 metadata rows
#         parse_dates=False
#     )

# def get_all_data_for_july(dataframe):
#     return dataframe[dataframe['ob_end_time'].dt.month == 7]


# dataframes = []
# for file in csv_files:
#     dataframe = load_file_and_drop_metadata_rows(file)
#     dataframes.append(dataframe)

# whole_dataset = pd.concat(dataframes)

# whole_dataset['ob_end_time'] = pd.to_datetime(
#     whole_dataset['ob_end_time'], errors='coerce'
# )
    
# july_data_filtered = (
#     get_all_data_for_july(whole_dataset)
#     .sort_values(by='ob_end_time')
#     .pipe(lambda df: df[['ob_end_time', 'min_air_temp', 'max_air_temp']])
#     .reset_index(drop=True)
# )

# july_data_filtered['avg_air_temp'] = (
#     (july_data_filtered['max_air_temp'] + july_data_filtered['min_air_temp']) / 2
# )
