In [60]:
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [47]:
# Remove all empty files
folder_path = './raw_stations'
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path) and os.path.getsize(file_path) < 5: # Less than 5 bytes
        os.remove(file_path)
        print(f'Deleted: {file_path}')

Deleted: ./raw_stations/70:ee:50:12:f0:f6.csv
Deleted: ./raw_stations/70:ee:50:28:e1:6e.csv
Deleted: ./raw_stations/70:ee:50:7a:97:cc.csv
Deleted: ./raw_stations/70:ee:50:af:4f:f6.csv
Deleted: ./raw_stations/70:ee:50:a2:0b:54.csv
Deleted: ./raw_stations/70:ee:50:02:bf:4c.csv
Deleted: ./raw_stations/70:ee:50:90:dd:7a.csv
Deleted: ./raw_stations/70:ee:50:71:35:d2.csv
Deleted: ./raw_stations/70:ee:50:84:81:46.csv
Deleted: ./raw_stations/70:ee:50:af:44:c2.csv
Deleted: ./raw_stations/70:ee:50:b5:30:5a.csv
Deleted: ./raw_stations/70:ee:50:71:59:f2.csv
Deleted: ./raw_stations/70:ee:50:5e:da:62.csv
Deleted: ./raw_stations/70:ee:50:33:42:f4.csv


In [48]:
station_files = sorted(glob.glob('./raw_stations/*.csv'))
station_files

['./raw_stations/70:ee:50:01:60:78.csv',
 './raw_stations/70:ee:50:01:cf:12.csv',
 './raw_stations/70:ee:50:04:ac:ea.csv',
 './raw_stations/70:ee:50:04:b4:36.csv',
 './raw_stations/70:ee:50:04:d3:de.csv',
 './raw_stations/70:ee:50:05:61:46.csv',
 './raw_stations/70:ee:50:17:c7:ee.csv',
 './raw_stations/70:ee:50:17:d3:1a.csv',
 './raw_stations/70:ee:50:19:9f:dc.csv',
 './raw_stations/70:ee:50:1c:43:78.csv',
 './raw_stations/70:ee:50:20:d1:2c.csv',
 './raw_stations/70:ee:50:28:9c:ec.csv',
 './raw_stations/70:ee:50:28:b2:92.csv',
 './raw_stations/70:ee:50:28:e2:26.csv',
 './raw_stations/70:ee:50:2b:4a:4a.csv',
 './raw_stations/70:ee:50:2b:58:c2.csv',
 './raw_stations/70:ee:50:2f:25:74.csv',
 './raw_stations/70:ee:50:33:41:34.csv',
 './raw_stations/70:ee:50:33:41:7a.csv',
 './raw_stations/70:ee:50:36:cf:1c.csv',
 './raw_stations/70:ee:50:37:00:ca.csv',
 './raw_stations/70:ee:50:37:11:4c.csv',
 './raw_stations/70:ee:50:3b:e9:d4.csv',
 './raw_stations/70:ee:50:3b:f5:64.csv',
 './raw_stations

In [49]:
#for csv_file in station_files:
df = pd.read_csv(station_files[0], delimiter=',')
df.head()

Unnamed: 0,date,time,temp
0,2023-05-31,21:00:17,21.9
1,2023-05-31,21:09:40,21.8
2,2023-05-31,21:19:55,21.3
3,2023-05-31,21:30:11,21.2
4,2023-05-31,21:40:26,20.9


In [50]:
df['time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
df.head(10)

Unnamed: 0,date,time,temp
0,2023-05-31,2023-05-31 21:00:17,21.9
1,2023-05-31,2023-05-31 21:09:40,21.8
2,2023-05-31,2023-05-31 21:19:55,21.3
3,2023-05-31,2023-05-31 21:30:11,21.2
4,2023-05-31,2023-05-31 21:40:26,20.9
5,2023-05-31,2023-05-31 21:50:41,20.8
6,2023-05-31,2023-05-31 22:00:04,20.7
7,2023-05-31,2023-05-31 22:15:27,21.1
8,2023-05-31,2023-05-31 22:25:42,21.1
9,2023-05-31,2023-05-31 22:35:59,21.1


In [51]:
df['hour'] = df['time'].dt.hour
df.head(10)

Unnamed: 0,date,time,temp,hour
0,2023-05-31,2023-05-31 21:00:17,21.9,21
1,2023-05-31,2023-05-31 21:09:40,21.8,21
2,2023-05-31,2023-05-31 21:19:55,21.3,21
3,2023-05-31,2023-05-31 21:30:11,21.2,21
4,2023-05-31,2023-05-31 21:40:26,20.9,21
5,2023-05-31,2023-05-31 21:50:41,20.8,21
6,2023-05-31,2023-05-31 22:00:04,20.7,22
7,2023-05-31,2023-05-31 22:15:27,21.1,22
8,2023-05-31,2023-05-31 22:25:42,21.1,22
9,2023-05-31,2023-05-31 22:35:59,21.1,22


In [52]:
# For each hour, find the closest timestamp (either before or after)
def get_closest_hour(row, df):
    # Find all rows for the target hour and the next hour (or previous if needed)
    target_hour = row['hour']
    closest_row = df.iloc[(df['time'] - row['time']).abs().argmin()]
    return closest_row

# Apply the function to each row in the dataframe
closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))

# Convert the result back into a dataframe
result = closest_temps.reset_index(drop=True)[['date', 'time', 'temp']]
result.head(10)

  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Unnamed: 0,date,time,temp
0,2023-05-31,2023-05-31 21:00:17,21.9
1,2023-05-31,2023-05-31 22:00:04,20.7
2,2023-05-31,2023-05-31 23:05:52,21.1
3,2023-06-01,2023-06-01 00:01:24,20.6
4,2023-06-01,2023-06-01 01:02:04,19.9
5,2023-06-01,2023-06-01 02:07:51,19.5
6,2023-06-01,2023-06-01 03:08:31,19.0
7,2023-06-01,2023-06-01 04:03:12,19.1
8,2023-06-01,2023-06-01 05:05:34,18.7
9,2023-06-01,2023-06-01 06:01:06,18.1


In [53]:
result['time'] = result['time'].dt.hour
result.head()

Unnamed: 0,date,time,temp
0,2023-05-31,21,21.9
1,2023-05-31,22,20.7
2,2023-05-31,23,21.1
3,2023-06-01,0,20.6
4,2023-06-01,1,19.9


In [54]:
# Filter by date interval
start_date = '2023-06-01'
end_date = '2023-08-31'

result = result[(result['date'] >= start_date) & (result['date'] <= end_date)]
result.head()

Unnamed: 0,date,time,temp
3,2023-06-01,0,20.6
4,2023-06-01,1,19.9
5,2023-06-01,2,19.5
6,2023-06-01,3,19.0
7,2023-06-01,4,19.1


In [55]:
result.tail()

Unnamed: 0,date,time,temp
2206,2023-08-31,19,28.0
2207,2023-08-31,20,26.9
2208,2023-08-31,21,26.0
2209,2023-08-31,22,25.2
2210,2023-08-31,23,24.4


## Preprocess all stations

In [61]:
all_dataframes = []

In [63]:
# For each hour, find the closest timestamp (either before or after)
def get_closest_hour(row, df):
    # Find all rows for the target hour and the next hour (or previous if needed)
    target_hour = row['hour']
    closest_row = df.iloc[(df['time'] - row['time']).abs().argmin()]
    return closest_row

for csv_file in station_files:
    print(f'Processing {csv_file}')
    df = pd.read_csv(csv_file, delimiter=',')
    df['time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
    df['hour'] = df['time'].dt.hour
    
    # Apply the function to each row in the dataframe
    closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))

    # Convert the result back into a dataframe
    result = closest_temps.reset_index(drop=True)[['date', 'time', 'temp']]

    # Get just the time
    result['time'] = result['time'].dt.hour
    
    # Filter by date interval
    start_date = '2023-06-01'
    end_date = '2023-08-31'
    result = result[(result['date'] >= start_date) & (result['date'] <= end_date)]
    result['station'] = re.search(r"/([^/]+)\.csv$", csv_file).group(1)

    all_dataframes.append(result)

Processing ./raw_stations/70:ee:50:01:60:78.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:01:cf:12.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:04:ac:ea.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:04:b4:36.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:04:d3:de.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:05:61:46.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:17:c7:ee.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))
  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:17:d3:1a.csv
Processing ./raw_stations/70:ee:50:19:9f:dc.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:1c:43:78.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:20:d1:2c.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:28:9c:ec.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:28:b2:92.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:28:e2:26.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:2b:4a:4a.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:2b:58:c2.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:2f:25:74.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:33:41:34.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:33:41:7a.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:36:cf:1c.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:37:00:ca.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:37:11:4c.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3b:e9:d4.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3b:f5:64.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3c:ee:4c.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3d:23:92.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3d:26:c8.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3f:18:7e.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3f:63:68.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:3f:68:bc.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:52:d9:cc.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:53:34:be.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:58:98:7c.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:5e:df:26.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:5f:09:04.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:5f:56:10.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:64:ff:ee.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:65:6b:86.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:65:86:d2.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:65:89:f2.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:6b:2a:9a.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:71:15:80.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:71:22:16.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:73:c5:a8.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:73:ce:d0.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:74:0d:90.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:74:29:4c.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:74:50:56.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:7a:6f:c6.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:7a:74:da.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:7a:8d:66.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:7a:8f:60.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:7a:ae:b2.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:7a:d1:ce.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:90:e1:86.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


Processing ./raw_stations/70:ee:50:a2:07:2e.csv


  closest_temps = df.groupby(['date', 'hour'], group_keys=False).apply(lambda group: get_closest_hour(group.iloc[0], group))


In [65]:
all_dataframes[0]

Unnamed: 0,date,time,temp,station
3,2023-06-01,0,20.6,70:ee:50:01:60:78
4,2023-06-01,1,19.9,70:ee:50:01:60:78
5,2023-06-01,2,19.5,70:ee:50:01:60:78
6,2023-06-01,3,19.0,70:ee:50:01:60:78
7,2023-06-01,4,19.1,70:ee:50:01:60:78
...,...,...,...,...
2206,2023-08-31,19,28.0,70:ee:50:01:60:78
2207,2023-08-31,20,26.9,70:ee:50:01:60:78
2208,2023-08-31,21,26.0,70:ee:50:01:60:78
2209,2023-08-31,22,25.2,70:ee:50:01:60:78


In [66]:
temperatures = pd.concat(all_dataframes, ignore_index=True)
temperatures.head()

Unnamed: 0,date,time,temp,station
0,2023-06-01,0,20.6,70:ee:50:01:60:78
1,2023-06-01,1,19.9,70:ee:50:01:60:78
2,2023-06-01,2,19.5,70:ee:50:01:60:78
3,2023-06-01,3,19.0,70:ee:50:01:60:78
4,2023-06-01,4,19.1,70:ee:50:01:60:78


In [67]:
temperatures.tail()

Unnamed: 0,date,time,temp,station
113675,2023-08-31,19,28.0,70:ee:50:a2:07:2e
113676,2023-08-31,20,25.9,70:ee:50:a2:07:2e
113677,2023-08-31,21,24.3,70:ee:50:a2:07:2e
113678,2023-08-31,22,23.1,70:ee:50:a2:07:2e
113679,2023-08-31,23,22.3,70:ee:50:a2:07:2e


In [68]:
temperatures.to_csv('temperatures.csv', index=False)

## Compute temperature differences

In [69]:
# As reference, we'll use 70:ee:50:3b:f5:64 (Venta del Olivar)