In [26]:
# Import the necessary libraries
from datetime import date
import numpy as np
import pandas as pd
import geopandas as gpd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt

#----------------Utils--------------------------
import pandas as pd
import plotly
import plotly.graph_objects as go
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

In [27]:
#Helper function for plotting DataFrames
def show_plot(df, x_col, y_col, name="Unnamed Plot", additional_traces=[]):
    if type(df) != list:
        df = [df]
        x_col = [x_col]
        y_col = [y_col]
        name = [name]
    
    fig = go.Figure()
    for i, single_df in enumerate(df):
        x = single_df[x_col[i]]
        y = single_df[y_col[i]]
        fig_name = name[i]
        fig.add_trace(go.Scatter(x=x, y=y, mode='lines+markers',name=fig_name))
    
    if len(additional_traces) > 0:
        for trace in additional_traces:
            fig.add_trace(trace)
    fig.show()

In [28]:
#function to import the csv file and do some preprocessing
def import_df(path):
    df = pd.read_csv(path, index_col = 0)
    df['date'] = pd.to_datetime(df['date'],  format="%Y-%m-%d")
    return df

In [29]:
# Temperature, wind direction, and wind speed
base_data_path = "../data/milano_meteo_data"
temperature_path = f"{base_data_path}/temperature.csv"
temperature_raw = import_df(temperature_path)


In [30]:
# Remove useless columns
useless_columns = ["type","unit","stationID","altitude","province"]
temperature_raw.drop(columns=useless_columns, inplace=True)

In [47]:
# take data from years 2015 to 2021
start_date = date(2020,1,1)
end_date = date(2020,12,31)
temperature = temperature_raw.loc[
    (temperature_raw['date'].dt.date >= start_date) &
    (temperature_raw['date'].dt.date <= end_date)
]

# Group dates daily
temperature['date'] = temperature['date'].dt.date

# Group by date and location
temperature = temperature.groupby(
    ['date','sensorID','lat','lng']
).mean().reset_index()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [49]:
temperature

Unnamed: 0,date,sensorID,lat,lng,value
0,2020-01-01,2001,45.496780,9.257515,2.895833
1,2020-01-01,2039,45.171919,9.488997,4.795833
2,2020-01-01,4001,45.281956,8.988563,2.420833
3,2020-01-01,4058,45.542665,9.205603,2.039583
4,2020-01-01,4066,45.548517,8.847322,0.945833
...,...,...,...,...,...
5425,2020-12-31,8162,45.491633,9.248738,0.229167
5426,2020-12-31,12757,45.607845,8.952897,-0.008333
5427,2020-12-31,12759,45.535577,9.005200,0.287500
5428,2020-12-31,14742,45.517811,9.087923,0.525000


In [50]:
temperature.describe()

Unnamed: 0,sensorID,lat,lng,value
count,5430.0,5430.0,5430.0,5430.0
mean,6673.426519,45.449517,9.155294,14.621903
std,4165.879761,0.099244,0.16366,7.908107
min,2001.0,45.171919,8.847322,-2.25
25%,4066.0,45.458065,9.087923,7.642708
50%,5909.0,45.473226,9.18911,14.154167
75%,5920.0,45.49678,9.248738,21.463542
max,17488.0,45.607845,9.488997,31.75


In [51]:
sensors = list(temperature.sensorID.unique())
show_plot(
    [ temperature.loc[temperature['sensorID']==sensor ] for sensor in sensors ],
    ['date']*len(sensors),
    ['value']*len(sensors),
    [ f'Sensor {sensor}' for sensor in sensors ],
)

In [57]:
# Days with data per each location:
temperature_per_sensor = temperature.copy()[['sensorID','date']]
temperature_per_sensor = temperature_per_sensor.groupby(['sensorID']).count()
temperature_per_sensor

Unnamed: 0_level_0,date
sensorID,Unnamed: 1_level_1
2001,366
2039,366
4001,238
4058,366
4066,366
5897,363
5903,366
5909,366
5910,366
5911,366


In [84]:
# Selecting only locations with al days
# 366 because 2020 is a leap day
complete_stations_list = list(temperature_per_sensor.loc[temperature_per_sensor['date'] == 366].reset_index()['sensorID'].unique())
complete_stations_df = temperature.loc[temperature['sensorID'].isin(complete_stations_list)]
complete_stations_df

Unnamed: 0,date,sensorID,lat,lng,value
0,2020-01-01,2001,45.496780,9.257515,2.895833
1,2020-01-01,2039,45.171919,9.488997,4.795833
3,2020-01-01,4058,45.542665,9.205603,2.039583
4,2020-01-01,4066,45.548517,8.847322,0.945833
6,2020-01-01,5903,45.436109,9.097411,2.800000
...,...,...,...,...,...
5422,2020-12-31,5911,45.496316,9.190934,2.041667
5424,2020-12-31,5920,45.476063,9.141786,2.017391
5425,2020-12-31,8162,45.491633,9.248738,0.229167
5428,2020-12-31,14742,45.517811,9.087923,0.525000


In [85]:
sensors = complete_stations_list
show_plot(
    [ complete_stations_df.loc[complete_stations_df['sensorID']==sensor ] for sensor in sensors ],
    ['date']*len(sensors),
    ['value']*len(sensors),
    [ f'Sensor {sensor}' for sensor in sensors ],
)

In [100]:
fig = go.Figure()
lng = complete_stations_df['lng']
lat = complete_stations_df['lat']
plot = go.Scatter(x=lng, y=lat, mode='markers+text', textposition="bottom center",
    text=[ f'Sensor {sensor}' for sensor in sensors ]
)
fig.add_trace(plot)