In [1]:
from meteostat import Point, Daily
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
# Let's define the location of the New York
new_york = Point(40.712778, -74.006111)

# Now, let's define the time range
start_date = datetime.strptime('2015-01-01', '%Y-%m-%d')
end_date = datetime.strptime('2024-12-31', '%Y-%m-%d')

# Fetch daily weather data
data = Daily(new_york, start_date, end_date)
data = data.fetch()

# Save specific columns to CSV
selected_columns = [
    'tavg',       # Average temperature
    'tmin',       # Daily minimum temperature
    'tmax',       # Daily maximum temperature
    'prcp',       # Precipitation
    'snow',       # Maximum snow depth
    'wspd',       # Average wind speed
    'pres',       # Average sea-level pressure
    'wdir'        # Wind direction
]



In [3]:
data = data[selected_columns]

data.to_csv('weather_data_new_york.csv')
print("Data saved to 'weather_data_new_york.csv'")

Data saved to 'weather_data_new_york.csv'


In [4]:
print(data.head())

print(data.tail())

            tavg  tmin  tmax  prcp  snow  wspd    pres   wdir
time                                                         
2015-01-01  -1.7  -6.6   3.9   0.0   0.0  20.2  1019.0  232.0
2015-01-02   2.6  -0.5   6.7   0.0   0.0  16.9  1023.3  252.0
2015-01-03   0.9  -3.2   3.9  20.1   0.0  11.2  1031.0   37.0
2015-01-04   6.6   3.9  13.3   6.9   0.0  16.6  1010.5  248.0
2015-01-05   5.4  -6.6   8.9   0.0   0.0  31.0  1019.8  286.0
            tavg  tmin  tmax  prcp  snow  wspd    pres   wdir
time                                                         
2024-12-27   2.5  -8.0   6.7   0.0   0.0   4.1  1030.9  309.0
2024-12-28   5.4   2.8   7.9   3.4   0.0   4.8  1025.1  312.0
2024-12-29   9.6   4.7  15.4   3.0   0.0   7.3  1017.4  271.0
2024-12-30  11.8   8.9  14.4  11.0   0.0  17.9  1005.8  249.0
2024-12-31   8.6   5.6  11.7   9.6   0.0  13.2  1010.1  216.0


In [5]:
missing_values = data.isnull().sum
print("Missing Values:\n:", missing_values)

Missing Values:
: <bound method DataFrame.sum of              tavg   tmin   tmax   prcp   snow   wspd   pres   wdir
time                                                              
2015-01-01  False  False  False  False  False  False  False  False
2015-01-02  False  False  False  False  False  False  False  False
2015-01-03  False  False  False  False  False  False  False  False
2015-01-04  False  False  False  False  False  False  False  False
2015-01-05  False  False  False  False  False  False  False  False
...           ...    ...    ...    ...    ...    ...    ...    ...
2024-12-27  False  False  False  False  False  False  False  False
2024-12-28  False  False  False  False  False  False  False  False
2024-12-29  False  False  False  False  False  False  False  False
2024-12-30  False  False  False  False  False  False  False  False
2024-12-31  False  False  False  False  False  False  False  False

[3653 rows x 8 columns]>


In [6]:
data.describe()

Unnamed: 0,tavg,tmin,tmax,prcp,snow,wspd,pres,wdir
count,3653.0,3653.0,3653.0,3653.0,3652.0,3653.0,3646.0,3427.0
mean,14.029428,10.395538,18.186039,3.393841,5.818729,11.851027,1018.498958,202.720163
std,9.263199,9.204797,9.802898,9.498735,33.368587,5.015655,8.052311,105.897477
min,-13.2,-17.7,-7.7,0.0,0.0,1.8,989.7,0.0
25%,6.4,3.3,10.0,0.0,0.0,8.2,1013.0,92.5
50%,14.0,10.1,18.3,0.0,0.0,10.7,1017.9,246.0
75%,22.6,18.9,27.0,2.0,0.0,14.2,1024.5,283.5
max,31.7,28.0,36.7,280.4,510.0,39.1,1046.9,359.0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3653 entries, 2015-01-01 to 2024-12-31
Freq: D
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tavg    3653 non-null   float64
 1   tmin    3653 non-null   float64
 2   tmax    3653 non-null   float64
 3   prcp    3653 non-null   float64
 4   snow    3652 non-null   float64
 5   wspd    3653 non-null   float64
 6   pres    3646 non-null   float64
 7   wdir    3427 non-null   float64
dtypes: float64(8)
memory usage: 256.9 KB


In [8]:
data = data.copy()
data['week'] = data.index.isocalendar().week
data['year'] = data.index.isocalendar().year

In [10]:
# Define the columns to fill
cols_to_fill = ['snow', 'pres', 'wdir']

# Fill missing values with weekly median
for col in cols_to_fill:
    # Compute the weekly medians
    weekly_median = data.groupby(['year', 'week'])[col].transform('median')
    
    # Fill missing values
    data[col] = data[col].fillna(weekly_median)


In [12]:
#drop the helper columns
data.drop(columns=['week', 'year'], inplace=True)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3653 entries, 2015-01-01 to 2024-12-31
Freq: D
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tavg    3653 non-null   float64
 1   tmin    3653 non-null   float64
 2   tmax    3653 non-null   float64
 3   prcp    3653 non-null   float64
 4   snow    3653 non-null   float64
 5   wspd    3653 non-null   float64
 6   pres    3653 non-null   float64
 7   wdir    3646 non-null   float64
dtypes: float64(8)
memory usage: 256.9 KB


In [15]:
data['year'] = data.index.isocalendar().year
data['month'] = data.index.month
monthly_median = data.groupby(['year', 'month'])['wdir'].transform('median')
data['wdir'] = data['wdir'].fillna(monthly_median)
data.drop(columns=['month', 'year'], inplace=True)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3653 entries, 2015-01-01 to 2024-12-31
Freq: D
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tavg    3653 non-null   float64
 1   tmin    3653 non-null   float64
 2   tmax    3653 non-null   float64
 3   prcp    3653 non-null   float64
 4   snow    3653 non-null   float64
 5   wspd    3653 non-null   float64
 6   pres    3653 non-null   float64
 7   wdir    3653 non-null   float64
dtypes: float64(8)
memory usage: 256.9 KB


In [17]:
data.to_csv('weather_data_new_york.csv')