<a href="https://colab.research.google.com/github/antbartash/max_temp/blob/main/data_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Source CSV, read straight from the project's GitHub repository.
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data2.csv'

# The first CSV column becomes the row index; DATE is cast to datetime64[ns]
# so the `.dt` accessor and chronological sorting work on it.
data = pd.read_csv(data_path, index_col=0).astype({'DATE': 'datetime64[ns]'})

print(data.shape)
data.head()

(40898, 4)


Unnamed: 0,STATION,NAME,DATE,TMAX
0,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-01,12.2
1,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-02,10.6
2,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-03,8.3
3,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-04,6.1
4,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-05,6.1


In [3]:
# Keep only the first 100 rows, as an independent copy of the loaded frame.
# NOTE(review): this looks like a development-time subset — confirm whether
# it should stay in place when the notebook is run on the full data set.
data = data.iloc[:100].copy()
print(data.shape)
data.head()

(100, 4)


Unnamed: 0,STATION,NAME,DATE,TMAX
0,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-01,12.2
1,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-02,10.6
2,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-03,8.3
3,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-04,6.1
4,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-05,6.1


In [4]:
# Build one feature row per station and day, using only the days that precede
# it: the previous day's TMAX, the day-over-day change, rolling aggregates of
# TMAX over several window lengths, and the mean day-over-day change per window.
WINDOWS = [3, 5, 7, 10, 14]
AGG_FUNCS = ['mean', 'std', 'min', 'max', 'median']
# The widest "mean_diff" feature reads back to row_idx - max(WINDOWS) - 1, so a
# row needs that many earlier rows before it can be used (15 for these windows).
MIN_HISTORY = max(WINDOWS) + 1

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end; concatenating onto a growing frame inside the loop copies it each time.
feature_rows = []

for station in data['STATION'].unique():
  data_station = data[data['STATION'] == station].copy()
  data_station = data_station.sort_values('DATE').reset_index(drop=True)
  # After reset_index the labels are 0..n-1, so label arithmetic below is the
  # same as positional arithmetic.
  # NOTE(review): lags are row offsets, which presumably assumes one row per
  # consecutive day for each station — confirm there are no gaps in DATE.
  tmax = data_station['TMAX']
  for row_idx in range(MIN_HISTORY, len(data_station)):
    new_row = {
        'STATION': data_station.loc[row_idx, 'STATION'],
        'NAME': data_station.loc[row_idx, 'NAME'],
        'DATE': data_station.loc[row_idx, 'DATE'],
        'TARGET': tmax.loc[row_idx],
        'TMAX_d1': tmax.loc[row_idx - 1],
        'TMAX_d1_d2_diff': tmax.loc[row_idx - 1] - tmax.loc[row_idx - 2],
    }
    # .loc slices include both end labels, so each slice holds exactly
    # `window` values ending on the previous day.
    for window in WINDOWS:
      recent = tmax.loc[row_idx - window:row_idx - 1]
      for agg_func in AGG_FUNCS:
        new_row[f'TMAX_{window}day_{agg_func}'] = recent.agg(agg_func)
    # Kept as a second pass so the *_mean_diff columns come after all of the
    # aggregate columns, matching the established column order.
    for window in WINDOWS:
      recent = tmax.loc[row_idx - window:row_idx - 1].values
      shifted = tmax.loc[row_idx - window - 1:row_idx - 2].values
      new_row[f'TMAX_{window}day_mean_diff'] = np.mean(recent - shifted)
    feature_rows.append(new_row)

data_features = pd.DataFrame(feature_rows)

print(data_features.shape)
data_features.head()

(85, 36)


Unnamed: 0,STATION,NAME,DATE,TARGET,TMAX_d1,TMAX_d1_d2_diff,TMAX_3day_mean,TMAX_3day_std,TMAX_3day_min,TMAX_3day_max,...,TMAX_14day_mean,TMAX_14day_std,TMAX_14day_min,TMAX_14day_max,TMAX_14day_median,TMAX_3day_mean_diff,TMAX_5day_mean_diff,TMAX_7day_mean_diff,TMAX_10day_mean_diff,TMAX_14day_mean_diff
0,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-16,15.6,15.0,-1.1,13.7,3.251154,10.0,16.1,...,9.278571,4.868925,1.7,16.7,8.85,0.2,2.12,1.9,0.89,0.2
1,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-17,14.4,15.6,0.6,15.566667,0.550757,15.0,16.1,...,9.635714,5.148653,1.7,16.7,8.85,1.866667,1.24,1.828571,0.73,0.357143
2,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-18,17.2,14.4,-1.2,15.0,0.6,14.4,15.6,...,10.071429,5.283272,1.7,16.7,9.7,-0.566667,0.0,1.428571,-0.23,0.435714
3,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-19,21.1,17.2,2.8,15.733333,1.404754,14.4,17.2,...,10.864286,5.470987,1.7,17.2,12.2,0.733333,1.44,1.114286,1.55,0.792857
4,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-20,23.9,21.1,3.9,17.566667,3.365016,14.4,21.1,...,11.935714,5.916809,1.7,21.1,14.4,1.833333,1.0,0.957143,1.83,1.071429


In [8]:
# Calendar feature: the month number of each row's DATE.
data_features['MONTH'] = data_features['DATE'].dt.month.values

# One-hot encode STATION into one indicator column per station.
encoder = OneHotEncoder()
encoder.fit(data_features[['STATION']])
# categories_ holds one array per encoded input column. STATION is the only
# input, so categories_[0] lists every station, in the same order as the
# columns of the transformed matrix. Taking the first element of each array
# instead gives a single name, which no longer matches the matrix width once
# the data contains more than one station.
station_labels = list(encoder.categories_[0])
stations = pd.DataFrame.sparse.from_spmatrix(
    encoder.transform(data_features[['STATION']]),
    columns=station_labels,
)
# Remove indicator columns left by an earlier run of this cell so that
# re-running it does not append duplicates.
data_features = pd.concat(
    [data_features.drop(columns=station_labels, errors='ignore'), stations],
    axis=1,
)

print(data_features.shape)
data_features.head()

(85, 38)


Unnamed: 0,STATION,NAME,DATE,TARGET,TMAX_d1,TMAX_d1_d2_diff,TMAX_3day_mean,TMAX_3day_std,TMAX_3day_min,TMAX_3day_max,...,TMAX_14day_min,TMAX_14day_max,TMAX_14day_median,TMAX_3day_mean_diff,TMAX_5day_mean_diff,TMAX_7day_mean_diff,TMAX_10day_mean_diff,TMAX_14day_mean_diff,MONTH,"(USW00012916,)"
0,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-16,15.6,15.0,-1.1,13.7,3.251154,10.0,16.1,...,1.7,16.7,8.85,0.2,2.12,1.9,0.89,0.2,1,1.0
1,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-17,14.4,15.6,0.6,15.566667,0.550757,15.0,16.1,...,1.7,16.7,8.85,1.866667,1.24,1.828571,0.73,0.357143,1,1.0
2,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-18,17.2,14.4,-1.2,15.0,0.6,14.4,15.6,...,1.7,16.7,9.7,-0.566667,0.0,1.428571,-0.23,0.435714,1,1.0
3,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-19,21.1,17.2,2.8,15.733333,1.404754,14.4,17.2,...,1.7,17.2,12.2,0.733333,1.44,1.114286,1.55,0.792857,1,1.0
4,USW00012916,"NEW ORLEANS AIRPORT, LA US",2010-01-20,23.9,21.1,3.9,17.566667,3.365016,14.4,21.1,...,1.7,21.1,14.4,1.833333,1.0,0.957143,1.83,1.071429,1,1.0
