In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

from dataSplit import data_split_generator
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
def make_x_y(data: pd.DataFrame, city_encoder: OneHotEncoder = None, country_encoder: OneHotEncoder = None):
    data = data.copy(deep=True)
    data.reset_index(drop=True, inplace=True)
    data.drop(columns=['datetime', 'target_dates'], inplace=True)
    if city_encoder is None:
        city_encoder = OneHotEncoder()
        city_encoder.fit(data[['City']])
    city = city_encoder.transform(data[['City']]).toarray()
    city = pd.DataFrame(city, columns= [str(x) for x in  city_encoder.categories_[0].tolist()])
    if country_encoder is None :
        country_encoder = OneHotEncoder()
        country_encoder.fit(data[['Country']])
    country = country_encoder.transform(data[['Country']]).toarray()
    country = pd.DataFrame(country, columns=[str(x) for x in country_encoder.categories_[0].tolist()])
    data:pd.DataFrame = pd.concat([data, city, country], axis = 1)
    data.drop(columns = ['City','Country'],inplace= True)
    Y = data.pop('target_AvgTemp')
    return data , Y, city_encoder, country_encoder
def get_metric(y_true, y_predict):
    RMSE = np.sqrt(mean_squared_error(y_true, y_predict))
    # MAE = mean_absolute_error(y_true, y_predict)
    # MAPE = mean_absolute_percentage_error(y_true, y_predict)
    return RMSE

In [3]:
final_metric = []
for j in tqdm(range(120, 133)):
    data = pd.read_csv(f'./edaData/processed_data_{j}.csv')
    data['target_dates'] = pd.to_datetime(data['target_dates'])
    data_split = data_split_generator(data)
    city_encoder = None
    country_encoder = None
    metrics = []
    for train_data, vali_data in data_split:
        train_x, train_y, city_encoder, country_encoder = make_x_y(train_data, city_encoder, country_encoder)
        rf = RandomForestRegressor(n_jobs=4)
        rf.fit(X=train_x, y=train_y)
        vali_x, vali_y, city_encoder, country_encoder = make_x_y(vali_data, city_encoder, country_encoder)
        predict = rf.predict(vali_x)
        metric = get_metric(vali_y, predict)
        metrics.append(metric)
    metrics = np.mean(metrics)
metrics

100%|██████████| 13/13 [34:44<00:00, 160.35s/it]


1.2489093060127723

In [29]:
# metrics
train_x, train_y, city_encoder, country_encoder = make_x_y(train_data, city_encoder, country_encoder)
train_x

Unnamed: 0,AverageTemperature,Latitude,Lag_1_Temp,Lag_2_Temp,Lag_3_Temp,Lag_4_Temp,Lag_5_Temp,Lag_6_Temp,Lag_12_Temp,Lag_24_Temp,...,Syria,Taiwan,Tanzania,Thailand,Turkey,Ukraine,United Kingdom,United States,Vietnam,Zimbabwe
0,26.098,5.63,25.145,25.889,24.590,24.307,23.464,23.586,26.556,25.915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26.682,5.63,26.098,25.145,25.889,24.590,24.307,23.464,27.370,27.913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28.107,5.63,26.682,26.098,25.145,25.889,24.590,24.307,27.822,27.512,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27.139,5.63,28.107,26.682,26.098,25.145,25.889,24.590,26.625,26.816,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26.066,5.63,27.139,28.107,26.682,26.098,25.145,25.889,26.533,25.837,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93295,17.675,34.56,12.678,4.993,-0.364,-2.106,1.190,4.421,16.252,19.142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93296,21.962,34.56,17.675,12.678,4.993,-0.364,-2.106,1.190,23.683,22.855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93297,23.077,34.56,21.962,17.675,12.678,4.993,-0.364,-2.106,23.686,23.797,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93298,22.194,34.56,23.077,21.962,17.675,12.678,4.993,-0.364,23.355,24.570,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
vali_x, vali_y, city_encoder, country_encoder = make_x_y(vali_data, city_encoder, country_encoder)
vali_x
vali_y

0        27.266
1        28.353
2        28.614
3        27.305
4        26.855
          ...  
11995    22.517
11996    17.823
11997    11.116
11998     4.095
11999     1.265
Name: target_AvgTemp, Length: 12000, dtype: float64