In [1]:
import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
train_df = pd.read_csv("/kaggle/input/widsdatathon2023/train_data.csv")

In [3]:
X = train_df.drop(columns=['contest-tmp2m-14d__tmp2m'])
Y = train_df['contest-tmp2m-14d__tmp2m']

In [4]:
X_copy = X

# **Preprocessing**

In [5]:
def preprocess(X):
    X.fillna(method='ffill',inplace=True)
    X['startdate']=pd.to_datetime(X['startdate'])
    X['startdate_year']=X['startdate'].dt.year
    X['startdate_month']=X['startdate'].dt.month
    X['startdate_day']=X['startdate'].dt.day
    X.drop(columns=['startdate'],inplace=True)
    encodeCategoricalData(X)
    X = dropNonCorrelatedVariables(X)
    return X
def encodeCategoricalData(X_train):
    onHotEncoder = OneHotEncoder()
    transformed = onHotEncoder.fit_transform(X_train[['climateregions__climateregion']])
    print(onHotEncoder.categories_)
    X_train[onHotEncoder.categories_[0]] = transformed.toarray()
    X_train.drop(columns=['climateregions__climateregion'],inplace=True)
    

In [6]:
def findCorrelatedVariables(X):
    correlation_mat=X.corrwith(Y)
    correlated_var=correlation_mat.loc[lambda y:abs(y) > 0.2]
    return correlated_var
def dropNonCorrelatedVariables(X):
    return X[X.columns.intersection(correlated_var.keys())]

In [7]:
correlated_var = findCorrelatedVariables(X)

In [8]:
X = preprocess(X)

[array(['BSh', 'BSk', 'BWh', 'BWk', 'Cfa', 'Cfb', 'Csa', 'Csb', 'Dfa',
       'Dfb', 'Dfc', 'Dsb', 'Dsc', 'Dwa', 'Dwb'], dtype=object)]


In [9]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 0)

In [10]:
X_train

Unnamed: 0,index,lat,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,nmme0-tmp2m-34w__gfdlflorb0,...,wind-hgt-10-2010-1,wind-hgt-100-2010-1,wind-hgt-100-2010-2,wind-hgt-100-2010-9,wind-vwnd-925-2010-1,wind-vwnd-925-2010-3,wind-vwnd-925-2010-9,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-18
45755,45755,0.272727,253.29,14.23,16.91,12.48,15.15,12.98,13.67,13.65,...,62981.47,6352.16,-17258.04,-1332.26,59.38,-33.91,-50.17,-72.53,0.45,12.98
202801,202801,0.636364,429.38,19.76,20.81,20.66,20.79,21.56,24.33,24.65,...,-119033.16,-43021.70,4753.25,1228.84,-143.97,14.05,-32.37,19.49,-14.69,20.40
346503,346503,0.954545,260.23,19.55,21.58,16.25,19.76,18.59,19.41,19.30,...,-50719.55,-21098.15,-5693.75,-1610.90,-69.74,102.12,35.23,-3.72,63.91,-40.83
147082,147082,0.500000,108.48,-0.86,1.28,0.03,1.50,-0.43,2.67,2.44,...,108732.74,40461.19,-1273.31,2069.91,146.94,39.91,28.49,43.41,-4.69,24.75
238542,238542,0.727273,345.13,5.57,4.62,5.09,4.43,6.51,2.63,2.49,...,-38156.60,-5023.20,6533.65,4.94,-49.67,-6.36,16.13,6.78,-17.67,-38.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359783,359783,1.000000,6.72,-4.69,-2.32,-6.22,-3.27,-6.28,-2.91,-3.80,...,113960.81,39375.83,1141.43,-992.25,176.24,-32.23,82.02,-5.34,-5.53,65.24
358083,358083,1.000000,168.94,6.58,6.59,4.00,6.45,5.89,6.95,6.48,...,-59694.83,-15553.79,8724.15,-1785.63,-105.81,66.93,17.40,-23.43,-3.65,37.80
152315,152315,0.545455,483.62,14.85,15.43,15.91,16.07,15.67,15.72,15.87,...,-78178.14,-22921.26,9494.04,592.73,-95.01,39.58,3.47,-11.19,-25.75,9.48
117952,117952,0.454545,537.24,10.93,10.79,8.70,10.56,10.61,7.46,7.86,...,-71718.47,-19554.57,8995.46,-414.84,-70.29,83.08,22.51,-6.27,-18.93,16.90


In [11]:
import xgboost

In [12]:
import optuna
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgboost.XGBRegressor(**param)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(Y_test, y_pred)

In [13]:
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=20)

[32m[I 2023-03-01 10:36:30,036][0m A new study created in memory with name: regression[0m
[32m[I 2023-03-01 11:01:59,842][0m Trial 0 finished with value: 0.10034592920507564 and parameters: {'max_depth': 10, 'learning_rate': 0.1735019281400992, 'n_estimators': 744, 'min_child_weight': 8, 'gamma': 0.3231742265352549, 'subsample': 0.7014752199974251, 'colsample_bytree': 0.8128922271848302, 'reg_alpha': 0.19209567518989204, 'reg_lambda': 0.39480466049181456, 'random_state': 881}. Best is trial 0 with value: 0.10034592920507564.[0m
[32m[I 2023-03-01 11:03:39,768][0m Trial 1 finished with value: 0.8995162638832477 and parameters: {'max_depth': 2, 'learning_rate': 0.6679852652533286, 'n_estimators': 415, 'min_child_weight': 5, 'gamma': 0.8334752643972975, 'subsample': 0.5705425592678668, 'colsample_bytree': 0.3333789325422354, 'reg_alpha': 0.8393082718099192, 'reg_lambda': 0.5422831739139413, 'random_state': 802}. Best is trial 0 with value: 0.10034592920507564.[0m
[32m[I 2023-03-0

In [14]:
import xgboost
model = xgboost.XGBRegressor(max_depth=10, learning_rate=0.31980297530663365, n_estimators= 613, min_child_weight=6, gamma=0.9229319872511204, subsample= 0.8744744710561068, colsample_bytree= 0.7706469704837123, reg_alpha= 0.8610933743587899, reg_lambda=0.837931322426224,random_state=915)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)

In [15]:
y_pred

array([16.19919  , 16.89909  , 28.023489 , ..., -4.0831923, 12.316688 ,
        4.9463735], dtype=float32)

In [16]:
model.score(X_test,Y_test)

0.9981787700399272

In [17]:
from sklearn.metrics import mean_squared_error
rms = mean_squared_error(Y_test,y_pred,squared=False)

In [18]:
rms

0.42075647879537026

# **Test for test_data.csv**

In [19]:
test_df = pd.read_csv("/kaggle/input/widsdatathon2023/test_data.csv")

In [20]:
preProcessed_df = preprocess(test_df)

[array(['BSh', 'BSk', 'BWh', 'BWk', 'Cfa', 'Cfb', 'Csa', 'Csb', 'Dfa',
       'Dfb', 'Dfc', 'Dsb', 'Dsc', 'Dwa', 'Dwb'], dtype=object)]


In [21]:
Y_actual_pred = model.predict(preProcessed_df)

In [22]:
solution_df = pd.DataFrame(columns=['Index','contest-tmp2m-14d__tmp2m'])

In [23]:
solution_df['Index']=test_df.loc[:,'index']
solution_df['contest-tmp2m-14d__tmp2m']=Y_actual_pred

In [24]:
solution_df.to_csv('/kaggle/working/submission.csv',index=False)

In [25]:
pd.read_csv('/kaggle/working/submission.csv')

Unnamed: 0,Index,contest-tmp2m-14d__tmp2m
0,375734,30.080233
1,375735,30.106647
2,375736,30.031658
3,375737,30.017403
4,375738,30.013895
...,...,...
31349,407083,5.702414
31350,407084,4.726397
31351,407085,3.247447
31352,407086,3.288362
