## Common Functions

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def get_predictions(df, return_y_test=False):
    """Fit the fixed random-forest model on a split of ``df`` and predict the held-out rows.

    ``df`` is split 70/30 (``test_size=0.3``, ``random_state=0``). A
    ``RandomForestRegressor`` is fitted on the 70% part and used to predict
    the remaining 30%.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"target"`` column; every other column is used as a
        feature.
    return_y_test : bool, default False
        When True, also return the true target values of the held-out rows,
        so the predictions can be scored without re-creating the split.

    Returns
    -------
    y_hat : numpy.ndarray
        Predictions for the held-out rows.
    y_test : pandas.Series
        Only when ``return_y_test`` is True: the matching true targets, in the
        same row order as ``y_hat``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop("target", axis=1),
        df["target"],
        random_state=0,
        test_size=0.3,
    )

    # Model settings are deliberately fixed; only the data passed in varies.
    rf = RandomForestRegressor(
        n_estimators=1000,
        max_depth=7,
        n_jobs=-1,
        random_state=42)
    rf.fit(X_train, y_train)

    # The rows being predicted are the local 30% hold-out created above.
    y_hat = rf.predict(X_test)

    if return_y_test:
        return y_hat, y_test
    return y_hat

In [28]:
from sklearn.metrics import mean_squared_error

def get_rmse(y_test, y_pred):
    """Return the root-mean-squared error between ``y_test`` and ``y_pred``.

    The ``squared=False`` argument of ``mean_squared_error`` was deprecated in
    scikit-learn 1.4 and removed in 1.6, so the square root is taken here
    instead. The MSE is computed per output column, square-rooted, then
    averaged, which is the value ``squared=False`` used to return.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_test``.

    Returns
    -------
    float
        The RMSE (a NumPy floating-point scalar).
    """
    # multioutput="raw_values" yields an ndarray with one MSE per output.
    per_output_mse = mean_squared_error(y_test, y_pred, multioutput="raw_values")
    return (per_output_mse ** 0.5).mean()

## Data Loading

In [1]:
import pandas as pd

In [9]:
# Read the CSV, then promote its "id" column to the row index.
df = pd.read_csv("data/sample_submission.csv").set_index("id")

In [12]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,target,O2_1,O2_2,O2_3,O2_4,O2_5,O2_6,O2_7,NH4_1,NH4_2,...,NO3_5,NO3_6,NO3_7,BOD5_1,BOD5_2,BOD5_3,BOD5_4,BOD5_5,BOD5_6,BOD5_7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,8.59,7.5,9.0,9.545,9.265,8.11,8.43,7.15,0.18,0.2,...,4.95,1.73,1.8,4.8,3.15,10.665,10.465,16.645,5.75,10.37
1,9.1,13.533,40.9,8.77,9.265,6.015,10.07,7.15,1.107,1.027,...,20.05,9.53,7.695,4.55,6.95,2.04,5.2,5.725,2.95,2.23
2,8.21,3.71,5.42,8.77,9.265,4.55,10.07,7.15,0.02,0.02,...,4.58,3.025,3.96,4.935,4.95,4.725,6.075,6.75,3.5,3.17
3,8.39,8.7,8.1,9.5,9.2,5.2,8.67,6.67,0.28,0.27,...,8.45,2.07,1.73,6.3,4.7,3.5,6.2,8.67,2.9,7.37
4,8.07,8.05,8.65,7.96,9.265,3.29,10.07,7.15,0.36,0.435,...,2.02,1.73,0.76,4.8,4.97,3.95,2.8,8.4,3.5,3.9


In [10]:
# Print column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3500 entries, 0 to 3499
Data columns (total 36 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  3500 non-null   float64
 1   O2_1    3500 non-null   float64
 2   O2_2    3500 non-null   float64
 3   O2_3    3500 non-null   float64
 4   O2_4    3500 non-null   float64
 5   O2_5    3500 non-null   float64
 6   O2_6    3500 non-null   float64
 7   O2_7    3500 non-null   float64
 8   NH4_1   3500 non-null   float64
 9   NH4_2   3500 non-null   float64
 10  NH4_3   3500 non-null   float64
 11  NH4_4   3500 non-null   float64
 12  NH4_5   3500 non-null   float64
 13  NH4_6   3500 non-null   float64
 14  NH4_7   3500 non-null   float64
 15  NO2_1   3500 non-null   float64
 16  NO2_2   3500 non-null   float64
 17  NO2_3   3500 non-null   float64
 18  NO2_4   3500 non-null   float64
 19  NO2_5   3500 non-null   float64
 20  NO2_6   3500 non-null   float64
 21  NO2_7   3500 non-null   float64
 22  

In [11]:
# Summary statistics for the numeric columns.
# NOTE(review): in the recorded output several columns have a max far above
# their 75th percentile (e.g. target: 65.93 vs 9.11) - possible outliers; verify.
df.describe()

Unnamed: 0,target,O2_1,O2_2,O2_3,O2_4,O2_5,O2_6,O2_7,NH4_1,NH4_2,...,NO3_5,NO3_6,NO3_7,BOD5_1,BOD5_2,BOD5_3,BOD5_4,BOD5_5,BOD5_6,BOD5_7
count,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,...,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0
mean,8.473834,8.216993,9.291581,9.633064,8.065733,5.671581,9.461363,6.547433,0.340525,0.397956,...,8.710466,3.002243,3.242654,4.816718,4.754389,4.28566,5.96898,7.837995,3.750349,4.068639
std,1.885712,3.040743,6.817822,1.439039,1.463881,2.721399,1.140365,1.663266,0.284751,0.452237,...,6.438648,2.918355,2.713923,2.009161,1.547491,2.093388,2.576146,3.724861,1.682421,1.894632
min,1.3,0.0,0.0,4.9,2.3,0.2,0.0,0.0,0.02,0.02,...,0.044,0.011,0.0,0.57,0.089,0.85,2.025,2.02,0.364,1.6
25%,7.47,7.1,7.3,8.77,6.83,4.55,8.98,5.8775,0.2145,0.2,...,4.45,1.14,1.78,3.52,3.95,2.29,4.95,5.725,2.684,2.9
50%,8.28,7.89,8.3,9.5,7.98,5.8,9.43,6.43,0.247,0.245,...,6.054,1.73,1.8,4.6,4.8,3.95,5.725,7.5,3.5,3.7
75%,9.11,9.1,8.7,9.545,9.265,7.015,10.07,7.15,0.37,0.405,...,14.515,3.025,4.2375,5.0,5.4,4.725,6.25,8.4,4.15,4.45
max,65.93,46.95,65.95,16.9,21.8,59.4,40.19,15.9,4.2,3.6,...,35.0,19.355,27.025,11.65,10.7,19.8,55.4,82.45,19.375,10.37


## Exploration Stage

In [34]:
# get_predictions(df) returns only the predictions, so rebuild the held-out
# target here with the same split settings it uses internally
# (random_state=0, test_size=0.3). For a given row count and seed,
# train_test_split selects the same rows, so y_test lines up with y_hat.
_, y_test = train_test_split(df["target"], random_state=0, test_size=0.3)

y_hat = get_predictions(df)
rmse = get_rmse(y_test, y_hat)

In [35]:
rmse

1.401317428662328