In [1]:
# Familiar imports
import numpy as np
import pandas as pd
#import pandas_profiling as pp


# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
#from sklearn.impute import SimpleImputer

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Functions for EDA - Exploratory Data Analysis

In [2]:
# Create a function that we can re-use
def show_distribution(var_data):
    from matplotlib import pyplot as plt

    # Get statistics
    min_val = var_data.min()
    max_val = var_data.max()
    mean_val = var_data.mean()
    med_val = var_data.median()
    mod_val = var_data.mode()[0]

    print('Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val,
                                                                                            mean_val,
                                                                                            med_val,
                                                                                            mod_val,
                                                                                            max_val))

    # Create a figure for 2 subplots (2 rows, 1 column)
    fig, ax = plt.subplots(2, 1, figsize = (10,4))

    # Plot the histogram   
    ax[0].hist(var_data)
    ax[0].set_ylabel('Frequency')

    # Add lines for the mean, median, and mode
    ax[0].axvline(x=min_val, color = 'gray', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=mean_val, color = 'cyan', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=med_val, color = 'red', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=mod_val, color = 'yellow', linestyle='dashed', linewidth = 2)
    ax[0].axvline(x=max_val, color = 'gray', linestyle='dashed', linewidth = 2)

    # Plot the boxplot   
    ax[1].boxplot(var_data, vert=False)
    ax[1].set_xlabel('Value')

    # Add a title to the Figure
    fig.suptitle('Data Distribution')

    # Show the figure
    plt.show()

def show_density(var_data):
    from matplotlib import pyplot as plt

    mean_val = var_data.mean()
    med_val = var_data.median()
    mod_val = var_data.mode()[0]

    print('Mean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\n'.format(mean_val, med_val, mod_val))

    fig = plt.figure(figsize=(10,4))

    # Plot density
    var_data.plot.density()

    # Add titles and labels
    plt.title('Data Density')

    # Show the mean, median, and mode
    plt.axvline(x=var_data.mean(), color = 'cyan', linestyle='dashed', linewidth = 2)
    plt.axvline(x=var_data.median(), color = 'red', linestyle='dashed', linewidth = 2)
    plt.axvline(x=var_data.mode()[0], color = 'yellow', linestyle='dashed', linewidth = 2)

    # Show the figure
    plt.show()

# Load the data

Next, we'll load the training and test data.  

We set `index_col=0` in the code cell below to use the `id` column to index the DataFrame.  (*If you're not sure how this works, try temporarily removing `index_col=0` and see how it changes the result.*)

In [3]:
# Load the training data
train = pd.read_csv("../input/30-days-of-ml/train.csv") #, index_col=0)
test = pd.read_csv("../input/30-days-of-ml/test.csv",index_col=0)

train.drop("id", axis=1, inplace=True)
#test.drop("id", axis=1, inplace=True)

# Preview the data
train.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,B,B,B,C,B,B,A,E,C,N,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,B,B,A,A,B,D,A,F,A,O,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,A,A,A,C,B,D,A,D,A,F,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,B,B,A,C,B,D,A,E,C,K,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,A,A,A,C,B,D,A,E,A,N,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


The next code cell separates the target (which we assign to `y`) from the training features (which we assign to `features`).

In [4]:
# Separate target from features
y = train['target']
features = train.drop(['target'], axis=1)


# Preview features
#features.head()

In [5]:
# let's zoom in on our y variable
cont_cols = [col for col in features.columns if 'cont' in col]
cat_cols = [col for col in features.columns if 'cat' in col]

#show_density(y)
#show_distribution(y)



# Prepare the data


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler


X = features.copy()
X_final = test.copy()

transformers = ColumnTransformer(
    [("ordinary_encoder", OrdinalEncoder(), cat_cols),
     ("standardize", StandardScaler(), cont_cols)],    #strictly not necessary for RandomForest
    remainder="passthrough"
)

X[features.columns] = transformers.fit_transform(X[features.columns])

X_final[features.columns] = transformers.transform(X_final[features.columns])


## Next, we break off a validation set from the training data.


In [7]:
# We want to speed up training so we use sample of 10,000 out of 300,000 which is 3,333%
# We can afford to take big validation set because we have enough data so we make it same size as training 
# Choosing a big validation set will show reality of accuracy of model
train_size = 1/30  # 10,000
test_size = 1/30   # 10,000
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0, train_size=train_size, test_size=test_size )

In [8]:
#X.shape
X_train.shape, X_valid.shape


((10000, 24), (10000, 24))

## Base Line predict with mean

We need to do at least better than average prediction to show our model have learnt something

In [9]:
from sklearn.metrics import mean_squared_error

y_base = np.array([y_train.mean()]*y_valid.shape[0])
mean_squared_error(y_valid, y_base, squared=False)

0.7403795279628655

# Train a model

## We use RandomForest without tuning

In [10]:
# Define the model 
model = RandomForestRegressor(random_state=1, n_estimators=200, criterion="mae", n_jobs=-1, max_depth=3)

# Train the model 
model.fit(X_train, y_train)
preds_valid = model.predict(X_valid)
print(mean_squared_error(y_valid, preds_valid, squared=False))

#squared=False to get the root mean squared error (RMSE) on the validation data.


0.7350470019075017


In [11]:
# get R^2 to see how well our model explains variance in Y 
model.score(X_valid, y_valid, sample_weight=None)

0.013853084320098796

# Tuning model with Optuna

In [12]:
import optuna

def objective(trial):
    
    #n_estimators = trial.suggest_int("n_estimators", 100) # fixed
    max_depth = trial.suggest_int("max_depth", 1, 4, step=1)
    max_features = trial.suggest_float("max_features", 0.5, 1.0, step=0.1)
    
    model = RandomForestRegressor( 
                                  n_estimators=200, # to safe time because runing on max CPU not TPU
                                  criterion="mae", 
                                  n_jobs=-1, 
                                  max_depth=max_depth,
                                  max_features=max_features
                                 )
    
    model.fit(X_train, y_train)
    
    y_hat = model.predict(X_valid)
    
    return mean_squared_error(y_valid, y_hat, squared=False)

study = optuna.create_study()
study.optimize(objective, n_trials=4)

[32m[I 2021-08-24 15:11:44,243][0m A new study created in memory with name: no-name-6c172ee6-34dc-438a-93e2-97db3ab0c2db[0m
[32m[I 2021-08-24 15:19:13,669][0m Trial 0 finished with value: 0.7341653934851886 and parameters: {'max_depth': 4, 'max_features': 0.8}. Best is trial 0 with value: 0.7341653934851886.[0m
[32m[I 2021-08-24 15:27:20,787][0m Trial 1 finished with value: 0.7342714390476305 and parameters: {'max_depth': 4, 'max_features': 0.9}. Best is trial 0 with value: 0.7341653934851886.[0m
[32m[I 2021-08-24 15:32:07,855][0m Trial 2 finished with value: 0.7353370968825728 and parameters: {'max_depth': 3, 'max_features': 0.6}. Best is trial 0 with value: 0.7341653934851886.[0m
[32m[I 2021-08-24 15:40:08,663][0m Trial 3 finished with value: 0.7342539157687843 and parameters: {'max_depth': 4, 'max_features': 0.9}. Best is trial 0 with value: 0.7341653934851886.[0m


In [13]:
study.best_params

{'max_depth': 4, 'max_features': 0.8}

## Make prediction with our tuned parameters (max depth, max_features) 

In [14]:
from sklearn.model_selection import KFold

best_params =  study.best_params  
final_pred = []
total_loss = 0

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# train our model 5 times on a subset of 10,000 observations 
# each time use the trained model to predict on complete set of 300,000 observations
# then take the average of the 5 result sets as our final prediction
train_size = 1/30  # 10,000

# this is to get a random subset of rows of train_size
X_sample, _, y_sample, _ = train_test_split(X, y, random_state=123, train_size=train_size, test_size=test_size )

# now use kfold to go through this set a number of times using our best params
for train_indx, val_indx in kfold.split(X_sample):
    X_train, X_val = X_sample.iloc[train_indx], X_sample.iloc[val_indx] 
    y_train, y_val = y_sample.iloc[train_indx], y_sample.iloc[val_indx]
        
    model_final = RandomForestRegressor(
                                  n_estimators=200, 
                                  criterion="mae", 
                                  n_jobs=-1,
                                  **best_params)
    
    model_final.fit(X_train, y_train)  #fit on a subsete of the 10,000
    
    y_hat = model_final.predict(X_val)
    score = mean_squared_error(y_hat, y_val, squared=False)
    print(f"Loss:{score}")
    total_loss += score
    final_pred.append(model_final.predict(X_final))  #we predict use full test set based on this tained model
    #final_pred.append(model_final.predict(X))  #we predict use full test set based on this tained model

print(f"Avg. Loss: {total_loss/kfold.get_n_splits()}")  

Loss:0.751010726510877
Loss:0.7543207813989414
Loss:0.7260345979219538
Loss:0.7271151751534162
Loss:0.7476685725957887
Avg. Loss: 0.7412299707161953


In [15]:

predictions = np.mean(np.column_stack(final_pred), axis=1)
predictions.shape

(200000,)

In [16]:
# score = mean_squared_error(predictions, y, squared=False)
# print(f"Loss:{score}")

# Submit to the competition

We'll begin by using the trained model to generate predictions, which we'll save to a CSV file.

In [17]:
# Save the predictions to a CSV file
output = pd.DataFrame({'Id': X_final.index,
                       'target': predictions})
output.to_csv('submission.csv', index=False)