In [26]:
import os # for detecting CPU cores
import configparser # to load standard config and parameters
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from icecream import ic

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
%load_ext watermark
%matplotlib inline


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [27]:
# Load external config file
CONFIG_PATH = "../src/config.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means nothing was loaded.
# Failing here gives a clear error instead of a later KeyError on "PATHS".
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Config file not found or unreadable: {CONFIG_PATH!r} "
        f"(current working directory: {os.getcwd()!r})"
    )

# Project directories; used further down as string prefixes (PATH + file name).
PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot credentials are read from the config file, not hard-coded here.
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "04_feature_engineering" # for Telegram messages

# Set global random state
rnd_state = 42

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)

Number of CPUs used: 16


In [28]:
import urllib, requests #for Telegram notifications

def send_telegram_message(message):
    """Send ``message`` to the configured Telegram chat (best effort).

    The text is prefixed with ``FILENAME_NB`` so the receiver can tell which
    notebook sent it. Any failure (no network, timeout, HTTP error status) is
    reported on stdout and swallowed, so a problem with the bot never stops
    the notebook execution.

    Parameters
    ----------
    message : str
        Text to send.

    Returns
    -------
    None
    """

    message = f"{FILENAME_NB}:\n{message}"

    # The bot token is part of the request URL and `requests` embeds the URL in
    # its exception messages. The token is therefore redacted before anything
    # is printed, so saved cell outputs do not leak it (e.g. on GitHub).
    try:
        response = requests.get(
            f"https://api.telegram.org/bot{token}/sendMessage",
            # Let requests URL-encode the query string.
            params={"chat_id": chat_id, "text": message},
            timeout=10,
        )
        # Report rejected requests (HTTP 4xx/5xx) instead of ignoring them.
        response.raise_for_status()

    except Exception as e:
        details = str(e)
        if token:  # str.replace("") would insert the marker between every character
            details = details.replace(token, "<redacted>")
        print('\n\nSending message to Telegram Bot was not successful.\n\n')
        print(f"{type(e).__name__}: {details}")

    return None

In [29]:
# Train/test frames are read from the intermediate data folder, the sample
# submission from the raw data folder. Paths are plain prefix + file name.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
train_df = pd.read_pickle(f"{PATH_DATA_INT}train-opt.pkl")
test_df = pd.read_pickle(f"{PATH_DATA_INT}test-opt.pkl")
sample_df = pd.read_csv(f"{PATH_DATA_RAW}sample_submission.csv")

In [30]:
# Predictors: every training column except the row id and the label.
X = train_df.drop(columns=['id', 'target']).copy()
features_num = X.columns
feature_cols = features_num.to_list()

# Label column, copied so later edits do not touch train_df.
y = train_df['target'].copy()

# Only the id column is removed from the test frame.
X_test = test_df.drop(columns=['id']).copy()


# Feature Engineering

In [31]:
# Scaling all values
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# --------------------------------------------------------------------
# MinMaxScaler(feature_range = (0, 1)) will transform each value in the column proportionally within the range [0,1]. 
# Use this as the first scaler choice to transform a feature, as it will preserve the shape of the dataset 
# (no distortion).

# StandardScaler() standardises each column to mean 0 and standard deviation 1, i.e. 
# each value has the column mean subtracted and is then divided by the column standard deviation. 
# Use StandardScaler if you know the data distribution is normal.

# If there are outliers, use RobustScaler(). Alternatively you could remove the outliers and use either of the above 
# 2 scalers (choice depends on whether data is normally distributed)

# Additional note: fitting a scaler before train_test_split causes data leakage.
# Fit the scaler only after train_test_split, on the training split.
# --------------------------------------------------------------------

# Exactly one scaler may be active; the alternatives stay commented out so
# they can be switched in for experiments. RobustScaler is the one applied.
#scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler = RobustScaler()

# train: the scaling statistics are learned from the training features only
X_scaled = scaler.fit_transform(X[feature_cols])
# fit_transform returns a bare array, so column names and index are restored.
X = pd.DataFrame(X_scaled, columns=feature_cols, index=X.index)

# test: reuse the statistics fitted on train (no re-fitting)
X_test_scaled = scaler.transform(X_test[feature_cols])
X_test = pd.DataFrame(X_test_scaled, columns=feature_cols, index=X_test.index)


## KMeans Clustering

In [32]:
from sklearn.cluster import KMeans

n_clusters_1 = 9

# One output column per centroid, named cluster1 ... cluster<n_clusters_1>.
cluster_cols = ["cluster" + str(k) for k in range(1, n_clusters_1 + 1)]

# Disabled alternative configuration:
#kmeans = KMeans(n_clusters=n_clusters_1, n_init=50, max_iter=500, random_state=rnd_state)
kmeans = KMeans(
    n_clusters=n_clusters_1,
    init="k-means++",
    max_iter=500,
    random_state=rnd_state,  # fixed seed for repeatable centroids
)

ic(n_clusters_1);

ic| n_clusters_1: 9


In [33]:
# Cluster distance instead of cluster number: KMeans.fit_transform/transform
# return one distance column per centroid for every row.

# train: centroids are fitted on the training features only
X_cd = kmeans.fit_transform(X[feature_cols])
# Index the new frame by X itself (the frame it is joined to), so the join
# cannot misalign rows should X's index ever differ from train_df's.
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=X.index)
X = X.join(X_cd)

# test: distances to the centroids learned on train
X_cd = kmeans.transform(X_test[feature_cols])
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=X_test.index)
X_test = X_test.join(X_cd)

In [34]:
# Register the cluster-distance columns as features. Only names that are not
# listed yet are appended, so re-running this cell does not create duplicates.
feature_cols += [col for col in cluster_cols if col not in feature_cols]
ic(len(feature_cols));
X.head()

ic| len(feature_cols): 109


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f99,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6,cluster7,cluster8,cluster9
0,0.023869,0.414343,-0.003178,0.223129,0.219181,-0.549174,0.356604,-0.117173,-0.149785,-0.569723,...,2.097848,49.92857,58.783047,76.852676,63.380802,101.94313,63.887043,66.759926,133.91951,104.602325
1,0.073411,-0.324111,-0.220699,0.301799,0.406584,0.980651,-0.58429,-1.216782,0.824004,-0.258296,...,-0.388992,23.266546,37.226959,62.405041,44.954506,92.868484,44.93232,49.091007,125.609657,95.555489
2,-0.165673,-0.391725,0.386256,-0.178365,-0.372808,0.210182,0.863859,0.518747,-0.268295,-0.021567,...,1.034235,10.174824,32.178036,59.304893,40.735165,86.596382,41.043392,44.821545,124.56649,92.688606
3,-0.301555,-0.872802,2.498527,-0.301544,-0.587486,-0.414987,-0.039545,0.787011,0.807039,0.794548,...,-0.156724,29.22122,41.284988,65.623566,48.782768,95.259262,49.66383,53.00539,127.605354,96.688698
4,-0.272393,0.460876,0.086985,-0.197278,-0.465605,-0.192678,0.518429,-1.042825,0.356489,-0.30174,...,0.165118,8.686808,27.725185,58.941174,39.883305,89.906441,39.718849,44.363586,123.93766,92.008736


## Polynomial Features


In [35]:
from sklearn.preprocessing import PolynomialFeatures


In [36]:
# Degree-2 interaction terms (products of distinct columns, no squares) built
# from the cluster-distance columns.
# NOTE(review): with the default include_bias=True the output also contains a
# constant column and the degree-1 terms (copies of the cluster columns) -
# confirm this is intended.
poly = PolynomialFeatures(interaction_only=True, degree=2)

# The expansion is fitted on train and then applied unchanged to test.
X_poly = poly.fit_transform(X[cluster_cols])
T_poly = poly.transform(X_test[cluster_cols])

# Generic 1-based names, one per generated column.
poly_cols = ["cluster_poly" + str(k) for k in range(1, X_poly.shape[1] + 1)]

X_poly_df = pd.DataFrame(data=X_poly, index=X.index, columns=poly_cols)
T_poly_df = pd.DataFrame(data=T_poly, index=X_test.index, columns=poly_cols)

# Append the new columns side by side to the existing features.
X = pd.concat((X, X_poly_df), axis="columns")
X_test = pd.concat((X_test, T_poly_df), axis="columns")


In [37]:
# Register the polynomial columns as features. Only names that are not listed
# yet are appended, so re-running this cell does not create duplicates.
feature_cols += [col for col in poly_cols if col not in feature_cols]
ic(len(feature_cols));
X.head()

ic| len(feature_cols): 155


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,cluster_poly37,cluster_poly38,cluster_poly39,cluster_poly40,cluster_poly41,cluster_poly42,cluster_poly43,cluster_poly44,cluster_poly45,cluster_poly46
0,0.023869,0.414343,-0.003178,0.223129,0.219181,-0.549174,0.356604,-0.117173,-0.149785,-0.569723,...,6512.845215,6805.71582,13652.173828,10663.488281,4265.094238,8555.72168,6682.733398,8940.457031,6983.243652,14008.291992
1,0.073411,-0.324111,-0.220699,0.301799,0.406584,0.980651,-0.58429,-1.216782,0.824004,-0.258296,...,4172.796387,4559.007324,11665.178711,8874.09375,2205.772949,5643.933105,4293.529785,6166.304688,4690.915039,12002.692383
2,-0.165673,-0.391725,0.386256,-0.178365,-0.372808,0.210182,0.863859,0.518747,-0.268295,-0.021567,...,3554.209229,3881.383545,10787.007812,8026.498047,1839.628296,5112.631348,3804.254883,5583.262695,4154.446289,11545.894531
3,-0.301555,-0.872802,2.498527,-0.301544,-0.587486,-0.414987,-0.039545,0.787011,0.807039,0.794548,...,4730.939941,5049.254395,12155.591797,9210.494141,2632.450684,6337.370605,4801.931152,6763.771484,5125.021973,12337.995117
4,-0.272393,0.460876,0.086985,-0.197278,-0.465605,-0.192678,0.518429,-1.042825,0.356489,-0.30174,...,3570.980469,3988.572266,11142.793945,8272.177734,1762.070557,4922.661133,3654.481201,5498.319336,4081.837402,11403.347656


In [38]:
# Re-attach the label so the saved training frame holds features and target
# together (pandas aligns y to X on the index).
X['target'] = y

In [39]:
# Persist the engineered train/test frames to the intermediate data folder.
# FILENAME is the loop variable, so it ends up bound to the last file name.
for FILENAME, frame in (("train-opt-fe.pkl", X), ("test-opt-fe.pkl", X_test)):
    frame.to_pickle(PATH_DATA_INT + FILENAME)
