## Import the libraries

* pandas
* numpy
* matplotlib
* seaborn
* sklearn
* dask

In [1]:
# Numerical, math and plotting libraries.
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import sklearnex

# Gradient-boosted tree estimators from XGBoost.
from xgboost import XGBClassifier
from xgboost import XGBRegressor

# Intel(R) Extension for Scikit-learn: patch scikit-learn before the sklearn
# imports below so that they pick up the patched implementations.
# NOTE(review): global_patch=True presumably applies the patch beyond this
# kernel session -- confirm against the sklearnex documentation.
from sklearnex import patch_sklearn
patch_sklearn(global_patch=True)

# Import sklearn only after the patch call above.
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression

# Modin, a faster alternative to pandas (currently disabled; kept for reference).

# import modin.pandas as pd
# import modin.config as cfg

# cfg.Engine.put('Dask')
# cfg.Memory.put(9000000000)
# cfg.CpuCount.put(4)

# Dask: dataframe API plus a distributed client created with default arguments.
# `client` is displayed in a later cell to check the cluster status.
import dask.dataframe as dd
from dask.distributed import Client
client = Client()

from pprint import pprint

# Ray (currently disabled; kept for reference).
# import ray
# ray.init()

Scikit-learn was successfully globally patched by Intel(R) Extension for Scikit-learn


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
distributed.diskutils - INFO - Found stale lock file and directory '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-trbin9ll', purging
distributed.diskutils - INFO - Found stale lock file and directory '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-ip_7s3d2', purging
distributed.diskutils - INFO - Found stale lock file and directory '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-9gh46boc', purging
distributed.diskutils - INFO - Found stale lock file and directory '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-mnos0cm9', purging


In [2]:
# Show every column and every row when a pandas DataFrame is displayed.
# The full option names are spelled out: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added later.
# NOTE: with max_rows=None, displaying a large frame prints all of its rows,
# so prefer .head()/.sample() when inspecting big frames.
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Preprocessing
- Import the train and test data
- Numerical features
- Categorical features
- Data type
- Missing value
- Train test split
- Handling missing values, data imputation or encoding or both
- Feature scaling: standardization or normalization, if necessary

In [3]:
# Check the number of partitions Modin would use (Modin is currently disabled)
# import modin
# print(modin.config.NPartitions.get())

In [4]:
# Display the Dask client summary (cluster type, dashboard URL, workers,
# threads and memory) to confirm the cluster is running.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.48 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37131,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.48 GiB

0,1
Comm: tcp://127.0.0.1:44667,Total threads: 2
Dashboard: http://127.0.0.1:38165/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:44681,
Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-vct7b2wg,Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-vct7b2wg
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:43325,Total threads: 2
Dashboard: http://127.0.0.1:33227/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:40735,
Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-t1zub7fc,Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-t1zub7fc
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:44901,Total threads: 2
Dashboard: http://127.0.0.1:46717/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:46599,
Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-1ygn7o9g,Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-1ygn7o9g
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:42685,Total threads: 2
Dashboard: http://127.0.0.1:38235/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:40437,
Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-1mebpy0e,Local directory: /mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/dask-worker-space/worker-1mebpy0e
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB


In [5]:
# Location of the training set stored as Parquet (machine-specific absolute path).
train_data = (
    '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe'
    '/Machine Learning Project/AMEX_2022/df_parquet_train'
)


In [6]:
# Location of the test set stored as Parquet (machine-specific absolute path).
test_data = (
    '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe'
    '/Machine Learning Project/AMEX_2022/df_parquet_test'
)

In [7]:
# Open the training Parquet dataset as a Dask DataFrame.
df = dd.read_parquet(train_data)

In [8]:
# Number of rows. On this Dask DataFrame shape[0] is a deferred value,
# so .compute() is called to evaluate it into a concrete count.
df.shape[0].compute()

5531451

In [9]:
# How many columns (features) the frame has.
len(df.columns)

190

In [10]:
# Summarize the frame: number of columns and the dtype breakdown.
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 190 entries, customer_ID to D_145
dtypes: object(4), float64(185), int64(1)

In [11]:
# List the column labels.
df.columns

Index(['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41',
       'B_3',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=190)

In [12]:
# Runs out of memory: materializing the whole dataset as a pandas DataFrame is too large
# pandas_df = df.compute()