Reference: https://www.kaggle.com/utcarshagrawal/ubiquant-exploration-baseline-w-shap/notebook

In [None]:
# matplotlib cyberpunk style

!pip -q --disable-pip-version-check install mplcyberpunk

# Importing Libraries

In [None]:
import os
import gc
import numpy as np
import pandas as pd

import shap
import random
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

# Plotting setup
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import mplcyberpunk
# Select the 'cyberpunk' matplotlib style, then set the axes/figure background colours via seaborn.
# NOTE(review): sns.set() runs after plt.style.use() and may override parts of the
# cyberpunk style — confirm the resulting look is the intended one.
plt.style.use('cyberpunk')
sns.set(rc = {'axes.facecolor': '#212946', 'figure.facecolor': '#ffffff'})

import warnings
# Ignore every warning from here on.
# NOTE(review): this also hides deprecation warnings raised by the plotting/data libraries.
warnings.filterwarnings('ignore')

# Reading the data

In [None]:
%%time
# Read the training data from the parquet file into `df`; %%time reports how long the load takes.
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

### parquet vs csv
- parquet이란?
>하둡에서 컬럼방식으로 저장한 포맷
- 장점
> 저장 크기가 작다.  
> 저장, 로드 시간 작다.  
> 칼럼별 dtype을 다시 지정해줄 필요 X  
- parquet >> csv
- Q.그럼 pickle과는??

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summarise the frame's columns, dtypes and memory usage.
df.info()

# EDA

### Investment_id Distribution
> First of all we will look at the count of samples in each investment id

In [None]:
# Quick look at how the investment_ids are distributed.

plt.figure(figsize = (18, 7))

# One row per investment_id; after the groupby the 'row_id' column holds that id's sample count.
df_temp = df.groupby('investment_id')['row_id'].count().reset_index()
sns.histplot(x = df_temp['investment_id'], bins = 50)
plt.xlabel('Investment_id')
plt.ylabel('Count')
plt.title('Investment_id Distribution')

# idxmin/idxmax always return exactly one investment_id (the first one on ties).
# Filtering rows by equality with the min/max and calling int() on the result fails
# as soon as more than one id shares the extreme count.
sample_counts = df_temp.set_index('investment_id')['row_id']
least_id = int(sample_counts.idxmin())
max_id = int(sample_counts.idxmax())

print(f"Number of unique investments - {len(df_temp)}")
print(f"Investment id with least number of sample - {least_id}, Count - {int(sample_counts.min())}")
print(f"Investment id with maximum number of sample - {max_id}, Count - {int(sample_counts.max())}")

plt.show()

In [None]:
# Preview the per-investment_id sample counts.
df_temp.head()

In [None]:
# Histogram of investment_id over all samples.
# The individual per-id counts are kept in df_temp; the histogram shows them
# aggregated into bins rather than one bar per id.

fig, ax = plt.subplots(figsize = (18, 7))
sns.histplot(x = df['investment_id'], ax = ax)
ax.set_xlabel('Investment_id')
ax.set_ylabel('Count')
ax.set_title('Sample count of Investment_id Distribution')
plt.show()

- print 에서 따옴표는 조심해야 한다. 작은 따옴표(') 대신 큰 따옴표(") 사용을 습관화하자

### Time_id Distribution
> Now let us look at the count of samples in each time id

In [None]:
# Two stacked panels: distribution of the per-time_id sample count (top) and
# the sample count plotted against time_id (bottom).
fig, ax = plt.subplots(2, 1, figsize = (17, 10))

# First three columns of df (presumably row_id, time_id, investment_id — TODO confirm column order).
df_temp = df.iloc[:, 0:3]
# One row per time_id with the number of samples observed at that time_id.
df_temp2 = df_temp.groupby('time_id')['investment_id'].count().reset_index().rename(columns = {'investment_id' : 'Sample_Count'})

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its replacement.
sns.histplot(x = df_temp2['Sample_Count'], kde = True, stat = 'density', kde_kws = {'cut': 3}, ax = ax[0])
sns.scatterplot(x = df_temp2['time_id'], y = df_temp2['Sample_Count'], ax = ax[1])

ax[0].set_xlabel('Sample_Count')
ax[1].set_xlabel('Time_ID')

plt.show()

In [None]:
# Report the time_ids with the fewest and the most samples.
# idxmin/idxmax return a single time_id even when several share the extreme count;
# int() on an equality-filtered Series raises in that case.
sample_counts = df_temp2.set_index('time_id')['Sample_Count']
least_id = int(sample_counts.idxmin())
max_id = int(sample_counts.idxmax())

print(f"Number of unique time_ids - {len(df_temp2)}")
print(f"Time_id with least number of samples - {least_id}, Count - {int(sample_counts.min())}")
print(f"Time_id with maximum number of samples - {max_id}, Count - {int(sample_counts.max())}")

### Target Distribution
> Then let's analyze the target distribution

In [None]:
# Distribution of the target over all samples, plus its mean / min / max.
plt.figure(figsize = (15, 7))

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its replacement.
sns.histplot(x = df['target'], kde = True, stat = 'density', kde_kws = {'cut': 3})
plt.title('Target Distribution')

# Compute the summary once instead of re-running describe() for every printed statistic.
target_stats = df['target'].describe()
print(f"Mean of target - {target_stats['mean']}")
print(f"Minimum value of target - {target_stats['min']}")
print(f"Maximum value of target - {target_stats['max']}")

> The `target` is approximately normally distributed with a mean of -0.0210... Let us also look at the most skewed `target` distributions, grouped by `investment_id` and `time_id` respectively

In [None]:
# Target distribution for the most positively / negatively skewed groups:
# top row grouped by investment_id, bottom row grouped by time_id.
fig, ax = plt.subplots(2, 2, figsize = (20, 10))

# First four columns of df (presumably row_id, time_id, investment_id, target — TODO confirm).
df_temp = df.iloc[:, 0:4]

# Select 'target' before calling skew() so the aggregation only touches that column.
# Skewing the whole slice also processes row_id, which is presumably non-numeric and
# is rejected by newer pandas versions — TODO confirm.
df_temp1 = df_temp.groupby('investment_id')['target'].skew().reset_index()
# idxmax/idxmin pick exactly one row (NaN skews are skipped), so ties cannot break the int() cast.
id_1 = int(df_temp1.loc[df_temp1['target'].idxmax(), 'investment_id'])
id_2 = int(df_temp1.loc[df_temp1['target'].idxmin(), 'investment_id'])

select_1 = df_temp[df_temp['investment_id'] == id_1][['target']]
select_2 = df_temp[df_temp['investment_id'] == id_2][['target']]
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its replacement.
sns.histplot(x = select_1['target'], kde = True, stat = 'density', kde_kws = {'cut': 3}, ax = ax[0, 0])
sns.histplot(x = select_2['target'], kde = True, stat = 'density', kde_kws = {'cut': 3}, ax = ax[0, 1])
print(f"Investment_IDs with most skewed target are - {id_1}, {id_2}")

df_temp2 = df_temp.groupby('time_id')['target'].skew().reset_index()
id_3 = int(df_temp2.loc[df_temp2['target'].idxmax(), 'time_id'])
id_4 = int(df_temp2.loc[df_temp2['target'].idxmin(), 'time_id'])

select_3 = df_temp[df_temp['time_id'] == id_3][['target']]
select_4 = df_temp[df_temp['time_id'] == id_4][['target']]
sns.histplot(x = select_3['target'], kde = True, stat = 'density', kde_kws = {'cut': 3}, ax = ax[1, 0])
sns.histplot(x = select_4['target'], kde = True, stat = 'density', kde_kws = {'cut': 3}, ax = ax[1, 1])
print(f"Time_IDs with most skewed target are - {id_3}, {id_4}")

plt.show()

### Time_id Categorized
> I decided to group `time_id` into 5 different categories according to their range, i.e. {(~, 250), (251, 500), (501, 750), (751, 1000), (1001, ~)}, to better understand how the data changes with time. We will check different things such as the sample count and the number of missing values per category, the target distribution across each `time_id` range and, later on, the feature distributions.

In [None]:
def cate_time(time):
    """Return the time-category label for a time_id.

    The value is matched against the integer ranges 0-250, 251-500, 501-750
    and 751-1000 (bounds inclusive); anything that belongs to none of them
    is labelled '1000+'.
    """
    buckets = (
        (range(0, 251), '0-250'),
        (range(251, 501), '251-500'),
        (range(501, 751), '501-750'),
        (range(751, 1001), '751-1000'),
    )
    for span, label in buckets:
        if time in span:
            return label
    # Fallback for every value outside the four ranges above.
    return '1000+'

# Per-time_id sample counts, each tagged with its time category.
df_temp = df.iloc[:, 0:4]
df_temp2 = (
    df_temp.groupby('time_id')['investment_id']
    .count()
    .reset_index()
    .rename(columns = {'investment_id' : 'Sample_Count'})
)
df_temp2['time_cat'] = df_temp2['time_id'].apply(cate_time)

In [None]:
# Preview the per-time_id counts with their time category.
df_temp2.head()

In [None]:
# Left panel: share of all samples that falls into each time category (donut chart).
fig, ax = plt.subplots(1, 2, figsize = (20, 7))
fig.suptitle('Time Categorized Distribution', size = 20, weight = 'bold')

# Categories in order of first appearance, with the total sample count of each.
labels = list(df_temp2['time_cat'].unique())
sizes = [
    df_temp2[df_temp2['time_cat'] == label]['Sample_Count'].sum()
    for label in labels
]
explode = (0.05,) * 5
# Palette; the same list is reused by later cells to colour plots per time category.
colors = ['#FF2281', '#FF6600', '#13CA91', '#099FFF', '#CC00FF']
ax[0].pie(
    sizes,
    labels = labels,
    colors = colors,
    explode = explode,
    startangle = 90,
    counterclock = False,
    autopct = '%1.0f%%',
    pctdistance = 0.7,
    textprops = {'fontsize': 12},
)
# A filled circle in the axes background colour turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.5, fc = '#212946')
ax[0].add_artist(centre_circle)
ax[0].axis('equal')
ax[0].set_title('Sample Count Distribution', size = 15)

# Right panel: number of time_ids that never occur in the data, per time category.
# df_temp2['time_id'] comes from a groupby, so the ids are visited in ascending order;
# every integer strictly between two consecutive observed ids is a missing time_id.
# The walk starts from prev = 0, so id 0 itself is never counted as missing.
missing = {'0-250': 0, '251-500': 0, '501-750': 0, '751-1000': 0, '1000+': 0}
prev = 0

for time_id in df_temp2['time_id'].values:
    # Bucket each gap id with cate_time so the boundaries match the 'time_cat' labels.
    # A hand-written elif chain that tests range(251, 501) for the '501-750' and
    # '751-1000' buckets can never increment them and books those gaps under '1000+'.
    for gap_id in range(int(prev) + 1, int(time_id)):
        missing[cate_time(gap_id)] += 1
    prev = time_id
sns.barplot(x = list(missing.keys()), y = list(missing.values()), ax = ax[1])
ax[1].set_title('Missing value count in each time category', size = 15)
plt.show()

# Box plot of the target for each time category.
plt.figure(figsize = (20, 10))
# Tag every sample with the category of its time_id.
df_temp['time_cat'] = df_temp['time_id'].apply(cate_time)
sns.boxplot(x = df_temp['time_cat'], y = df_temp['target'])
plt.title('Target Distribution in each time category', size = 15)
plt.show()

### Features Distribution
> Now we will focus on the features. There are a total of 300 anonymized features generated from market data. First we will look at the distributions of a few features

In [None]:
# Violin plot of six sample features, one per panel of a 2x3 grid (filled row by row).
features = ['f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5']

fig, ax = plt.subplots(2, 3, figsize = (20, 10))
for feature, axis in zip(features, ax.ravel()):
    sns.violinplot(y = df[feature], ax = axis)

In [None]:
# Scatter of six sample features against the target, one per panel of a 2x3 grid (filled row by row).
features = ['f_100', 'f_101', 'f_102', 'f_103', 'f_104', 'f_105']

fig, ax = plt.subplots(2, 3, figsize = (20, 10))
for feature, axis in zip(features, ax.ravel()):
    sns.scatterplot(x = df[feature], y = df['target'], ax = axis)
plt.show()

> Next we will calculate correlation of each feature with the `target` and will then plot the distribution of most correlated and least correlated features.

In [None]:
# Absolute correlation between the target and each of the 300 features (f_0 .. f_299),
# stored as a dict ordered from the weakest to the strongest correlation.
cor = dict(sorted(
    (
        (f'f_{idx}', abs(df[['target', f'f_{idx}']].corr().iloc[0, 1]))
        for idx in range(300)
    ),
    key = lambda pair: pair[1],
))

In [None]:
# Highest correlation: the six features at the end of `cor` (strongest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

ranked = list(cor.items())  # ascending by absolute correlation
for rank, axis in enumerate(ax.ravel(), start = 1):
    name, value = ranked[-rank]
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df[name], y = df['target'], ax = axis)

plt.suptitle('Distribution of most correlated feature with target', fontsize = 15)
plt.show()

In [None]:
# Lowest correlation: the six features at the start of `cor` (weakest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

weakest = list(cor.items())[:6]
for (name, value), axis in zip(weakest, ax.ravel()):
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df[name], y = df['target'], ax = axis)

plt.suptitle('Distribution of least correlated feature with target', fontsize = 15)
plt.show()

> Now we will see which features matter most within each `time_id` range. This could be helpful in analysing the features further and how they change with time.

### Time_id range: `0-250`

In [None]:
# All columns except the first one, with each sample tagged by the category of its time_id.
df_temp = df.iloc[:, 1:]
df_temp['time_cat'] = df_temp['time_id'].apply(cate_time)

In [None]:
# Samples whose time_id falls in the '0-250' category.
df_temp2 = df_temp[df_temp['time_cat'] == '0-250']

# Absolute target correlation of every feature within this subset, ordered weakest to strongest.
cor = dict(sorted(
    (
        (f'f_{idx}', abs(df_temp2[['target', f'f_{idx}']].corr().iloc[0, 1]))
        for idx in range(300)
    ),
    key = lambda pair: pair[1],
))

In [None]:
# Highest correlation: the six features at the end of `cor` (strongest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

ranked = list(cor.items())  # ascending by absolute correlation
for rank, axis in enumerate(ax.ravel(), start = 1):
    name, value = ranked[-rank]
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[0])

plt.suptitle('Distribution of most correlated feature with target of time_id between 0-250', fontsize = 15)
plt.show()

In [None]:
# Lowest correlation: the six features at the start of `cor` (weakest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

weakest = list(cor.items())[:6]
for (name, value), axis in zip(weakest, ax.ravel()):
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[0])

plt.suptitle('Distribution of least correlated feature with target of time_id between 0-250', fontsize = 15)
plt.show()

### Time_id range: `251-500`

In [None]:
# Samples whose time_id falls in the '251-500' category.
df_temp2 = df_temp[df_temp['time_cat'] == '251-500']

# Absolute target correlation of every feature within this subset, ordered weakest to strongest.
cor = dict(sorted(
    (
        (f'f_{idx}', abs(df_temp2[['target', f'f_{idx}']].corr().iloc[0, 1]))
        for idx in range(300)
    ),
    key = lambda pair: pair[1],
))

In [None]:
# Highest correlation: the six features at the end of `cor` (strongest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

ranked = list(cor.items())  # ascending by absolute correlation
for rank, axis in enumerate(ax.ravel(), start = 1):
    name, value = ranked[-rank]
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[1])

plt.suptitle('Distribution of most correlated feature with target of time_id between 251-500', fontsize = 15)
plt.show()

In [None]:
# Lowest correlation: the six features at the start of `cor` (weakest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

weakest = list(cor.items())[:6]
for (name, value), axis in zip(weakest, ax.ravel()):
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[1])

plt.suptitle('Distribution of least correlated feature with target of time_id between 251-500', fontsize = 15)
plt.show()

### Time_id range: `501-750`

In [None]:
# Samples whose time_id falls in the '501-750' category.
df_temp2 = df_temp[df_temp['time_cat'] == '501-750']

# Absolute target correlation of every feature within this subset, ordered weakest to strongest.
cor = dict(sorted(
    (
        (f'f_{idx}', abs(df_temp2[['target', f'f_{idx}']].corr().iloc[0, 1]))
        for idx in range(300)
    ),
    key = lambda pair: pair[1],
))

In [None]:
# Highest correlation: the six features at the end of `cor` (strongest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

ranked = list(cor.items())  # ascending by absolute correlation
for rank, axis in enumerate(ax.ravel(), start = 1):
    name, value = ranked[-rank]
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[2])

plt.suptitle('Distribution of most correlated feature with target of time_id between 501-750', fontsize = 15)
plt.show()

In [None]:
# Lowest correlation: the six features at the start of `cor` (weakest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

weakest = list(cor.items())[:6]
for (name, value), axis in zip(weakest, ax.ravel()):
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[2])

plt.suptitle('Distribution of least correlated feature with target of time_id between 501-750', fontsize = 15)
plt.show()

### Time_id range: `751-1000`

In [None]:
# Samples whose time_id falls in the '751-1000' category.
df_temp2 = df_temp[df_temp['time_cat'] == '751-1000']

# Absolute target correlation of every feature within this subset, ordered weakest to strongest.
cor = dict(sorted(
    (
        (f'f_{idx}', abs(df_temp2[['target', f'f_{idx}']].corr().iloc[0, 1]))
        for idx in range(300)
    ),
    key = lambda pair: pair[1],
))

In [None]:
# Highest correlation: the six features at the end of `cor` (strongest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

ranked = list(cor.items())  # ascending by absolute correlation
for rank, axis in enumerate(ax.ravel(), start = 1):
    name, value = ranked[-rank]
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[3])

plt.suptitle('Distribution of most correlated feature with target of time_id between 751-1000', fontsize = 15)
plt.show()

In [None]:
# Lowest correlation: the six features at the start of `cor` (weakest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

weakest = list(cor.items())[:6]
for (name, value), axis in zip(weakest, ax.ravel()):
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[3])

plt.suptitle('Distribution of least correlated feature with target of time_id between 751-1000', fontsize = 15)
plt.show()

### Time_id range: `1000+`

In [None]:
# Samples whose time_id falls in the '1000+' category.
df_temp2 = df_temp[df_temp['time_cat'] == '1000+']

# Absolute target correlation of every feature within this subset, ordered weakest to strongest.
cor = dict(sorted(
    (
        (f'f_{idx}', abs(df_temp2[['target', f'f_{idx}']].corr().iloc[0, 1]))
        for idx in range(300)
    ),
    key = lambda pair: pair[1],
))

In [None]:
# Highest correlation: the six features at the end of `cor` (strongest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

ranked = list(cor.items())  # ascending by absolute correlation
for rank, axis in enumerate(ax.ravel(), start = 1):
    name, value = ranked[-rank]
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[4])

plt.suptitle('Distribution of most correlated feature with target of time_id between 1000+', fontsize = 15)
plt.show()

In [None]:
# Lowest correlation: the six features at the start of `cor` (weakest first), one per panel.
fig, ax = plt.subplots(2, 3, figsize = (20, 10))

weakest = list(cor.items())[:6]
for (name, value), axis in zip(weakest, ax.ravel()):
    print(f"target & {name} Correlation is {value}")
    sns.scatterplot(x = df_temp2[name], y = df_temp2['target'], ax = axis, color = colors[4])

plt.suptitle('Distribution of least correlated feature with target of time_id between 1000+', fontsize = 15)
plt.show()