# Libraries

In [None]:
# Own functions
from numpy import disp
from core.data import load_competition_from_kaggle

# Data manipulation
import pandas as pd

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


# Preparation

**Gather Data**

In [2]:
# Download data (Kaggle Competition)
competition_name = "bosch-production-line-performance" # replace with your competition link from Kaggle 
destination = "../data/raw"

files = load_competition_from_kaggle(
    competition_name=competition_name, 
    destination=destination,
    )

print(files)

Destination directory '../data/raw\bosch-production-line-performance' already exists with files. Skipping download (replace=False).
['sample_submission.csv.zip', 'test_categorical.csv.zip', 'test_date.csv.zip', 'test_numeric.csv.zip', 'train_categorical.csv.zip', 'train_date.csv.zip', 'train_numeric.csv.zip']


In [13]:
# Load data
print("Loading numeric data...")
df_num = pd.read_csv("../data/raw/bosch-production-line-performance/train_numeric.csv.zip", nrows=100000)
display(df_num.head())

print("Loading categorical data...")
df_cat = pd.read_csv("../data/raw/bosch-production-line-performance/train_categorical.csv.zip", nrows=100000)
display(df_cat.head())

print("Loading date data...")
df_date = pd.read_csv("../data/raw/bosch-production-line-performance/train_date.csv.zip", nrows=100000)
display(df_date.head())

Loading numeric data...


Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,...,,,,,,,,,,0
1,6,,,,,,,,,,...,,,,,,,,,,0
2,7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,...,,,,,,,,,,0
3,9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,...,,,,,,,,,,0
4,11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,...,,,,,,,,,,0


Loading categorical data...


  df_cat = pd.read_csv("../data/raw/bosch-production-line-performance/train_categorical.csv.zip", nrows=100000)


Unnamed: 0,Id,L0_S1_F25,L0_S1_F27,L0_S1_F29,L0_S1_F31,L0_S2_F33,L0_S2_F35,L0_S2_F37,L0_S2_F39,L0_S2_F41,...,L3_S49_F4225,L3_S49_F4227,L3_S49_F4229,L3_S49_F4230,L3_S49_F4232,L3_S49_F4234,L3_S49_F4235,L3_S49_F4237,L3_S49_F4239,L3_S49_F4240
0,4,,,,,,,,,,...,,,,,,,,,,
1,6,,,,,,,,,,...,,,,,,,,,,
2,7,,,,,,,,,,...,,,,,,,,,,
3,9,,,,,,,,,,...,,,,,,,,,,
4,11,,,,,,,,,,...,,,,,,,,,,


Loading date data...


Unnamed: 0,Id,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
0,4,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,...,,,,,,,,,,
1,6,,,,,,,,,,...,,,,,,,,,,
2,7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,...,,,,,,,,,,
3,9,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,...,,,,,,,,,,
4,11,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,...,,,,,,,,,,


In [15]:
# Analyse data quality

print("Numeric Features")
display(
    "Shape",
    df_num.shape,
    "Description",
    df_num.describe().round(2).T,
    "Duplicates",
    df_num.duplicated().sum(),
)

pd.DataFrame(
    {
        "Data Types": df_num.dtypes,
        "Missing Values": df_num.isnull().sum(),
        "Unique Values": df_num.nunique(),
        "Sample Values": [df_num[col].sample(3).tolist() for col in df_num.columns]
    })

print("Categorical Features")
display(
    "Shape",
    df_cat.shape,
    "Description",
    df_cat.describe().round(2).T,
    "Duplicates",
    df_cat.duplicated().sum(),
)

pd.DataFrame(
    {
        "Data Types": df_cat.dtypes,
        "Missing Values": df_cat.isnull().sum(),
        "Unique Values": df_cat.nunique(),
        "Sample Values": [df_cat[col].sample(3).tolist() for col in df_cat.columns]
    })

print("Date Features")
display(
    "Shape",
    df_date.shape,
    "Description",
    df_date.describe().round(2).T,
    "Duplicates",
    df_date.duplicated().sum(),
)

pd.DataFrame(
    {
        "Data Types": df_date.dtypes,
        "Missing Values": df_date.isnull().sum(),
        "Unique Values": df_date.nunique(),
        "Sample Values": [df_date[col].sample(3).tolist() for col in df_date.columns]
    })

Numeric Features


'Shape'

(100000, 970)

'Description'

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,100000.0,100116.60,57830.69,4.00,50104.75,100148.00,150115.25,200352.00
L0_S0_F0,57915.0,-0.00,0.08,-0.40,-0.06,0.00,0.06,0.28
L0_S0_F2,57915.0,-0.00,0.09,-0.47,-0.06,0.00,0.06,0.28
L0_S0_F4,57915.0,-0.00,0.21,-0.40,-0.18,-0.03,0.29,0.57
L0_S0_F6,57915.0,-0.00,0.21,-0.42,-0.18,-0.03,0.29,0.57
...,...,...,...,...,...,...,...,...
L3_S51_F4256,5195.0,-0.00,0.00,-0.00,0.00,0.00,0.00,0.00
L3_S51_F4258,5195.0,0.00,0.02,0.00,0.00,0.00,0.00,1.00
L3_S51_F4260,5195.0,0.00,0.01,0.00,0.00,0.00,0.00,0.44
L3_S51_F4262,5195.0,0.00,0.00,0.00,0.00,0.00,0.00,0.05


'Duplicates'

np.int64(0)

Categorical Features


'Shape'

(100000, 2141)

'Description'

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,100000.0,100116.6,57830.69,4.0,50104.75,100148.0,150115.25,200352.0
L0_S3_F69,0.0,,,,,,,
L0_S3_F71,0.0,,,,,,,
L0_S3_F73,0.0,,,,,,,
L0_S3_F75,0.0,,,,,,,
...,...,...,...,...,...,...,...,...
L3_S47_F4182,0.0,,,,,,,
L3_S47_F4184,0.0,,,,,,,
L3_S47_F4187,0.0,,,,,,,
L3_S47_F4189,0.0,,,,,,,


'Duplicates'

np.int64(0)

Date Features


'Shape'

(100000, 1157)

'Description'

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,100000.0,100116.60,57830.69,4.00,50104.75,100148.00,150115.25,200352.00
L0_S0_D1,57915.0,873.06,501.78,0.01,402.12,904.37,1363.46,1713.71
L0_S0_D3,57915.0,873.06,501.78,0.01,402.12,904.37,1363.46,1713.71
L0_S0_D5,57915.0,873.06,501.78,0.01,402.12,904.37,1363.46,1713.71
L0_S0_D7,57915.0,873.06,501.78,0.01,402.12,904.37,1363.46,1713.71
...,...,...,...,...,...,...,...,...
L3_S51_D4255,5195.0,1031.18,428.80,1.42,558.15,1291.20,1408.50,1457.50
L3_S51_D4257,5195.0,1031.18,428.80,1.42,558.15,1291.20,1408.50,1457.50
L3_S51_D4259,5195.0,1031.18,428.80,1.42,558.15,1291.20,1408.50,1457.50
L3_S51_D4261,5195.0,1031.18,428.80,1.42,558.15,1291.20,1408.50,1457.50


'Duplicates'

np.int64(0)

Unnamed: 0,Data Types,Missing Values,Unique Values,Sample Values
Id,int64,0,100000,"[34538, 33921, 148912]"
L0_S0_D1,float64,42085,15308,"[nan, 1508.74, 1441.56]"
L0_S0_D3,float64,42085,15308,"[622.25, 1096.48, 166.23]"
L0_S0_D5,float64,42085,15308,"[nan, nan, nan]"
L0_S0_D7,float64,42085,15308,"[nan, 323.18, 169.11]"
...,...,...,...,...
L3_S51_D4255,float64,94805,1292,"[nan, nan, nan]"
L3_S51_D4257,float64,94805,1292,"[nan, nan, nan]"
L3_S51_D4259,float64,94805,1292,"[nan, 518.7, nan]"
L3_S51_D4261,float64,94805,1292,"[nan, nan, nan]"


In [None]:
fig, ax = plt.subplots(figsize=(12, 10), dpi=200)

sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm", center=0, ax=ax);