In [106]:
import pandas as pd
import missingno as mso
import matplotlib.pyplot as plt
import math
from sklearn.decomposition import PCA

In [107]:
# Load the raw dataset from disk.
df = pd.read_csv("../datasets/dataset.csv")

# Frame dimensions and column labels, captured once for the cells below.
ROWS, COLS = df.shape
columns = df.columns.tolist()

# Cleaning steps operate on a separate copy so `df` keeps the data as loaded.
cleaned_df = df.copy()
cleaned_df

Unnamed: 0,id,target,sample_type,continuous_0,categorical_0,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,...,continuous_660,continuous_661,continuous_662,continuous_663,continuous_664,continuous_665,continuous_666,categorical_15,categorical_16,continuous_667
0,14537510,good,dev,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,315.000000,315.00,315.00,0.0,0.0,0.0,0.0,0.0,0.0,
1,12527950,good,dev,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,155.000000,155.00,155.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,12277478,good,dev,1.0,1.0,1.0,1.0,1.0,2.0,0.0,...,163.125000,162.15,164.10,0.0,0.0,0.0,0.0,1.0,1.0,
3,13362203,good,dev,1.0,1.0,2.0,2.0,2.0,3.0,0.0,...,107.226666,75.00,153.68,0.0,0.0,0.0,0.0,2.0,2.0,0.0
4,14036057,bad,dev,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,157.150000,155.30,158.45,138.0,46.0,0.0,120.0,3.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31493,11486651,good,dev,1.0,1.0,13.0,13.0,13.0,13.0,0.0,...,,,,,,,,7.0,,0.0
31494,13808107,good,dev,1.0,4.0,4.0,4.0,4.0,4.0,0.0,...,140.000000,140.00,140.00,0.0,0.0,0.0,0.0,3.0,6.0,0.0
31495,12631253,good,dev,1.0,1.0,17.0,17.0,17.0,17.0,3.0,...,78.333333,75.00,85.00,0.0,0.0,0.0,0.0,4.0,16.0,0.0
31496,10898612,good,valid,1.0,1.0,14.0,14.0,14.0,14.0,0.0,...,,,,,,,,10.0,,0.0


## NaN values

In [108]:
def get_missing_values_count(dataframe: pd.DataFrame, col: str) -> int:
    """Count the missing (NaN) entries of a single column.

    Args:
        dataframe: Frame that contains the column.
        col: Label of the column to inspect.

    Returns:
        Number of rows whose value in ``col`` is missing.
    """
    is_missing = dataframe[col].isnull()
    return is_missing.sum()

In [109]:
# NaN tally for every column of the frame, in column order.
nan_counts = [get_missing_values_count(cleaned_df, name) for name in columns]

# Parallel lists: column label, absolute NaN count, and NaN fraction of ROWS.
missing_values = {
    "name": list(columns),
    "missing_count": nan_counts,
    "missing_persentage": [n_missing / ROWS for n_missing in nan_counts],
}

missing_df = pd.DataFrame(missing_values)
missing_df

Unnamed: 0,name,missing_count,missing_persentage
0,id,0,0.000000
1,target,0,0.000000
2,sample_type,0,0.000000
3,continuous_0,0,0.000000
4,categorical_0,0,0.000000
...,...,...,...
683,continuous_665,15093,0.479173
684,continuous_666,15093,0.479173
685,categorical_15,10473,0.332497
686,categorical_16,15021,0.476887


### Drop columns with more than 50% missing values

In [110]:
# Names of the columns whose missing fraction is strictly above 0.5.
mostly_missing = missing_df["missing_persentage"].gt(0.5)
useless_columns = missing_df.loc[mostly_missing, "name"].tolist()
useless_columns

['continuous_471',
 'continuous_472',
 'continuous_506',
 'continuous_507',
 'continuous_508',
 'continuous_509',
 'continuous_510',
 'continuous_511',
 'continuous_512',
 'continuous_513',
 'continuous_514',
 'continuous_515',
 'continuous_516',
 'continuous_517',
 'continuous_518',
 'continuous_519',
 'continuous_520',
 'continuous_521',
 'continuous_543',
 'continuous_544',
 'continuous_546',
 'continuous_547',
 'continuous_548',
 'continuous_549',
 'continuous_550',
 'continuous_575',
 'continuous_603',
 'continuous_636']

In [111]:
# NOTE(review): the drop below is disabled, so the columns listed in
# `useless_columns` are still present in `cleaned_df` after this cell.
# cleaned_df = cleaned_df.drop(useless_columns, axis=1)
# cleaned_df

### Fill NaN values with the column mean (continuous) or most frequent value (categorical)

In [112]:
def continuous_fill(frame: pd.DataFrame, col: str) -> pd.Series:
    """Return one column with its NaN entries replaced by the column mean.

    The input frame is not modified; a new Series is returned.

    Args:
        frame: Frame that contains the column.
        col: Label of the column to impute.

    Returns:
        A Series (not a DataFrame) with the values of ``frame[col]`` where
        every NaN has been replaced by the mean of the non-missing values.
        If the column has no observed values its mean is NaN, so the column
        comes back unchanged (still all NaN).
    """
    column = frame[col]
    # Series.mean() skips NaN by default, so only observed values contribute.
    return column.fillna(column.mean())

def categorical_fill(frame: pd.DataFrame, col: str) -> pd.Series:
    """Return one column with its NaN entries replaced by the most frequent value.

    The input frame is not modified; a new Series is returned.

    Args:
        frame: Frame that contains the column.
        col: Label of the column to impute.

    Returns:
        A Series (not a DataFrame) with the values of ``frame[col]`` where
        every NaN has been replaced by the most frequent observed value.
        A column with no observed values at all is returned as an unchanged
        copy, because there is nothing to impute from.
    """
    column = frame[col]
    # value_counts() drops NaN by default, so it is empty for an all-NaN column.
    counts = column.value_counts()
    if counts.empty:
        # idxmax() on an empty Series raises ValueError; leave the column as is.
        return column.copy()
    return column.fillna(counts.idxmax())

def fill_nan_values(frame: pd.DataFrame):
    """Impute missing values in place, choosing the strategy by column prefix.

    Columns whose label starts with ``"continuous"`` are passed to
    ``continuous_fill``; columns starting with ``"categorical"`` are passed to
    ``categorical_fill``. Every other column is left untouched.

    Args:
        frame: Frame to impute; its columns are reassigned, so the caller's
            object is modified and nothing is returned.
    """
    # First matching prefix wins, mirroring an if/elif chain.
    fillers = (
        ("continuous", continuous_fill),
        ("categorical", categorical_fill),
    )
    for name in frame.columns.tolist():
        for prefix, filler in fillers:
            if name.startswith(prefix):
                frame[name] = filler(frame, name)
                break

In [113]:
# Imputation happens in place on `cleaned_df`; the call itself returns nothing.
fill_nan_values(frame=cleaned_df)
cleaned_df

Unnamed: 0,id,target,sample_type,continuous_0,categorical_0,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,...,continuous_660,continuous_661,continuous_662,continuous_663,continuous_664,continuous_665,continuous_666,categorical_15,categorical_16,continuous_667
0,14537510,good,dev,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,315.000000,315.000000,315.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.024846
1,12527950,good,dev,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,155.000000,155.000000,155.000000,0.000000,0.000000,0.000000,0.0000,0.0,1.0,0.000000
2,12277478,good,dev,1.0,1.0,1.0,1.0,1.0,2.0,0.0,...,163.125000,162.150000,164.100000,0.000000,0.000000,0.000000,0.0000,1.0,1.0,0.024846
3,13362203,good,dev,1.0,1.0,2.0,2.0,2.0,3.0,0.0,...,107.226666,75.000000,153.680000,0.000000,0.000000,0.000000,0.0000,2.0,2.0,0.000000
4,14036057,bad,dev,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,157.150000,155.300000,158.450000,138.000000,46.000000,0.000000,120.0000,3.0,1.0,0.024846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31493,11486651,good,dev,1.0,1.0,13.0,13.0,13.0,13.0,0.0,...,107.082979,86.617275,128.735768,43.940013,15.567802,4.295324,30.8417,7.0,1.0,0.000000
31494,13808107,good,dev,1.0,4.0,4.0,4.0,4.0,4.0,0.0,...,140.000000,140.000000,140.000000,0.000000,0.000000,0.000000,0.0000,3.0,6.0,0.000000
31495,12631253,good,dev,1.0,1.0,17.0,17.0,17.0,17.0,3.0,...,78.333333,75.000000,85.000000,0.000000,0.000000,0.000000,0.0000,4.0,16.0,0.000000
31496,10898612,good,valid,1.0,1.0,14.0,14.0,14.0,14.0,0.0,...,107.082979,86.617275,128.735768,43.940013,15.567802,4.295324,30.8417,10.0,1.0,0.000000


## Target values

In [114]:
# Encode the label as an integer: "good" -> 1, every other value -> 0.
# The guard keeps the cell safe to re-run: once the column is already integer
# coded, comparing it to "good" again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(cleaned_df["target"]):
    cleaned_df["target"] = cleaned_df["target"].eq("good").astype("int64")
cleaned_df

Unnamed: 0,id,target,sample_type,continuous_0,categorical_0,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,...,continuous_660,continuous_661,continuous_662,continuous_663,continuous_664,continuous_665,continuous_666,categorical_15,categorical_16,continuous_667
0,14537510,1,dev,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,315.000000,315.000000,315.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.024846
1,12527950,1,dev,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,155.000000,155.000000,155.000000,0.000000,0.000000,0.000000,0.0000,0.0,1.0,0.000000
2,12277478,1,dev,1.0,1.0,1.0,1.0,1.0,2.0,0.0,...,163.125000,162.150000,164.100000,0.000000,0.000000,0.000000,0.0000,1.0,1.0,0.024846
3,13362203,1,dev,1.0,1.0,2.0,2.0,2.0,3.0,0.0,...,107.226666,75.000000,153.680000,0.000000,0.000000,0.000000,0.0000,2.0,2.0,0.000000
4,14036057,0,dev,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,157.150000,155.300000,158.450000,138.000000,46.000000,0.000000,120.0000,3.0,1.0,0.024846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31493,11486651,1,dev,1.0,1.0,13.0,13.0,13.0,13.0,0.0,...,107.082979,86.617275,128.735768,43.940013,15.567802,4.295324,30.8417,7.0,1.0,0.000000
31494,13808107,1,dev,1.0,4.0,4.0,4.0,4.0,4.0,0.0,...,140.000000,140.000000,140.000000,0.000000,0.000000,0.000000,0.0000,3.0,6.0,0.000000
31495,12631253,1,dev,1.0,1.0,17.0,17.0,17.0,17.0,3.0,...,78.333333,75.000000,85.000000,0.000000,0.000000,0.000000,0.0000,4.0,16.0,0.000000
31496,10898612,1,valid,1.0,1.0,14.0,14.0,14.0,14.0,0.0,...,107.082979,86.617275,128.735768,43.940013,15.567802,4.295324,30.8417,10.0,1.0,0.000000


## Correlation cleaning

In [115]:
# Correlation of every remaining column with the label.
# `corrwith` correlates each column against `target` only, instead of building
# the full column-by-column matrix with `corr()` just to keep one column of it.
# The result is a one-column frame named "target", indexed by column label
# (the `target` row itself is included).
corr_df = cleaned_df.drop(columns=["sample_type", "id"])
target_correlation = corr_df.corrwith(corr_df["target"]).to_frame(name="target")

In [116]:
# Columns whose absolute correlation with the target is at most 0.05.
weakly_correlated = target_correlation["target"].abs().le(0.05)
uncorrelated_cols = target_correlation.loc[weakly_correlated].index
uncorrelated_cols

Index(['continuous_0', 'categorical_0', 'categorical_1', 'categorical_2',
       'categorical_3', 'categorical_4', 'categorical_5', 'categorical_6',
       'categorical_7', 'categorical_8',
       ...
       'continuous_658', 'continuous_660', 'continuous_662', 'continuous_663',
       'continuous_664', 'continuous_665', 'continuous_666', 'categorical_15',
       'categorical_16', 'continuous_667'],
      dtype='object', length=607)

In [117]:
# NOTE(review): this drop is disabled, so the columns in `uncorrelated_cols`
# remain in `cleaned_df` for the cells that follow.
# cleaned_df = cleaned_df.drop(uncorrelated_cols, axis=1)
# cleaned_df

In [118]:
# One-hot encode every column whose label starts with "categorical".
# NOTE(review): the generated indicator columns keep that prefix, so running
# this cell a second time on the same `cleaned_df` would encode them again.
cat_cols = [name for name in cleaned_df.columns if name.startswith("categorical")]
cleaned_df = pd.get_dummies(data=cleaned_df, columns=cat_cols, dtype=int)

In [119]:
# Rows tagged "dev" are written out as the training file.
# to_csv keeps its default index=True, so the row index becomes the first column.
is_dev = cleaned_df["sample_type"].eq("dev")
dev_df = cleaned_df.loc[is_dev]
dev_df.to_csv("../datasets/train.csv")
dev_df

Unnamed: 0,id,target,sample_type,continuous_0,continuous_1,continuous_2,continuous_3,continuous_4,continuous_5,continuous_6,...,categorical_16_14.0,categorical_16_15.0,categorical_16_16.0,categorical_16_17.0,categorical_16_18.0,categorical_16_19.0,categorical_16_20.0,categorical_16_21.0,categorical_16_22.0,categorical_16_23.0
0,14537510,1,dev,1.0,123059.0,1.000000,0.369565,0.358696,0.282609,0.141304,...,0,0,0,0,0,0,0,0,0,0
1,12527950,1,dev,1.0,10499.0,0.836957,0.032609,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
2,12277478,1,dev,1.0,537129.0,0.967391,0.315217,0.271739,0.260870,0.250000,...,0,0,0,0,0,0,0,0,0,0
3,13362203,1,dev,1.0,412431.0,0.932584,0.876404,0.258427,0.089888,0.089888,...,0,0,0,0,0,0,0,0,0,0
4,14036057,0,dev,1.0,54264.0,0.956044,0.681319,0.670330,0.593407,0.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31489,14120894,1,dev,1.0,373593.0,0.988506,0.287356,0.287356,0.252874,0.103448,...,0,0,0,0,0,0,0,0,0,0
31490,13466688,1,dev,1.0,316013.0,0.967033,0.274725,0.065934,0.021978,0.010989,...,0,0,0,0,0,0,0,0,0,0
31493,11486651,1,dev,1.0,8594.0,0.804348,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
31494,13808107,1,dev,1.0,415396.0,0.978022,0.208791,0.076923,0.054945,0.043956,...,0,0,0,0,0,0,0,0,0,0


In [120]:
# Rows tagged "valid" are written out as the validation file.
# to_csv keeps its default index=True, so the row index becomes the first column.
is_valid = cleaned_df["sample_type"].eq("valid")
valid_df = cleaned_df.loc[is_valid]
valid_df.to_csv("../datasets/valid.csv")
valid_df

Unnamed: 0,id,target,sample_type,continuous_0,continuous_1,continuous_2,continuous_3,continuous_4,continuous_5,continuous_6,...,categorical_16_14.0,categorical_16_15.0,categorical_16_16.0,categorical_16_17.0,categorical_16_18.0,categorical_16_19.0,categorical_16_20.0,categorical_16_21.0,categorical_16_22.0,categorical_16_23.0
8,10467718,1,valid,1.0,26836.0,0.439560,0.186813,0.054945,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
10,9901053,1,valid,1.0,90058.0,0.983871,0.048387,0.032258,0.032258,0.000000,...,0,0,0,0,0,0,0,0,0,0
11,10722270,1,valid,1.0,16895.0,1.000000,0.263736,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
15,12267487,1,valid,1.0,73040.0,0.869565,0.076087,0.054348,0.021739,0.000000,...,0,0,0,0,0,0,0,0,0,0
18,11184135,1,valid,1.0,449891.0,0.975000,0.450000,0.187500,0.062500,0.037500,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31474,12190030,1,valid,1.0,70633.0,1.000000,0.307692,0.274725,0.087912,0.000000,...,0,0,0,0,0,0,0,0,0,0
31487,12898463,1,valid,1.0,21025.0,0.944444,0.044444,0.022222,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
31488,11306269,1,valid,1.0,44592.0,0.842697,0.561798,0.134831,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
31491,11642886,1,valid,1.0,31648.0,0.891304,0.065217,0.043478,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [121]:
# Rows tagged "test" are written out as the test file.
# to_csv keeps its default index=True, so the row index becomes the first column.
is_test = cleaned_df["sample_type"].eq("test")
test_df = cleaned_df.loc[is_test]
test_df.to_csv("../datasets/test.csv")
test_df

Unnamed: 0,id,target,sample_type,continuous_0,continuous_1,continuous_2,continuous_3,continuous_4,continuous_5,continuous_6,...,categorical_16_14.0,categorical_16_15.0,categorical_16_16.0,categorical_16_17.0,categorical_16_18.0,categorical_16_19.0,categorical_16_20.0,categorical_16_21.0,categorical_16_22.0,categorical_16_23.0
7,15779547,1,test,1.0,493761.0,0.967391,0.206522,0.076087,0.076087,0.076087,...,0,0,0,0,0,0,0,0,0,0
12,16235168,1,test,1.0,131795.0,0.813187,0.054945,0.054945,0.043956,0.032967,...,0,0,0,0,0,0,0,0,0,0
13,15549746,1,test,1.0,265988.0,0.815217,0.086957,0.054348,0.054348,0.043478,...,0,0,0,0,0,0,0,0,0,0
14,15240313,1,test,1.0,36376.0,0.945055,0.736264,0.241758,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
23,16423636,1,test,1.0,7068.0,0.516484,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31481,15212285,0,test,1.0,7028.0,0.989011,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
31482,15278736,1,test,1.0,21141.0,0.805195,0.181818,0.012987,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
31484,16032766,1,test,1.0,89088.0,0.978022,0.054945,0.054945,0.054945,0.000000,...,0,0,0,0,0,0,0,0,0,0
31492,17047712,0,test,1.0,31135.0,1.000000,0.164835,0.065934,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
