In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bitblenders-007/test_data.parquet
/kaggle/input/bitblenders-007/add_event.parquet
/kaggle/input/bitblenders-007/685404e30cfdb_submission_template.csv
/kaggle/input/bitblenders-007/data_dictionary.csv
/kaggle/input/bitblenders-007/offer_metadata.parquet
/kaggle/input/bitblenders-007/add_trans.parquet
/kaggle/input/bitblenders-007/train_data.parquet


In [29]:
import pandas as pd
import numpy as np
import re

# 1. Load test data
df_test = pd.read_parquet("/kaggle/input/bitblenders-007/test_data.parquet")
print("Original test shape:", df_test.shape)

# 2. Replace common string "nulls"/"errors"
df_test.replace(['None', 'none', 'NaN', 'nan', '', 'INF', 'inf', '-inf'], np.nan, inplace=True)

# 3. Drop constant columns
nunique = df_test.nunique(dropna=True)
constant_cols = nunique[nunique <= 1].index.tolist()
df_test.drop(columns=constant_cols, inplace=True)
print("Dropped constant columns:", constant_cols)

# 4. Remove unwanted characters (/, \, %, ,) from object columns
def clean_special_chars(df, chars=r"[,/\\%]"):
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str).apply(lambda x: re.sub(chars, '', x) if pd.notna(x) else x)
    return df

df_test = clean_special_chars(df_test)

# 5. Capping outliers (IQR method) on numeric `f` columns
def cap_outliers(df, cols, lower_q=0.01, upper_q=0.99):
    for col in cols:
        if df[col].nunique() <= 1:
            continue
        try:
            lower = df[col].quantile(lower_q)
            upper = df[col].quantile(upper_q)
            df[col] = df[col].clip(lower, upper)
        except:
            print(f"Skipping non-numeric column: {col}")
    return df

# Filter safe numeric f columns only
feature_cols = [col for col in df_test.columns if col.startswith('f') and pd.api.types.is_numeric_dtype(df_test[col])]
df_test = cap_outliers(df_test, feature_cols)

# 6. Fill NaNs with median for numeric columns
df_test.fillna(df_test.median(numeric_only=True), inplace=True)

# 7. Downcast floats (to reduce RAM)
for col in feature_cols:
    df_test[col] = pd.to_numeric(df_test[col], downcast='float')

# 8. (Optional) Convert datetime cols if any
date_cols = ['id4', 'id5']  # Replace with your actual datetime column names
for col in date_cols:
    if col in df_test.columns:
        df_test[col] = pd.to_datetime(df_test[col], errors='coerce')


# 9. Save cleaned version
df_test.to_parquet("/kaggle/working/test_data_cleaned.parquet", index=False)
print("✅ Test data cleaned and saved.")
print("Final test shape:", df_test.shape)


Original test shape: (369301, 371)
Dropped constant columns: ['f14', 'f15', 'f16', 'f19', 'f20', 'f21', 'f23', 'f24', 'f25', 'f62', 'f66', 'f71', 'f80', 'f88', 'f102', 'f112', 'f122', 'f128', 'f129', 'f135', 'f136', 'f144', 'f145', 'f205', 'f226', 'f229', 'f236', 'f238', 'f240', 'f243', 'f245', 'f246', 'f248', 'f249', 'f258', 'f259', 'f260', 'f262', 'f266', 'f267', 'f268', 'f270', 'f271', 'f277', 'f279', 'f281', 'f286', 'f287', 'f290', 'f291', 'f294', 'f295', 'f298', 'f300', 'f301', 'f303', 'f304', 'f307', 'f308', 'f309', 'f334', 'f335', 'f360']
✅ Test data cleaned and saved.
Final test shape: (369301, 308)


In [30]:
df_test.head()


Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f356,f357,f358,f359,f361,f362,f363,f364,f365,f366
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,0.0014331347664681,0.0027807272790246,0.0465999838670646,0.0,1.0,0.0,0.0,56.0,0.0,0.0
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,,9.0,,,,...,-0.0039733219809848,0.0014287946777398,0.0603093514970909,0.0,195.0,13.0,0.0666666666666666,,,
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,-0.0408242612752721,-0.0174961119751166,0.0734836702954899,0.0,155.0,67.0,0.432258064516129,1142.0,436.0,0.3817863397548161
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,,,,,,...,-0.0003337454772072,0.0013159476327096,0.040572039549215,0.0,,,,,,
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,,,,,,...,0.0008831864677628,0.0020541699799278,0.038243539079214,0.0,29.0,2.0,0.0689655172413793,361.0,3.0,0.0083102493074792


In [31]:
# Replace common null-like string values with np.nan
df_test.replace(
    to_replace=['None', 'none', 'NaN', 'nan', '', 'INF', 'inf', '-inf', 'NULL', 'null'],
    value=np.nan,
    inplace=True
)


In [32]:
df_test.isin(['None', 'none', 'NaN', 'nan', '']).sum().sum()


0

In [33]:
df_test.head(100)


Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f356,f357,f358,f359,f361,f362,f363,f364,f365,f366
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,0.0014331347664681,0.0027807272790246,0.0465999838670646,0.0,1.0,0.0,0.0,56.0,0.0,0.0
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,,9.0,,,,...,-0.0039733219809848,0.0014287946777398,0.0603093514970909,0.0,195.0,13.0,0.0666666666666666,,,
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,-0.0408242612752721,-0.0174961119751166,0.0734836702954899,0.0,155.0,67.0,0.432258064516129,1142.0,436.0,0.3817863397548161
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,,,,,,...,-0.0003337454772072,0.0013159476327096,0.040572039549215,0.0,,,,,,
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,,,,,,...,0.0008831864677628,0.0020541699799278,0.038243539079214,0.0,29.0,2.0,0.0689655172413793,361.0,3.0,0.0083102493074792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148193,1432828_664919_16-23_2023-11-04 12:56:08.718,1432828,664919,2023-11-04 12:56:08.718000,2023-11-04,,,,,28.0,...,-0.0012683071305544,0.0012268793046597,0.0420464824746967,0.0,222.0,0.0,0.0,244.0,0.0,0.0
44887,1589486_404258283_16-23_2023-11-04 06:44:57.811,1589486,404258283,2023-11-04 06:44:57.811000,2023-11-04,,,,,22.0,...,-0.0278207996328801,-0.0119231998426629,0.0500774393391843,0.0,758.0,0.0,0.0,705.0,1.0,0.001418439716312
180505,1283337_13365_16-23_2023-11-05 10:43:45.573,1283337,13365,2023-11-05 10:43:45.573000,2023-11-05,,26.0,,,18.0,...,0.0015712085515832,0.0035444525212801,0.060755317938801,0.0,99.0,2.0,0.0202020202020202,57.0,1.0,0.0175438596491228
166765,1122755_88356_16-23_2023-11-05 15:26:57.772,1122755,88356,2023-11-05 15:26:57.772000,2023-11-05,,,,,44.0,...,-0.0058973225850692,-0.0007146723060302,0.049946690724186,0.0,239.0,0.0,0.0,106.0,0.0,0.0


In [34]:
# Fill numeric NaNs with column-wise median
df_test.fillna(df_test.median(numeric_only=True), inplace=True)


In [35]:
df_test.head(40)

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f356,f357,f358,f359,f361,f362,f363,f364,f365,f366
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,0.0014331347664681,0.0027807272790246,0.0465999838670646,0.0,1.0,0.0,0.0,56.0,0.0,0.0
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,,9.0,,,,...,-0.0039733219809848,0.0014287946777398,0.0603093514970909,0.0,195.0,13.0,0.0666666666666666,,,
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,-0.0408242612752721,-0.0174961119751166,0.0734836702954899,0.0,155.0,67.0,0.432258064516129,1142.0,436.0,0.3817863397548161
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,,,,,,...,-0.0003337454772072,0.0013159476327096,0.040572039549215,0.0,,,,,,
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,,,,,,...,0.0008831864677628,0.0020541699799278,0.038243539079214,0.0,29.0,2.0,0.0689655172413793,361.0,3.0,0.0083102493074792
77784,1750220_98194407_16-23_2023-11-04 05:46:54.823,1750220,98194407,2023-11-04 05:46:54.823000,2023-11-04,,,,,44.0,...,-0.0294034969359644,-0.0126014986868419,0.052926294484736,0.0,1.0,0.0,0.0,65.0,0.0,0.0
175314,1406615_18473108_16-23_2023-11-04 21:39:43.485,1406615,18473108,2023-11-04 21:39:43.485000,2023-11-04,,,42.0,,21.0,...,-0.0284995318352059,-0.0122140850722311,0.0512991573033707,0.0,4.0,0.0,0.0,53.0,1.0,0.0188679245283018
22541,1354131_88148_16-23_2023-11-05 16:57:56.684,1354131,88148,2023-11-05 16:57:56.684000,2023-11-05,,,,,,...,-0.0091175445805463,-0.000749098584511,0.0676952741623923,0.0,,,,271.0,0.0,0.0
122552,1457351_399752_16-23_2023-11-04 06:00:04.106,1457351,399752,2023-11-04 06:00:04.106000,2023-11-04,,9.0,,,43.0,...,0.0010453623290916,0.0023486019768627,0.0566570195248099,0.0,517.0,36.0,0.06963249516441,805.0,67.0,0.0832298136645962
95726,1461171_45856_16-23_2023-11-04 15:08:26.802,1461171,45856,2023-11-04 15:08:26.802000,2023-11-04,,,,,6.0,...,-0.0050033771887475,0.0021436740257646,0.0817374834831036,0.0,5.0,0.0,0.0,80.0,0.0,0.0


In [36]:
# Total missing values
print("Total NaNs remaining:", df_test.isna().sum().sum())

# Top 20 columns with most NaNs
missing = df_test.isna().sum().sort_values(ascending=False)
print(missing.head(20))


Total NaNs remaining: 21896548
f120    369273
f34     368928
f18     368853
f17     368853
f13     368853
f84     368806
f37     368700
f221    365356
f189    362788
f176    359443
f154    359443
f92     358606
f70     358606
f64     358606
f33     358263
f220    356539
f36     352514
f79     352299
f118    350117
f114    345516
dtype: int64


In [37]:
# Drop columns with more than 99% missing values
threshold = 0.99
missing_ratio = df_test.isna().mean()
cols_to_drop = missing_ratio[missing_ratio > threshold].index

df_test.drop(columns=cols_to_drop, inplace=True)

print(f"✅ Dropped {len(cols_to_drop)} mostly-empty columns:", list(cols_to_drop))


✅ Dropped 7 mostly-empty columns: ['f13', 'f17', 'f18', 'f34', 'f37', 'f84', 'f120']


In [38]:
# Fill numeric columns with median
numeric_cols = df_test.select_dtypes(include='number').columns
df_test[numeric_cols] = df_test[numeric_cols].fillna(df_test[numeric_cols].median())

# Fill object columns with 'missing'
object_cols = df_test.select_dtypes(include='object').columns
df_test[object_cols] = df_test[object_cols].fillna('missing')

# Fill datetime columns using forward fill
datetime_cols = df_test.select_dtypes(include='datetime').columns
df_test[datetime_cols] = df_test[datetime_cols].fillna(method='ffill')


  df_test[datetime_cols] = df_test[datetime_cols].fillna(method='ffill')


In [39]:
print("✅ Final missing value count:", df_test.isna().sum().sum())


✅ Final missing value count: 0


In [40]:
df_test.head(40)

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f356,f357,f358,f359,f361,f362,f363,f364,f365,f366
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,missing,missing,missing,missing,missing,...,0.0014331347664681,0.0027807272790246,0.0465999838670646,0.0,1.0,0.0,0.0,56.0,0.0,0.0
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,missing,9.0,missing,missing,missing,...,-0.0039733219809848,0.0014287946777398,0.0603093514970909,0.0,195.0,13.0,0.0666666666666666,missing,missing,missing
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,missing,missing,missing,missing,22.0,...,-0.0408242612752721,-0.0174961119751166,0.0734836702954899,0.0,155.0,67.0,0.432258064516129,1142.0,436.0,0.3817863397548161
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,missing,missing,missing,missing,missing,...,-0.0003337454772072,0.0013159476327096,0.040572039549215,0.0,missing,missing,missing,missing,missing,missing
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,missing,missing,missing,missing,missing,...,0.0008831864677628,0.0020541699799278,0.038243539079214,0.0,29.0,2.0,0.0689655172413793,361.0,3.0,0.0083102493074792
77784,1750220_98194407_16-23_2023-11-04 05:46:54.823,1750220,98194407,2023-11-04 05:46:54.823000,2023-11-04,missing,missing,missing,missing,44.0,...,-0.0294034969359644,-0.0126014986868419,0.052926294484736,0.0,1.0,0.0,0.0,65.0,0.0,0.0
175314,1406615_18473108_16-23_2023-11-04 21:39:43.485,1406615,18473108,2023-11-04 21:39:43.485000,2023-11-04,missing,missing,42.0,missing,21.0,...,-0.0284995318352059,-0.0122140850722311,0.0512991573033707,0.0,4.0,0.0,0.0,53.0,1.0,0.0188679245283018
22541,1354131_88148_16-23_2023-11-05 16:57:56.684,1354131,88148,2023-11-05 16:57:56.684000,2023-11-05,missing,missing,missing,missing,missing,...,-0.0091175445805463,-0.000749098584511,0.0676952741623923,0.0,missing,missing,missing,271.0,0.0,0.0
122552,1457351_399752_16-23_2023-11-04 06:00:04.106,1457351,399752,2023-11-04 06:00:04.106000,2023-11-04,missing,9.0,missing,missing,43.0,...,0.0010453623290916,0.0023486019768627,0.0566570195248099,0.0,517.0,36.0,0.06963249516441,805.0,67.0,0.0832298136645962
95726,1461171_45856_16-23_2023-11-04 15:08:26.802,1461171,45856,2023-11-04 15:08:26.802000,2023-11-04,missing,missing,missing,missing,6.0,...,-0.0050033771887475,0.0021436740257646,0.0817374834831036,0.0,5.0,0.0,0.0,80.0,0.0,0.0


In [41]:
import numpy as np

df_test.replace(
    to_replace=['missing', 'Missing', 'MISSING', 'None', 'none', 'NaN', 'nan', '', 'INF', 'inf', '-inf', 'NULL', 'null'],
    value=np.nan,
    inplace=True
)


In [42]:
# Total rows
total_rows = len(df_test)

# Count NaNs and calculate percentage per column
nan_stats = df_test.isna().sum().to_frame(name='NaN_Count')
nan_stats['NaN_Percent'] = (nan_stats['NaN_Count'] / total_rows) * 100

# Sort descending by percentage
nan_stats = nan_stats[nan_stats['NaN_Count'] > 0].sort_values('NaN_Percent', ascending=False)

# Display top 20
nan_stats.head(20)


Unnamed: 0,NaN_Count,NaN_Percent
f221,365356,98.931766
f189,362788,98.236398
f176,359443,97.330633
f154,359443,97.330633
f92,358606,97.103988
f70,358606,97.103988
f64,358606,97.103988
f33,358263,97.01111
f220,356539,96.544282
f36,352514,95.454385


In [43]:
# Drop columns with >90% missing values
cols_to_drop = nan_stats[nan_stats['NaN_Percent'] > 90].index.tolist()
print("Dropping these columns:", cols_to_drop)

df_test.drop(columns=cols_to_drop, inplace=True)


Dropping these columns: ['f221', 'f189', 'f176', 'f154', 'f92', 'f70', 'f64', 'f33', 'f220', 'f36', 'f79', 'f118', 'f114', 'f117', 'f81', 'f4']


In [44]:
print("✅ Final test shape:", df_test.shape)


✅ Final test shape: (369301, 285)


In [45]:
# Fill numeric columns with median
numeric_cols = df_test.select_dtypes(include='number').columns
df_test[numeric_cols] = df_test[numeric_cols].fillna(df_test[numeric_cols].median())


In [46]:
print("✅ Total NaNs after imputation:", df_test.isna().sum().sum())


✅ Total NaNs after imputation: 13657700


In [47]:
nan_counts = df_test.isna().sum()
high_nan_cols = nan_counts[nan_counts > 100000]
print(high_nan_cols.sort_values(ascending=False))


f121    330021
f3      319554
f119    292787
f116    285607
f218    281726
f35     276249
f40     271610
f29     266584
f82     255814
f48     237645
f1      237422
f115    235431
f207    215504
f206    215504
f2      214285
f57     203283
f83     201386
f212    200896
f211    200896
f210    200896
f209    200896
f208    200896
f7      176785
f11     171966
f78     167096
f42     155498
f43     155498
f27     147640
f26     147640
f22     147640
f104    143648
f9      139140
f222    131394
f219    130423
f31     128771
f32     127139
f217    126631
f10     119899
f56     118987
f53     118987
f54     118987
f55     118987
f58     118987
f110    118496
f12     116307
f8      116271
f5      112287
f39     108399
f139    106476
f143    100509
f142    100509
f141    100509
f140    100509
dtype: int64


In [48]:
# Drop columns with more than 70% missing values
nan_percent = (df_test.isna().sum() / len(df_test)) * 100
cols_to_drop = nan_percent[nan_percent > 70].index.tolist()

print("Columns to drop (more than 70% NaN):", cols_to_drop)

# Drop them
df_test.drop(columns=cols_to_drop, inplace=True)

print("✅ New shape of test data:", df_test.shape)


Columns to drop (more than 70% NaN): ['f3', 'f29', 'f35', 'f40', 'f116', 'f119', 'f121', 'f218']
✅ New shape of test data: (369301, 277)


In [49]:
# Fill remaining numeric NaNs with median
num_cols = df_test.select_dtypes(include='number').columns
df_test[num_cols] = df_test[num_cols].fillna(df_test[num_cols].median())


In [50]:
# Fill missing numeric values with median
num_cols = df_test.select_dtypes(include='number').columns
df_test[num_cols] = df_test[num_cols].fillna(df_test[num_cols].median())

print("✅ Total NaNs after filling:", df_test.isna().sum().sum())


✅ Total NaNs after filling: 11333562


In [51]:
missing_cols = df_test.isna().sum()
missing_cols = missing_cols[missing_cols > 0].sort_values(ascending=False)

print("Remaining NaNs per column:")
print(missing_cols)


Remaining NaNs per column:
f82     255814
f48     237645
f1      237422
f115    235431
f206    215504
         ...  
f273         8
f269         8
f265         8
f264         8
f235         8
Length: 270, dtype: int64


In [55]:
df_test.head()

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f5,f6,f7,...,f356,f357,f358,f359,f361,f362,f363,f364,f365,f366
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,5.0,11.0,...,0.0014331347664681,0.0027807272790246,0.0465999838670646,0.0,1.0,0.0,0.0,56.0,0.0,0.0
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,,9.0,,,,...,-0.0039733219809848,0.0014287946777398,0.0603093514970909,0.0,195.0,13.0,0.0666666666666666,,,
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,22.0,5.0,11.0,...,-0.0408242612752721,-0.0174961119751166,0.0734836702954899,0.0,155.0,67.0,0.432258064516129,1142.0,436.0,0.3817863397548161
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,,,,,,...,-0.0003337454772072,0.0013159476327096,0.040572039549215,0.0,,,,,,
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,,,,,,...,0.0008831864677628,0.0020541699799278,0.038243539079214,0.0,29.0,2.0,0.0689655172413793,361.0,3.0,0.0083102493074792


In [57]:
df_test.tail()

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f5,f6,f7,...,f356,f357,f358,f359,f361,f362,f363,f364,f365,f366
1806,1874443_95537_16-23_2023-11-05 09:21:24.182,1874443,95537,2023-11-05 09:21:24.182,2023-11-05,,,,,,...,-9.962325459418146e-05,0.0011444262884863,0.0451847712288842,0.0,5.0,0.0,0.0,144.0,8.0,0.0555555555555555
127494,1541978_5718_16-23_2023-11-05 00:56:43.946,1541978,5718,2023-11-05 00:56:43.946,2023-11-05,23.0,,10.0,35.0,17.0,...,-0.0025950479909546,0.0010418624094475,0.0552689756816507,0.0,18.0,0.0,0.0,216.0,8.0,0.037037037037037
106947,1887841_85905_16-23_2023-11-05 20:40:43.312,1887841,85905,2023-11-05 20:40:43.312,2023-11-05,,,,5.0,,...,-0.0369808777693538,-0.0158489476154373,0.0792447380771868,0.0,41.0,1.0,0.024390243902439,569.0,1.0,0.0017574692442882
158372,1569367_944713_16-23_2023-11-05 00:43:04.335,1569367,944713,2023-11-05 00:43:04.335,2023-11-05,,,,18.0,,...,8.231582859141949e-05,0.0019767025731701,0.0457251973961485,0.0,173.0,18.0,0.1040462427745664,81.0,12.0,0.1481481481481481
74378,1086547_60142_16-23_2023-11-05 10:37:36.747,1086547,60142,2023-11-05 10:37:36.747,2023-11-05,,28.0,43.0,32.0,19.0,...,-0.0497166409067491,-0.0213071318171781,0.0894899536321483,0.0,1.0,0.0,0.0,,,


In [58]:
df_test.to_parquet("cleaned_test_data.parquet", index=False)