In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_parquet("/content/drive/MyDrive/train.parquet")

In [3]:
datac = df.copy()

In [4]:
df = df.drop_duplicates().reset_index(drop=True)

In [5]:
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df['dayofweek'] = df['Date'].dt.dayofweek
df['weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)

In [6]:
df["weekend"].value_counts()

Unnamed: 0_level_0,count
weekend,Unnamed: 1_level_1
0,1128293
1,453816


In [7]:
#IQR method (capping)
for col in ['X1', 'X2', 'X3', 'X4', 'X5']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    df[col] = df[col].clip(lower, upper)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.combine import SMOTETomek
import pandas as pd
from collections import Counter
X = df.drop(["target","Date"], axis=1)
y = df["target"]

# Step 1: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

In [9]:
X_train.dtypes


Unnamed: 0,0
X1,float64
X2,float64
X3,float64
X4,float64
X5,float64
year,int32
month,int32
day,int32
dayofweek,int32
weekend,int64


In [10]:
X_train.dtypes

Unnamed: 0,0
X1,float64
X2,float64
X3,float64
X4,float64
X5,float64
year,int32
month,int32
day,int32
dayofweek,int32
weekend,int64


In [11]:
X_test

Unnamed: 0,X1,X2,X3,X4,X5,year,month,day,dayofweek,weekend
911904,1.263644,5.506956,1.0,2.718282,0.000000,2023,3,25,5,1
1083782,1.000000,5.493473,1.0,1.000000,0.693147,2023,8,29,1,0
903063,1.271249,5.502563,1.0,1.000000,2.944439,2023,3,17,4,0
1376070,1.197217,5.485958,1.0,1.000000,0.000000,2024,5,18,5,1
1234539,1.402141,5.489516,1.0,1.000000,2.944439,2024,9,1,6,1
...,...,...,...,...,...,...,...,...,...,...
438839,1.000000,5.502767,1.0,1.000000,2.944439,2022,1,23,6,1
377983,1.074655,5.466878,1.0,1.000000,0.000000,2021,11,29,0,0
1037969,1.051271,5.487904,1.0,1.000000,2.944439,2023,7,18,1,0
259492,1.271249,5.458010,1.0,2.718282,2.890372,2021,8,13,4,0


In [12]:
from sklearn.preprocessing import RobustScaler

# RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Apply SMOTETomek

print("Before resampling:", Counter(y_train))

smote_tomek = SMOTETomek(random_state=42, n_jobs = -1)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train_scaled, y_train.astype(int))

print("After resampling:", Counter(y_resampled))


Before resampling: Counter({'0': 1254461, '1': 11226})
After resampling: Counter({0: 1253377, 1: 1253377})


In [14]:
import keras
from keras.models import Sequential
from keras.layers import Dense, InputLayer

In [23]:
model = Sequential()
model.add(InputLayer(shape=(10,)))
model.add(Dense(64, activation='tanh'))
model.add(Dense(32, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_resampled, y_resampled, epochs=20, batch_size=32, validation_split=0.2)


Epoch 1/20
[1m62669/62669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 3ms/step - accuracy: 0.8797 - loss: 0.2787 - val_accuracy: 0.9073 - val_loss: 0.2440
Epoch 2/20
[1m62669/62669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 3ms/step - accuracy: 0.9094 - loss: 0.2209 - val_accuracy: 0.9066 - val_loss: 0.2512
Epoch 3/20
[1m62669/62669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 4ms/step - accuracy: 0.9151 - loss: 0.2093 - val_accuracy: 0.8896 - val_loss: 0.2878
Epoch 4/20
[1m62669/62669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 3ms/step - accuracy: 0.9215 - loss: 0.1971 - val_accuracy: 0.9235 - val_loss: 0.2110
Epoch 5/20
[1m62669/62669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 3ms/step - accuracy: 0.9256 - loss: 0.1890 - val_accuracy: 0.9046 - val_loss: 0.2604
Epoch 6/20
[1m62669/62669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 3ms/step - accuracy: 0.9291 - loss: 0.1828 - val_accuracy: 0.9212 - val_loss:

<keras.src.callbacks.history.History at 0x7d60a8c82550>

In [31]:
d=pd.read_parquet('/content/drive/MyDrive/test.parquet')

In [32]:
d.iloc[:,1:]

Unnamed: 0,Date,X1,X2,X3,X4,X5
0,2020-12-16,1.685395,5.463917,1.0,7.389056,2.890372
1,2020-12-16,1.488844,5.454936,1.0,7.389056,2.890372
2,2020-12-16,1.164160,5.471136,1.0,1.000000,2.890372
3,2020-12-16,1.000000,5.467385,1.0,1.000000,2.890372
4,2020-12-16,1.000000,5.453995,1.0,1.000000,2.833213
...,...,...,...,...,...,...
409851,2024-11-26,1.120752,5.483053,1.0,2.718282,0.000000
409852,2024-11-26,1.096365,5.477802,1.0,7.389056,0.000000
409853,2024-11-26,1.111822,5.479972,1.0,2.718282,0.000000
409854,2024-11-26,1.000000,5.486165,1.0,1.000000,0.000000


In [35]:
y_pred = model.predict(d.iloc[:,2:])

[1m12808/12808[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 4ms/step


In [36]:
q = pd.DataFrame(y_pred,columns=['target'])

In [37]:
q['ID']=d.iloc[:,0]

In [39]:
q[['ID','target']]

Unnamed: 0,ID,target
0,0,7.831355e-06
1,1,1.074248e-05
2,2,2.840235e-05
3,3,2.840514e-05
4,4,2.840538e-05
...,...,...
409851,409851,3.600152e-07
409852,409852,3.599974e-07
409853,409853,3.600152e-07
409854,409854,3.601677e-07


In [40]:
y_pred

array([[7.8313551e-06],
       [1.0742479e-05],
       [2.8402352e-05],
       ...,
       [3.6001518e-07],
       [3.6016769e-07],
       [3.6016769e-07]], dtype=float32)

In [43]:
for i in y_pred:
  if i>=0.5:
    print(1)
  else:
    print(0)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0