In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print each path on its own line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from IPython.display import display

# Read the train, test, and sample-submission parquet files
train_df = pd.read_parquet('/kaggle/input/celebal-ana-verse-j/train.parquet')
test_df = pd.read_parquet('/kaggle/input/celebal-ana-verse-j/test.parquet')
sample_submission = pd.read_parquet('/kaggle/input/celebal-ana-verse-j/sample_submission.parquet')

# Preview the first rows of each frame under its own heading
previews = (
    ("Train Data:", train_df),
    ("\nTest Data:", test_df),
    ("\nSample Submission:", sample_submission),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())


In [None]:
# Row/column counts of both frames
train_shape = train_df.shape
print("Train shape:", train_shape)
test_shape = test_df.shape
print("Test shape:", test_shape)

# Column names and the number of missing values per training column
train_columns = train_df.columns.tolist()
print("\nTrain columns:\n", train_columns)
missing_per_column = train_df.isnull().sum()
print("\nMissing values in train:\n", missing_per_column)

# How many training rows carry each target value
target_counts = train_df['target'].value_counts()
print("\nTarget distribution:\n", target_counts)


In [None]:
# Number of distinct values in every training column
unique_counts = train_df.nunique()
print(unique_counts)

# Summary statistics as produced by DataFrame.describe()
summary_stats = train_df.describe()
print(summary_stats)


In [None]:
# Parse 'Date' into datetimes, then expose its calendar parts as separate columns
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_date_parts = train_df['Date'].dt
train_df['Day'] = train_date_parts.day
train_df['Month'] = train_date_parts.month
train_df['Year'] = train_date_parts.year


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many training rows fall into each target class
class_count_axes = sns.countplot(x='target', data=train_df)
plt.title("Target Class Distribution")
plt.show()


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr = train_df.corr(numeric_only=True)
heatmap_size = (10, 6)
plt.figure(figsize=heatmap_size)
sns.heatmap(corr, cmap='coolwarm', annot=True)
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Distribution of feature X1 within each target class
x1_boxplot_axes = sns.boxplot(x='target', y='X1', data=train_df)
plt.title('X1 vs Target')
plt.show()

# Repeat for other features like X2, X3, etc.


In [None]:
# Add the day of the week of each row's (already parsed) date as its own column
train_dates = train_df['Date']
train_df['DayOfWeek'] = train_dates.dt.dayofweek


In [None]:
# Give the test frame the same date-derived columns the training frame has
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_date_parts = test_df['Date'].dt
test_df['Day'] = test_date_parts.day
test_df['Month'] = test_date_parts.month
test_df['Year'] = test_date_parts.year
test_df['DayOfWeek'] = test_date_parts.dayofweek


In [None]:
# Store the target as integers so the later mean/plot/model steps work on numeric labels
target_as_int = train_df['target'].astype(int)
train_df['target'] = target_as_int


In [None]:
# Mean of the target on each date, plotted as a time series
rows_by_date = train_df.groupby('Date')
target_by_date = rows_by_date['target'].mean()
target_by_date.plot(title="Daily Target Rate", figsize=(12, 6))


In [None]:
# Zoom in on the daily target rate from January through March 2024.
# NOTE(review): the window is hard-coded; presumably the training dates cover early 2024 — confirm.
first_quarter_rate = target_by_date['2024-01':'2024-03']
first_quarter_rate.plot(title="Target Rate (Jan-Mar 2024)")


In [None]:
# Smooth the daily rate with a rolling mean over 7 consecutive entries of the series
# (that equals 7 calendar days only if no dates are missing from the data).
smoothed_rate = target_by_date.rolling(window=7).mean()
smoothed_rate.plot(title="7-Day Rolling Average of Target Rate", figsize=(12, 6))


In [None]:
# Show the training frame's current column index
train_column_index = train_df.columns
print(train_column_index)


In [None]:
# Mean target for every distinct value of X1, sorted ascending and drawn as horizontal bars.
# NOTE(review): this draws one bar per unique X1 value; if X1 is continuous the chart
# may be unreadable — consider binning. Confirm X1's cardinality.
mean_target_by_x1 = train_df.groupby('X1')['target'].mean()
sorted_mean_target = mean_target_by_x1.sort_values()
sorted_mean_target.plot(kind='barh', title="Average Target by X1", figsize=(10, 6))


In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the raw date and the label.
# NOTE(review): if the training data also has an 'ID' column it stays in X here,
# while the submission cells drop 'ID' from the test data — confirm the column sets match.
non_feature_columns = ['Date', 'target']
X = train_df.drop(columns=non_feature_columns)
y = train_df['target']

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
# NOTE(review): rows are shuffled regardless of date; confirm a time-ordered split is not needed.
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Build and fit a random forest with 'balanced' class weights and a fixed seed
rf = RandomForestClassifier(random_state=42, class_weight='balanced').fit(X_train, y_train)

# Predict the held-out split
y_pred = rf.predict(X_test)

# Print each evaluation metric under its own heading
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pair every training feature with the importance the fitted forest assigned to it,
# ordered from most to least important
importances = rf.feature_importances_
importance_table = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feat_imp_df = importance_table.sort_values(by='Importance', ascending=False)

# One horizontal bar per feature
plt.figure(figsize=(10, 6))
sns.barplot(data=feat_imp_df, x='Importance', y='Feature')
plt.title('Feature Importances from Random Forest')
plt.tight_layout()
plt.show()


In [None]:
# STEP 1: Work on a copy so test_df itself is not modified by this cell
test_processed = test_df.copy()

# STEP 2: Derive the same date features that were added to the training data
test_processed['Date'] = pd.to_datetime(test_processed['Date'])
test_processed['Day'] = test_processed['Date'].dt.day
test_processed['Month'] = test_processed['Date'].dt.month
test_processed['Year'] = test_processed['Date'].dt.year
test_processed['DayOfWeek'] = test_processed['Date'].dt.dayofweek

# STEP 3: Keep exactly the columns the model was fitted on, in the same order.
# Selecting by X_train.columns (rather than dropping 'Date' and 'ID' by name) leaves
# 'ID' out only when training did not use it, and fails early with a readable message
# if the test data lacks a training feature.
missing_features = [col for col in X_train.columns if col not in test_processed.columns]
if missing_features:
    raise KeyError(f"Test data is missing features used in training: {missing_features}")
X_final_test = test_processed[X_train.columns]

# STEP 4: Predict using the trained RandomForest model
final_predictions = rf.predict(X_final_test)

# STEP 5: Prepare submission DataFrame
submission_df = pd.DataFrame({
    'ID': test_df['ID'],  # ID from original test data
    'target': final_predictions
})

# STEP 6: Save to CSV
submission_df.to_csv('submission.csv', index=False)


In [None]:
!pip install xgboost


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Calculate scale_pos_weight = (negative class count / positive class count).
# The classes are counted by label (0 = negative, 1 = positive) instead of unpacking
# value_counts(): value_counts() is ordered by frequency, so positional unpacking
# would swap the two counts whenever class 1 is the majority.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos


In [None]:
# Gradient-boosted classifier for the binary target.
# scale_pos_weight re-weights the positive class using the ratio computed earlier in
# the notebook, and eval_metric is set to 'logloss' explicitly.
# use_label_encoder is intentionally not passed: XGBoost deprecated the option and
# recent releases only warn that the parameter is unused.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)

xgb_clf.fit(X_train, y_train)


In [None]:
# Predict the held-out split with the XGBoost model
y_pred = xgb_clf.predict(X_test)

# Print each evaluation metric under its own heading
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))


In [None]:
# STEP 1: Work on a copy so test_df itself is not modified by this cell
test_processed = test_df.copy()

# STEP 2: Derive the same date features that were added to the training data
test_processed['Date'] = pd.to_datetime(test_processed['Date'])
test_processed['Day'] = test_processed['Date'].dt.day
test_processed['Month'] = test_processed['Date'].dt.month
test_processed['Year'] = test_processed['Date'].dt.year
test_processed['DayOfWeek'] = test_processed['Date'].dt.dayofweek

# STEP 3: Keep exactly the columns the model was fitted on, in the same order.
# Selecting by X_train.columns (rather than dropping 'Date' and 'ID' by name) leaves
# 'ID' out only when training did not use it, and fails early with a readable message
# if the test data lacks a training feature.
missing_features = [col for col in X_train.columns if col not in test_processed.columns]
if missing_features:
    raise KeyError(f"Test data is missing features used in training: {missing_features}")
X_final_test = test_processed[X_train.columns]

# STEP 4: Predict using the trained XGBoost model
final_predictions = xgb_clf.predict(X_final_test)

# STEP 5: Prepare submission DataFrame
submission_df = pd.DataFrame({
    'ID': test_df['ID'],  # ID from original test data
    'target': final_predictions
})

# STEP 6: Save to CSV
submission_df.to_csv('xgb_submission.csv', index=False)