# Introduction
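This notebook walks through an end-to-end logistic regression classification workflow on CSV data: loading the training and test files, imputing missing numeric values, encoding categorical features and scaling numeric ones, training and validating the model, and exporting the test-set predictions as a submission CSV file. The column names `'target'` and `'Id'` used in the code are placeholders; replace them with the actual target and ID column names of your dataset before running.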

## [data_loading]

In [None]:
import pandas as pd
import numpy as np

# Read the training and test sets from CSV files given by relative paths
train, test = (pd.read_csv(csv_path) for csv_path in ('hacktrain.csv', 'hacktest.csv'))

# Report the shape of each frame as a quick sanity check
for label, frame in (('Train', train), ('Test', test)):
    print(f'{label} shape: {frame.shape}')

## [missing_value_handling]

In [None]:
from sklearn.impute import SimpleImputer

# Inspect missing values
print(train.isnull().sum())

# Numeric columns that exist in BOTH frames. Taking every numeric column of
# `train` would also pick up a numeric target, which `test` does not have:
# `test[num_cols]` would then raise a KeyError and the label itself would be
# mean-imputed. Filtering on `test.columns` excludes it without needing to
# know the target's name.
num_cols = [
    col
    for col in train.select_dtypes(include=[np.number]).columns
    if col in test.columns
]

# Mean imputation for numerical features: the means are learned on train only
# and reused on test so that no test statistics leak into preprocessing.
imputer = SimpleImputer(strategy='mean')
if num_cols:  # the imputer cannot be fitted on zero columns
    train[num_cols] = imputer.fit_transform(train[num_cols])
    test[num_cols] = imputer.transform(test[num_cols])

print('Missing values after imputation:')
print(train[num_cols].isnull().sum())

## [feature_engineering]

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Separate features and target
y = train['target']  # replace 'target' with actual target column name
# The row identifier is dropped together with the target: it is not a real
# feature and would otherwise be scaled and fed to the model.
# errors='ignore' tolerates a training file that has no ID column.
X = train.drop(columns=['target', 'Id'], errors='ignore')  # replace 'Id' with actual ID column name

# Give the test matrix exactly the training feature columns, in the same
# order; a column missing from `test` fails here with a clear KeyError
# instead of later inside `predict`.
X_test = test[X.columns].copy()

# Identify categorical columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Encode categoricals with integer codes learned on the training data
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    # LabelEncoder.transform raises ValueError on labels it did not see during
    # fit, so map the test values through the learned classes instead and send
    # unseen categories to the sentinel code -1.
    codes = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].astype(str).map(codes).fillna(-1).astype(int)

# Scale numerical features. Only columns that are still model features are
# scaled, so a target or ID left in `num_cols` cannot cause a KeyError.
scale_cols = [col for col in num_cols if col in X.columns]
scaler = StandardScaler()
if scale_cols:  # the scaler cannot be fitted on zero columns
    X[scale_cols] = scaler.fit_transform(X[scale_cols])
    X_test[scale_cols] = scaler.transform(X_test[scale_cols])

## [model_training]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the training rows for validation (seeded split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the logistic regression classifier and fit it on the training part
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')

## [predictions]

In [None]:
# Refit the same estimator on all training rows, then label the test rows
# (`fit` returns the estimator itself, so the calls can be chained)
preds = model.fit(X, y).predict(X_test)

## [submission]

In [None]:
# Output filename, shared by the write and the confirmation message
submission_path = 'submission (1).csv'

# Pair each test ID with its predicted label
submission = test[['Id']].assign(target=preds)  # replace 'Id' with actual test ID column name

# Write the file without the DataFrame index, then report where it went
submission.to_csv(submission_path, index=False)
print(f'Submission saved to {submission_path}')

## [final_checks]

In [None]:
import os

# Final check: list the files in the working directory
print(os.listdir('.'))

# A listing alone confirms nothing, so verify the submission file explicitly
# and fail loudly if it was not written.
assert os.path.isfile('submission (1).csv'), 'submission (1).csv was not created'