In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train_df = pd.read_csv('kaggle-tabular-playground-series-aug-2022/train.csv')
test_df = pd.read_csv('kaggle-tabular-playground-series-aug-2022/test.csv')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer


# Prepare the data
X = train_df.drop(['id', 'failure'], axis=1)
y = train_df['failure']

# Define categorical variables
categorical_features = ['product_code', 'attribute_0', 'attribute_1']

# Handle categorical variables with one-hot encoding
X = pd.get_dummies(X, columns=categorical_features)

# Define numeric features (all columns that are not categorical)
numeric_features = [col for col in X.columns if not any(col.startswith(cat_feat) for cat_feat in categorical_features)]

# Impute numeric features with median
numeric_imputer = SimpleImputer(strategy='median')
X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])

# No need to impute categorical features as they've been one-hot encoded

print(f"Shape of X after imputation: {X.shape}")
print(f"Shape of y: {y.shape}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Score the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Prepare the final validation data (test_df)
X_final = test_df.drop('id', axis=1)
X_final = pd.get_dummies(X_final, columns=categorical_features)

# Impute missing values in the final validation data
# Impute numeric features with median from the training data to avoid data leakage
X_final.columns
X_final[numeric_features] = numeric_imputer.transform(X_final[numeric_features])

print(f"Shape of X_final: {X_final.shape}")

# Ensure X_final has the same columns as X_train
for col in X_train.columns:
    if col not in X_final.columns:
        X_final[col] = 0
X_final = X_final[X_train.columns]

# Make predictions on the final validation data
final_predictions = model.predict(X_final)

# Create a submission dataframe
submission = pd.DataFrame({
    'id': test_df['id'],
    'failure': final_predictions
})

# Save the submission file
submission.to_csv('logistic_regression_submission.csv', index=False)
print("\nSubmission file created: logistic_regression_submission.csv")


Shape of X after imputation: (26570, 31)
Shape of y: (26570,)
Accuracy: 0.8007

Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      4257
           1       0.40      0.00      0.01      1057

    accuracy                           0.80      5314
   macro avg       0.60      0.50      0.45      5314
weighted avg       0.72      0.80      0.71      5314

Shape of X_final: (20775, 30)

Submission file created: logistic_regression_submission.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
