# Don't Overfit! II

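Train a logistic regression model with LASSO (L1) regularization on the Don't Overfit! II dataset: compute row-wise feature means, select the input features, fit the model, write test-set predictions to a submission file, and save the fitted model with joblib for future predictions.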

Dataset: https://www.kaggle.com/competitions/dont-overfit-ii/data

Hugging Face: https://huggingface.co/spaces/alperugurcan/Dont-Overfit-II

In [11]:
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

# Read the competition's training and test tables from the Kaggle input mount.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/dont-overfit-ii/train.csv',
        '/kaggle/input/dont-overfit-ii/test.csv',
    ),
)

In [2]:
# Add the per-row mean of the feature columns as an extra column, 'mean_'.
# 'id' (and 'target' in the training table) are left out of the average, as
# before. A 'mean_' column left over from an earlier run of this cell is also
# left out, so re-running the cell does not fold the old mean into the new
# one; errors='ignore' makes that second drop a no-op on the first run.
train['mean_'] = (
    train.drop(columns=['id', 'target'])
    .drop(columns=['mean_'], errors='ignore')
    .mean(axis=1)
)
test['mean_'] = (
    test.drop(columns=['id'])
    .drop(columns=['mean_'], errors='ignore')
    .mean(axis=1)
)

In [3]:
# Column labels fed to the model: the string labels '0' through '299',
# followed by the row-mean column 'mean_'.
var_selected = [*map(str, range(300)), 'mean_']

In [4]:
# Wrap the selected columns of each table in a SciPy CSR matrix
# (training table first, then the test table).
train_matrix, test_matrix = (
    csr_matrix(frame[var_selected].values) for frame in (train, test)
)

In [6]:
# L1-penalised (LASSO) logistic regression. LogisticRegressionCV is configured
# with cv=5 and scoring='roc_auc', the 'saga' solver (required here for the L1
# penalty), a generous iteration cap, and a fixed seed for reproducibility.
model = LogisticRegressionCV(
    penalty='l1',
    solver='saga',
    cv=5,
    scoring='roc_auc',
    max_iter=10000,
    random_state=123456,
)
# Kept as a bare expression so the notebook still displays the fitted estimator.
model.fit(train_matrix, train['target'])

In [7]:
# Pair every input column with its fitted coefficient, then keep only the
# rows whose coefficient is not exactly zero.
coef_var = model.coef_.flatten()
var_importans = pd.DataFrame(
    {'var': var_selected, 'coef': coef_var}
).loc[lambda table: table['coef'].ne(0)]

In [9]:
# Score the test rows: take column 1 of predict_proba's output as the
# prediction for each row.
pred_test = model.predict_proba(test_matrix)[:, 1]

# Build the submission table (test ids alongside their predictions) and
# write it to disk without the index column.
data_test = test[['id']].copy()
data_test['target'] = pred_test
data_test.to_csv('submission.csv', index=False)

In [10]:
import joblib

# Persist the fitted estimator so it can be reloaded later for predictions.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, filename='logistic_regression_model.joblib')

['logistic_regression_model.joblib']