In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()



# Feature List
features_list = ['Sex', 'Cabin', 'Family_Size', 'IsAlone']

# Normalize features (already done correctly)
min_fare_train = train_data['Fare'].min()
max_fare_train = train_data['Fare'].max()
train_data['Fare'] = (train_data['Fare'] - min_fare_train) / (max_fare_train - min_fare_train)
test_data['Fare'] = (test_data['Fare'] - min_fare_train) / (max_fare_train - min_fare_train)

min_age_train = train_data['Age'].min()
max_age_train = train_data['Age'].max()
train_data['Age'] = (train_data['Age'] - min_age_train) / (max_age_train - min_age_train)
test_data['Age'] = (test_data['Age'] - min_age_train) / (max_age_train - min_age_train)

# Encoding categorical variables
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

train_data['Cabin'] = train_data['Cabin'].notnull().astype(int)
test_data['Cabin'] = test_data['Cabin'].notnull().astype(int)

# Feature Engineering
train_data['Family_Size'] = train_data['SibSp'] + train_data['Parch']
test_data['Family_Size'] = test_data['SibSp'] + test_data['Parch']

train_data['IsAlone'] = np.where(train_data['Family_Size'] == 0, 0, 1)
test_data['IsAlone'] = np.where(test_data['Family_Size'] == 0, 0, 1)

# Prepare features
X = train_data[features_list]
y = train_data['Survived']
X_test = test_data[features_list]  # For future predictions

# Handle missing values
X_train = X.fillna(X.median())
X_test = X_test.fillna(X_test.median())

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initialize the model
model = LogisticRegression(penalty = 'l2')
# Perform RFE to select 4 best features
rfe = RFE(estimator=model, n_features_to_select=4)
rfe.fit(X_train_scaled, y)

# Extract selected features mask
selected_features_mask = rfe.support_

# Get the names of the selected features
selected_features = X.columns[selected_features_mask]
print(f'Selected features by RFE: {selected_features}')

# Use the selected features for both train and test sets
X_train_scaled_RFE = X_train_scaled[:, selected_features_mask]



# Split data into training and testing sets
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train_scaled_RFE, y, test_size=0.2, random_state=42)



# Defining hyperparameters 
from sklearn.model_selection import GridSearchCV

# ... (rest of your code)

# Defining hyperparameters (including L1 regularization for feature selection)
param_grid = {
    'solver': ['liblinear', 'lbfgs', 'sag', 'newton-cg'],
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'none'],
    'max_iter': [100, 200, 300],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fitting the model (GRIDSEARCHCV will try every possible combination)
grid_search.fit(X_train_train, y_train_train)

# Getting the best params 
best_params = grid_search.best_params_
print(f" Best Parameters : {best_params}")

# Getting best params score 
best_score = grid_search.best_score_
print(f"Best Accuracy achieved : {best_score}")

# Testing the model with best params
best_model = grid_search.best_estimator_
test_score = best_model.score(X_train_test , y_train_test)
print(f"Test set accuracy : {test_score}")


# # # Train the model on the selected features
# model.fit(X_train_train, y_train_train)

# # # Predict using the test set
# y_pred = model.predict(X_train_test)

# # Checking accuracy
# accuracy = accuracy_score(y_train_test, y_pred)
# print('Accuracy:', accuracy)




# output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': Predictions})
# output.to_csv('submission.csv', index=False)
# print("Your submission was successfully saved!")


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
Selected features by RFE: Index(['Sex', 'Cabin', 'Family_Size', 'IsAlone'], dtype='object')




 Best Parameters : {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Accuracy achieved : 0.7977051117896188
Test set accuracy : 0.7821229050279329


300 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

------------------------------------