In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv('/content/train_and_test2.csv')

# Data preprocessing
def preprocess_data(df):
    # Rename the target column
    if '2urvived' in df.columns:
        df = df.rename(columns={'2urvived': 'Survived'})

    # Drop unnecessary columns
    columns_to_drop = ['Passengerid'] + [col for col in df.columns if col.startswith('zero')]
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Convert categorical variables to numeric
    if 'Sex' in df.columns:
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    if 'Embarked' in df.columns:
        df['Embarked'] = pd.Categorical(df['Embarked']).codes

    return df

# Preprocess the data
data = preprocess_data(data)

# Print column names and data info
print("Columns in the dataset:")
print(data.columns)

# Check if 'Survived' column exists
if 'Survived' not in data.columns:
    print("Error: 'Survived' column not found in the dataset.")
    exit()

# Split features and target
X = data.drop('Survived', axis=1)
y = data['Survived']

# Handle missing values
imputer = SimpleImputer(strategy='constant', fill_value=-1)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': abs(model.coef_[0])
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

Columns in the dataset:
Index(['Age', 'Fare', 'Sex', 'sibsp', 'Parch', 'Pclass', 'Embarked',
       'Survived'],
      dtype='object')

Accuracy: 0.74

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.96      0.84       189
           1       0.61      0.15      0.24        73

    accuracy                           0.74       262
   macro avg       0.68      0.56      0.54       262
weighted avg       0.71      0.74      0.67       262


Feature Importance:
    feature  importance
5    Pclass    0.623729
0       Age    0.335168
3     sibsp    0.119814
6  Embarked    0.110522
4     Parch    0.098557
1      Fare    0.092224
2       Sex    0.000000


In [35]:
X

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked
0,22.0,7.2500,,1,0,3,2
1,38.0,71.2833,,1,0,1,0
2,26.0,7.9250,,0,0,3,2
3,35.0,53.1000,,1,0,1,2
4,35.0,8.0500,,0,0,3,2
...,...,...,...,...,...,...,...
1304,28.0,8.0500,,0,0,3,2
1305,39.0,108.9000,,0,0,1,0
1306,38.5,7.2500,,0,0,3,2
1307,28.0,8.0500,,0,0,3,2
