In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Display the first few rows of the dataframe
print(train_data.head())

# Check for missing values
print(train_data.isnull().sum())

# Fill missing values
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)

# Drop the 'Cabin' column as it has too many missing values
train_data.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)

# Convert 'Age' column to numeric (this handles any non-numeric values)
train_data['Age'] = pd.to_numeric(train_data['Age'], errors='coerce')

# Fill any remaining NaN values after conversion
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
train_data['Embarked'] = le.fit_transform(train_data['Embarked'])

# Feature scaling
scaler = StandardScaler()
train_data[['Age', 'Fare']] = scaler.fit_transform(train_data[['Age', 'Fare']])

# Define the features (X) and target (y)
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
lr_model = LogisticRegression(max_iter=200)

# Train the model
lr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

# Analyze the coefficients
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr_model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print('Feature Importance:')
print(feature_importance)



   PassengerId  Survived     Pclass  \
1            0         3     Braund   
2            1         1    Cumings   
3            1         3  Heikkinen   
4            1         1   Futrelle   
5            0         3      Allen   

                                          Name     Sex   Age  SibSp  Parch  \
1                              Mr. Owen Harris    male  22.0      1      0   
2   Mrs. John Bradley (Florence Briggs Thayer)  female  38.0      1      0   
3                                  Miss. Laina  female  26.0      0      0   
4           Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0   
5                            Mr. William Henry    male  35.0      0      0   

             Ticket     Fare Cabin Embarked  
1         A/5 21171   7.2500   NaN        S  
2          PC 17599  71.2833   C85        C  
3  STON/O2. 3101282   7.9250   NaN        S  
4            113803  53.1000  C123        S  
5            373450   8.0500   NaN        S  
PassengerId    0
Sur

ValueError: could not convert string to float: 'Moran'