In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Step 1: Load and preprocess the data
df = pd.read_csv('titanic/train.csv')

# Drop the 'Unnamed: 0' column
df.drop(columns=['Name', "PassengerId"], inplace=True)

# Handle missing values (if any)
df.dropna(inplace=True)
df = pd.get_dummies(df)
df = df.astype(int)

# Separate predictors (X) and target variable (y)
X = df.drop(columns=['Survived'])  # Predictors
y = df['Survived']  # Target variable


# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 3: Train the Logistic Regression Model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Step 4: Extract the Coefficients (Beta Values)
intercept = logistic_model.intercept_[0]  # Intercept (beta_0)
coefficients = logistic_model.coef_[0]    # Coefficients (beta_1, beta_2, ..., beta_n)

# Combine intercept and coefficients with feature names
feature_names = ['Intercept'] + list(df.drop(columns=['Survived']).columns)
beta_values = [intercept] + list(coefficients)

# Create a DataFrame to display the beta values
if len(beta_values) == len(feature_names):
    beta_df = pd.DataFrame(beta_values, index=feature_names, columns=['Beta'])
    beta_df_sorted_ascending = beta_df.sort_values(by='Beta', ascending=False)
    beta_df_sorted_descending = beta_df.sort_values(by='Beta', ascending=True)
    print(beta_df_sorted_ascending.head(10))
    print(beta_df_sorted_descending.head(10))

else:
    print("Mismatch in the length of feature names and beta values.")
    print(f"Feature names: {len(feature_names)}, Beta values: {len(beta_values)}")
    print(feature_names)
    print(X)


                     Beta
Intercept        2.044517
Sex_female       1.131466
Cabin_B96 B98    0.304107
Ticket_113760    0.304107
Ticket_PC 17755  0.287737
Cabin_C70        0.265370
Ticket_PP 9549   0.241933
Cabin_E25        0.237836
Cabin_A23        0.226200
Ticket_27042     0.226200
                     Beta
Sex_male        -1.131466
Age             -0.667364
Ticket_347054   -0.585867
Ticket_113781   -0.524039
Cabin_C22 C26   -0.524039
Parch           -0.338028
Ticket_695      -0.291593
Ticket_PC 17758 -0.258205
Cabin_C65       -0.258205
Cabin_E77       -0.257813
