In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('data.csv') 

In [3]:
# Print first 2 rows
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
# Data preprocessing

def get_data_preprocessed(dataframe):
    
    # Drop columns which doesn't contribute much to the survival decision
    dataframe = dataframe.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    
    # One hot encoding of categorical columns
    dataframe = pd.get_dummies(dataframe, columns=['Sex', 'Embarked'], drop_first=True)
    
    # Fill missing values
    dataframe['Age'].fillna(dataframe['Age'].mean(), inplace=True)
    
    
    return dataframe

In [5]:
X = get_data_preprocessed(df)
y = df['Survived']

In [6]:
X.head(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,True,False,True
1,1,1,38.0,1,0,71.2833,False,False,False


In [7]:
y.head(3)

0    0
1    1
2    1
Name: Survived, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [9]:
lr = LogisticRegression(random_state=0, max_iter=1000)

In [10]:
lr.fit(X_train, y_train)

In [11]:
# Prediction
y_pred = lr.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Logistic Regression model accuracy (in %):", acc*100)

Logistic Regression model accuracy (in %): 100.0
