In [1]:
#importing essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
#Only use if using google colab
#mounting the drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [3]:
print(os.getcwd())
os.chdir('/content/drive/My Drive/Machine Learning/Titanic Survival/Dataset') #replace with the directory where the dataset is stored
print(os.getcwd())


/content
/content/drive/My Drive/Machine Learning/Titanic Survival/Dataset


In [4]:
#importing the datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
#drop unnecessary columns
train = train.drop(["PassengerId", "Name", "Ticket"], axis = 1)
test = test.drop(["Name", "Ticket"], axis = 1)

# Data Preprocessing
#Embarked # only in train, fill the two missing values with the most occurred value, which is "S".

In [6]:
train["Embarked"] = train["Embarked"].fillna("S") 
embark_dummies_titanic  = pd.get_dummies(train['Embarked'])
embark_dummies_titanic.drop(['S'], axis=1, inplace=True) 

embark_dummies_test  = pd.get_dummies(test['Embarked'])
embark_dummies_test.drop(['S'], axis=1, inplace=True)

train = train.join(embark_dummies_titanic)
test  = test.join(embark_dummies_test)

train.drop(['Embarked'], axis=1,inplace=True)
test.drop(['Embarked'], axis=1,inplace=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   C         891 non-null    uint8  
 9   Q         891 non-null    uint8  
dtypes: float64(2), int64(4), object(2), uint8(2)
memory usage: 57.6+ KB


In [7]:
# Fare # only for test_df, since there is a missing "Fare" values
test["Fare"].fillna(test["Fare"].median(), inplace=True)

#convert from float to int
train['Fare'] = train["Fare"].astype(int) 
test['Fare'] = test['Fare'].astype(int)

In [8]:
# Age #  fill NaN values in Age column with random values generated
train["Age"][np.isnan(train["Age"])]
train["Age"].fillna(train["Age"].mean(), inplace = True)

test["Age"].fillna(test["Age"].mean(), inplace = True)

In [9]:
#convert age from float to int
train["Age"] = train["Age"].astype(int)
test["Age"] = test["Age"].astype(int)

In [10]:
# Encoding categorical data
sex = pd.get_dummies(train['Sex'], drop_first = True, prefix = 'sex')
pclass = pd.get_dummies(train['Pclass'], drop_first = True, prefix = 'Pclass')
train  = pd.concat([train, sex, pclass], axis = 1)
sex1 = pd.get_dummies(test['Sex'], drop_first = True, prefix = 'sex')
pclass1 = pd.get_dummies(test['Pclass'], drop_first = True, prefix = 'pclass')
test = pd.concat([test, sex1, pclass1], axis = 1)

train.drop(['Pclass', 'Sex'], axis = 1, inplace = True)
test.drop(['Pclass', 'Sex'], axis = 1, inplace = True)



In [11]:
# Cabin has a lot of NaN values, so it won't cause a remarkable impact on prediction
train.drop("Cabin",axis=1,inplace=True)
test.drop("Cabin",axis=1,inplace=True)
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Survived  891 non-null    int64
 1   Age       891 non-null    int64
 2   SibSp     891 non-null    int64
 3   Parch     891 non-null    int64
 4   Fare      891 non-null    int64
 5   C         891 non-null    uint8
 6   Q         891 non-null    uint8
 7   sex_male  891 non-null    uint8
 8   Pclass_2  891 non-null    uint8
 9   Pclass_3  891 non-null    uint8
dtypes: int64(5), uint8(5)
memory usage: 39.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Age          418 non-null    int64
 2   SibSp        418 non-null    int64
 3   Parch        418 non-null    int64
 4   Fare         418 non-null    int64
 5   C           

In [12]:
# define training and testing sets
X = train.drop("Survived", axis = 1)
Y = train["Survived"]
X_test = test.drop("PassengerId", axis = 1).copy()

In [13]:
# Machine Learning # Fitting Logistic Regression to training set
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Predicting the values
y_pred = model.predict(X)
model.score(X, Y)

0.8092031425364759