# Titanic - Machine Learning from Disaster

## Import Libraries

In [35]:
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt

## Load the dataset

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
pd.set_option('display.max_columns', len(train.columns))
train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
777,778,1,3,"Emanuel, Miss. Virginia Ethel",female,5.0,0,0,364516,12.475,,S
397,398,0,2,"McKane, Mr. Peter David",male,46.0,0,0,28403,26.0,,S
595,596,0,3,"Van Impe, Mr. Jean Baptiste",male,36.0,1,1,345773,24.15,,S
289,290,1,3,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,,Q
615,616,1,2,"Herman, Miss. Alice",female,24.0,1,2,220845,65.0,,S


In [8]:
# Information About the Data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [19]:
# Description of the data in the DataFrame
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [25]:
# Detect missing values
pd.concat([train.isna().any(), train.isnull().sum()], axis=1)

Unnamed: 0,0,1
PassengerId,False,0
Survived,False,0
Pclass,False,0
Name,False,0
Sex,False,0
Age,True,177
SibSp,False,0
Parch,False,0
Ticket,False,0
Fare,False,0


In [27]:
le = preprocessing.LabelEncoder()

train['Sex'] = le.fit_transform(train['Sex'])
train['Embarked'] = le.fit_transform(train['Embarked'])

In [26]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [34]:
train[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Embarked']].quantile([0, 0.25, 0.50, 0.75, 1])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
0.25,0.0,2.0,0.0,20.125,0.0,0.0,7.9104,1.0
0.5,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,2.0
0.75,1.0,3.0,1.0,38.0,1.0,0.0,31.0,2.0
1.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,3.0


## Check duplicated values

In [4]:
train[train.duplicated()].sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            0.0
SibSp            0
Parch            0
Ticket           0
Fare           0.0
Cabin            0
Embarked         0
dtype: object

## Exploratory data analysis

In [5]:
# Perform groupby operation and aggregate using a dictionary of aggregation functions
result = train.groupby('Pclass').agg({'Survived': ['sum', 'count']})

# Rename the columns for clarity
result.columns = ['Survived', 'Total']

# Calculate the percentage of survived passengers
result['Percentage'] = (result['Survived'] / result['Total']) * 100
result.reset_index()

Unnamed: 0,Pclass,Survived,Total,Percentage
0,1,136,216,62.962963
1,2,87,184,47.282609
2,3,119,491,24.236253


### Pclass Summary

In [6]:
# Create a pivot table
pivot_table = pd.pivot_table(train, index=["Pclass"], values=["Survived"],  aggfunc=sum)

# Calculate total passengers for each class using value counts
total_passengers = train['Pclass'].value_counts().sort_index()

# Assign the total passengers to the pivot table
pivot_table['Total'] = total_passengers

#Total Passangers
total_passengers =train['Pclass'].value_counts().sum()

# # Calculate percentage of survived passengers within each class
pivot_table['%CLass'] = (pivot_table["Survived"] / pivot_table['Total']) * 100
pivot_table['%Total'] = (pivot_table["Survived"] / total_passengers) * 100

# pivot_table['Percentage_Survived'] = (pivot_table[1] / pivot_table['Total']) * 100
pivot_table.reset_index()

Unnamed: 0,Pclass,Survived,Total,%CLass,%Total
0,1,136,216,62.962963,15.263749
1,2,87,184,47.282609,9.76431
2,3,119,491,24.236253,13.35578


### Gender Summary

In [7]:
# Create a pivot table
pivot_table = pd.pivot_table(train, index=["Sex"], values=["Survived"],  aggfunc=sum)

# Calculate total passengers for each class using value counts
total_passengers = train['Sex'].value_counts().sort_index()

# Assign the total passengers to the pivot table
pivot_table['Total'] = total_passengers

#Total Passangers
total_passengers =train['Pclass'].value_counts().sum()

# # Calculate percentage of survived passengers within each class
pivot_table['%CLass'] = (pivot_table["Survived"] / pivot_table['Total']) * 100
pivot_table['%Total'] = (pivot_table["Survived"] / total_passengers) * 100

# pivot_table['Percentage_Survived'] = (pivot_table[1] / pivot_table['Total']) * 100
pivot_table.reset_index()

Unnamed: 0,Sex,Survived,Total,%CLass,%Total
0,female,233,314,74.203822,26.150393
1,male,109,577,18.890815,12.233446


### Age Summary