In [49]:
import numpy as np
import pandas as pd

In [50]:
# Load the passenger records; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('titanic.csv')

In [51]:
# Preview the first five rows to get a feel for the columns and their values.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
# Show the dtype pandas inferred for each column.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [53]:
# Summary statistics via .describe(). With default arguments only the numeric
# columns are summarised; the object (categorical) columns are described separately.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [54]:
# Collect the names of all object-dtype (text / categorical) columns in `columns`.
column_dtypes = df.dtypes
columns = column_dtypes[column_dtypes == 'object'].index

In [55]:
# Summary of the categorical columns: count, number of unique values, most frequent value and its frequency.
df.loc[:, columns].describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Davies, Mr. Alfred J",male,CA. 2343,G6,S
freq,1,577,7,4,644


In [56]:
#filtering

In [57]:
# How many rows have a fare of 0? (.count() reports the non-null count per column among those rows.)
df.loc[df['Fare'].eq(0)].count()

PassengerId    15
Survived       15
Pclass         15
Name           15
Sex            15
Age             7
SibSp          15
Parch          15
Ticket         15
Fare           15
Cabin           3
Embarked       15
dtype: int64

In [58]:
# Boolean mask, one entry per row: True where the recorded fare is exactly 0.
fare_filter = df['Fare'].eq(0)

In [59]:
# Display the mask itself: a boolean Series aligned with df's index.
fare_filter

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Fare, Length: 891, dtype: bool

In [60]:
# Rows selected by the zero-fare mask.
df.loc[fare_filter]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,S
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,S
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,S
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,S
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,,S
466,467,0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,,S
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,S


In [61]:
# Non-null count per column within the zero-fare rows.
df.loc[fare_filter].count()

PassengerId    15
Survived       15
Pclass         15
Name           15
Sex            15
Age             7
SibSp          15
Parch          15
Ticket         15
Fare           15
Cabin           3
Embarked       15
dtype: int64

In [62]:
# Recode a fare of 0 as missing (NaN) so it is counted with the other null values.
# Assign the result back to the column rather than calling
# df['Fare'].replace(..., inplace=True): that form mutates an intermediate Series,
# emits a FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Fare'] = df['Fare'].replace(0, np.nan)

In [63]:
# Percentage of missing values in each column.
df.isnull().sum().div(len(df)).mul(100)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            1.683502
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [64]:
# Variance of each numeric column.
# Restrict to numeric dtypes explicitly: df still contains text columns at this point,
# and since pandas 2.0 DataFrame.var() no longer drops them silently but raises TypeError.
df.select_dtypes(include='number').var()

PassengerId    66231.000000
Survived           0.236772
Pclass             0.699015
Age              211.019125
SibSp              1.216043
Parch              0.649728
Fare            2493.686567
dtype: float64

In [65]:
# Distinct values per column as a percentage of the row count.
# Columns close to 100% act as identifiers and are unlikely to help model training.
df.nunique().div(len(df)).mul(100)

PassengerId    100.000000
Survived         0.224467
Pclass           0.336700
Name           100.000000
Sex              0.224467
Age              9.876543
SibSp            0.785634
Parch            0.785634
Ticket          76.430976
Fare            27.721661
Cabin           16.498316
Embarked         0.336700
dtype: float64

In [66]:
# Drop columns that are mostly missing (Cabin) or (near-)unique per row
# (PassengerId, Name, Ticket).
# Rebinding df with errors='ignore' keeps the cell idempotent: the original in-place
# drop raised KeyError if the cell was run a second time.
columns_to_drop = ['Cabin', 'PassengerId', 'Name', 'Ticket']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [67]:
# Pairwise correlation between the numeric columns. The coefficient measures the
# strength and direction of the linear relationship between two variables
# (not by how much one changes when the other does):
# +1 is a perfect positive correlation, -1 a perfect negative one, 0 no linear relationship.
# Non-numeric columns are excluded explicitly because, since pandas 2.0,
# DataFrame.corr() raises on them instead of silently dropping them.
df.select_dtypes(include='number').corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.252453
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.561517
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.098981
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.15524
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.212137
Fare,0.252453,-0.561517,0.098981,0.15524,0.212137,1.0


In [68]:
# Boolean mask: True for passengers whose Survived flag is 1.
df['Survived'].eq(1)

0      False
1       True
2       True
3       True
4      False
       ...  
886    False
887     True
888    False
889     True
890    False
Name: Survived, Length: 891, dtype: bool

In [69]:
# Total number of passengers that survived / did not survive.
# Summing the boolean mask counts matching rows directly. The previous
# `.count()[0]` relied on deprecated positional indexing of a label-indexed Series
# and counted non-null values of whichever column happened to be first.
print('Total number of passangers survived :', (df['Survived'] == 1).sum())
print('Total number of passangers that did not survive :', (df['Survived'] == 0).sum())

Total number of passangers survived : 342
Total number of passangers that did not survive : 549


In [70]:
# Percentage of passengers that survived / did not survive.
# Row counts come from summing boolean masks instead of `.count()[0]`, which relied on
# deprecated positional indexing and on the first column having no missing values.
print('Percentage of passangers that survived :', (df['Survived'] == 1).sum() / len(df) * 100, '%')
print('Percentage of passangers that did not survive :', (df['Survived'] == 0).sum() / len(df) * 100, '%')

Percentage of passangers that survived : 38.38383838383838 %
Percentage of passangers that did not survive : 61.61616161616161 %


# How many survived per passenger class

In [71]:
# Group by passenger class and show the non-null count of every other column per class.
df.groupby('Pclass').count()

Unnamed: 0_level_0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,216,216,186,216,216,211,214
2,184,184,173,184,184,178,184
3,491,491,355,491,491,487,491


In [72]:
# Sanity check: the per-class passenger counts from the table above add up to the total row count.
216+184+491

891

In [73]:
# Number of survivors in each passenger class.
# Each value is the number of rows matching both conditions, obtained by summing the
# combined boolean mask. The previous `.count()[0]` relied on deprecated positional
# indexing and on the first column having no missing values.
survived_pclass1 = ((df.Pclass == 1) & (df.Survived == 1)).sum()
survived_pclass2 = ((df.Pclass == 2) & (df.Survived == 1)).sum()
survived_pclass3 = ((df.Pclass == 3) & (df.Survived == 1)).sum()

In [74]:
# First-class passengers who survived.
df.loc[(df['Pclass'] == 1) & (df['Survived'] == 1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,female,38.0,1,0,71.2833,C
3,1,1,female,35.0,1,0,53.1000,S
11,1,1,female,58.0,0,0,26.5500,S
23,1,1,male,28.0,0,0,35.5000,S
31,1,1,female,,1,0,146.5208,C
...,...,...,...,...,...,...,...,...
862,1,1,female,48.0,0,0,25.9292,S
871,1,1,female,47.0,1,1,52.5542,S
879,1,1,female,56.0,0,1,83.1583,C
887,1,1,female,19.0,0,0,30.0000,S


In [75]:
# Non-null count per column among first-class survivors.
df.loc[(df['Pclass'] == 1) & (df['Survived'] == 1)].count()

Survived    136
Pclass      136
Sex         136
Age         122
SibSp       136
Parch       136
Fare        136
Embarked    134
dtype: int64

In [76]:
# Total number of passengers in each passenger class.
# Summing the boolean mask counts the matching rows. The previous `.count()[0]`
# relied on deprecated positional indexing of a label-indexed Series.
passangers_pclass1 = (df.Pclass == 1).sum()
passangers_pclass2 = (df.Pclass == 2).sum()
passangers_pclass3 = (df.Pclass == 3).sum()

In [77]:
# All first-class passengers.
df.loc[df['Pclass'] == 1]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,female,38.0,1,0,71.2833,C
3,1,1,female,35.0,1,0,53.1000,S
6,0,1,male,54.0,0,0,51.8625,S
11,1,1,female,58.0,0,0,26.5500,S
23,1,1,male,28.0,0,0,35.5000,S
...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S
872,0,1,male,33.0,0,0,5.0000,S
879,1,1,female,56.0,0,1,83.1583,C
887,1,1,female,19.0,0,0,30.0000,S


In [78]:
# Survival rate per passenger class: survivors divided by class size, as a percentage.
rate_pclass1 = survived_pclass1 / passangers_pclass1 * 100
rate_pclass2 = survived_pclass2 / passangers_pclass2 * 100
rate_pclass3 = survived_pclass3 / passangers_pclass3 * 100
print('percentage of passangers survived in passanger class 1:', rate_pclass1)
print('percentage of passangers survived in passanger class 2:', rate_pclass2)
print('percentage of passangers survived in passanger class 3:', rate_pclass3)

percentage of passangers survived in passanger class 1: 62.96296296296296
percentage of passangers survived in passanger class 2: 47.28260869565217
percentage of passangers survived in passanger class 3: 24.236252545824847


# Percentage of people who survived, by sex

In [79]:
# Non-null count of every other column, per sex.
df.groupby('Sex').count()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,314,314,261,314,314,314,312
male,577,577,453,577,577,562,577


In [80]:
# Within each sex, how many rows have each value of the Survived flag.
df.groupby('Sex')['Survived'].value_counts()

Sex     Survived
female  1           233
        0            81
male    0           468
        1           109
Name: Survived, dtype: int64

In [81]:
# Number of female passengers.
total_females = len(df.loc[df['Sex'] == 'female'])

In [82]:
# Number of male passengers.
total_males = len(df.loc[df['Sex'] == 'male'])

In [86]:
# Number of survivors of each sex.
# Both comparisons are parenthesised before being combined. `&` binds tighter than `==`,
# so the earlier `(sex_mask) & (df.Survived) == 1` was evaluated as
# `(sex_mask & df.Survived) == 1` and only gave the right answer because Survived is coded 0/1.
# The row count is the sum of the combined mask rather than the deprecated `.count()[0]`.
females_survived = ((df.Sex == 'female') & (df.Survived == 1)).sum()
males_survived = ((df.Sex == 'male') & (df.Survived == 1)).sum()

In [87]:
# Display the number of female survivors computed above.
females_survived

233

In [88]:
# Survival rate per sex: survivors divided by group size, as a percentage.
female_survival_rate = females_survived / total_females * 100
male_survival_rate = males_survived / total_males * 100
print('percentage of females that survived :', female_survival_rate)
print('percentage of males that survived :', male_survival_rate)

percentage of females that survived : 74.20382165605095
percentage of males that survived : 18.890814558058924


# Is there any relation between the average fare paid and the survival rate?

In [89]:
# Mean fare for non-survivors (0) and survivors (1).
# NOTE(review): fares of 0 were recoded to NaN earlier in the notebook, so those rows
# presumably do not contribute to these means (pandas skips NaN by default) — confirm.
df.groupby('Survived')['Fare'].mean()

Survived
0    22.696673
1    48.537330
Name: Fare, dtype: float64

# Relationship between age and survival (mean age by outcome)

In [92]:
# Mean age for non-survivors (0) and survivors (1). This compares group means;
# it is not a correlation coefficient.
df.groupby('Survived')['Age'].mean()

Survived
0    30.626179
1    28.343690
Name: Age, dtype: float64

# Converting age to categories. 

In [93]:
# Oldest recorded age; used to choose the upper edge of the age bins.
df['Age'].max()

80.0

In [94]:
# Youngest recorded age; values below 1 indicate ages stored as fractions of a year.
df['Age'].min()

0.42

In [95]:
# Passengers younger than one year.
# NOTE(review): the recorded output of this cell includes a `familyMembers` column that no
# visible cell creates, and the execution counts have gaps — the notebook appears to depend on
# a deleted or out-of-order cell. Confirm it still reproduces with Restart Kernel -> Run All.
df[(df.Age<1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,familyMembers
78,1,2,male,0.83,0,2,29.0,S,2
305,1,1,male,0.92,1,2,151.55,S,3
469,1,3,female,0.75,2,1,19.2583,C,3
644,1,3,female,0.75,2,1,19.2583,C,3
755,1,2,male,0.67,1,1,14.5,S,2
803,1,3,male,0.42,0,1,8.5167,C,1
831,1,2,male,0.83,1,1,18.75,S,2


In [96]:
# Age brackets are right-closed intervals: (0, 10], (10, 18], (18, 30], (30, 60], (60, 100].
bins = [0, 10, 18, 30, 60, 100]
labels = ['kids', 'teens', 'young', 'mid-aged', 'old']
# Rows with a missing Age get a missing Age-group.
df['Age-group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)

In [97]:
# Number of passengers in each age group (rows without an age group are not counted).
age_group_sizes = df['Age-group'].value_counts()
kids = int(age_group_sizes['kids'])
teens = int(age_group_sizes['teens'])
young = int(age_group_sizes['young'])
mid = int(age_group_sizes['mid-aged'])
old = int(age_group_sizes['old'])

In [98]:
# Survival rate per age group: survivors in the group divided by the group size, as a percentage.
survivors = df[df.Survived == 1]
kids_survived = len(survivors[survivors['Age-group'] == 'kids']) / kids * 100
teens_survived = len(survivors[survivors['Age-group'] == 'teens']) / teens * 100
young_survived = len(survivors[survivors['Age-group'] == 'young']) / young * 100
midAged_survived = len(survivors[survivors['Age-group'] == 'mid-aged']) / mid * 100
old_survived = len(survivors[survivors['Age-group'] == 'old']) / old * 100

In [99]:
# Print the survival percentages one per line, in order: kids, teens, young, mid-aged, old.
print(
    kids_survived,
    teens_survived,
    young_survived,
    midAged_survived,
    old_survived,
    sep='\n',
)

59.375
42.66666666666667
35.55555555555556
42.04946996466431
22.727272727272727


# Try binning family members and do some EDA on the new column