# Titanic

In [65]:
import numpy as np
import pandas as pd

# Load the Titanic passenger data from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="titanic.csv")
# Preview the first 10 rows of the DataFrame (head() returns rows, not columns).
df.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,552,0,2,"Sharp, Mr. Percival James R",male,27.0,0,0,244358,26.0,,S
1,638,0,2,"Collyer, Mr. Harvey",male,31.0,1,1,C.A. 31921,26.25,,S
2,499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S
3,261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q
4,395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...",female,24.0,0,2,PP 9549,16.7,G6,S
5,811,0,3,"Alexander, Mr. William",male,26.0,0,0,3474,7.8875,,S
6,758,0,2,"Bailey, Mr. Percy Andrew",male,18.0,0,0,29108,11.5,,S
7,703,0,3,"Barbara, Miss. Saiide",female,18.0,0,1,2691,14.4542,,C
8,406,0,2,"Gale, Mr. Shadrach",male,34.0,1,0,28664,21.0,,S
9,641,0,3,"Jensen, Mr. Hans Peder",male,20.0,0,0,350050,7.8542,,S


In [23]:
# Report the DataFrame's dimensions: df.shape[0] is the number of rows and
# df.shape[1] is the number of columns.

print(f"It has {df.shape[0]} lines and {df.shape[1]} colums")

It has 891 lines and 12 colums


In [8]:
# Select only the columns we are interested in; passing a list of labels
# returns a DataFrame restricted to those columns.

df.loc[:, ['Survived', 'Age', 'Fare']]

Unnamed: 0,Survived,Age,Fare
0,0,27.00,26.0000
1,0,31.00,26.2500
2,0,25.00,151.5500
3,0,,7.7500
4,1,24.00,16.7000
...,...,...,...
886,0,47.00,38.5000
887,1,30.00,9.5000
888,1,36.00,135.6333
889,0,22.00,7.7958


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the selected columns.
df.loc[:, ['Survived', 'Age', 'Fare']].describe()

Unnamed: 0,Survived,Age,Fare
count,891.0,714.0,891.0
mean,0.383838,29.699118,32.204208
std,0.486592,14.526497,49.693429
min,0.0,0.42,0.0
25%,0.0,20.125,7.9104
50%,0.0,28.0,14.4542
75%,1.0,38.0,31.0
max,1.0,80.0,512.3292


In [12]:
# Each column of a DataFrame is a pandas Series.

type(df.Pclass)


pandas.core.series.Series

In [13]:
# List the distinct passenger classes present in the data.
df.loc[:, 'Pclass'].unique()

array([2, 1, 3])

In [33]:
# Build a boolean mask (one True/False value per row):
# select only the females travelling in 1st or 2nd class.

mask = (df['Sex'] == 'female') & ((df['Pclass'] == 1) | (df['Pclass'] == 2))

# Or more simply: Pclass only takes the values 1, 2 and 3, so
# "1st or 2nd class" is the same as "not 3rd class".

mask_eq = (df['Sex'] == 'female') & ~(df['Pclass'] == 3)

# Verify that both masks are identical. Series.equals compares the values and
# the index in one vectorized call, without building two Python lists.
mask.equals(mask_eq)

True

In [36]:
# Boolean indexing: keep only the rows for which the mask is True.
df_female_1_or_2 = df.loc[mask]
df_female_1_or_2.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S
13,755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48.0,1,2,220845,65.0,,S
16,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
18,338,1,1,"Burns, Miss. Elizabeth Margaret",female,41.0,0,0,16966,134.5,E40,C
23,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
27,718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27.0,0,0,34218,10.5,E101,S
38,505,1,1,"Maioni, Miss. Roberta",female,16.0,0,0,110152,86.5,B79,S
39,514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54.0,1,0,PC 17603,59.4,,C
40,376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C
42,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [37]:
# Summing a boolean mask counts its True entries, i.e. the number of rows
# (here: women in 1st or 2nd class) that match the mask.
mask.sum()

np.int64(170)

In [39]:
# Percentage of all passengers selected by `mask` (females in 1st or 2nd class).
# The message is printed in French; it reads "% of the passengers were women
# in second or first class".
print(f"{100*mask.sum()/len(df)} % des passagers étaient des femmes en deuxième ou première classe")

19.079685746352414 % des passagers étaient des femmes en deuxième ou première classe


In [42]:
# A large proportion of the 'Cabin' values are not filled in (NaN).
df['Cabin']

0          NaN
1          NaN
2      C22 C26
3          NaN
4           G6
        ...   
886        E63
887        NaN
888        C32
889        NaN
890        NaN
Name: Cabin, Length: 891, dtype: object

In [62]:
# isna() returns a boolean mask that is True wherever 'Cabin' is missing (NaN).
df['Cabin'].isna()

0       True
1       True
2      False
3       True
4      False
       ...  
886    False
887     True
888    False
889     True
890     True
Name: Cabin, Length: 891, dtype: bool

In [63]:
# Drop the rows whose 'Cabin' value is missing; the result is stored in a new
# DataFrame, so df itself is left untouched.
df_clear = df.dropna(subset=['Cabin'])
df_clear.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S
4,395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...",female,24.0,0,2,PP 9549,16.7,G6,S
16,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
18,338,1,1,"Burns, Miss. Elizabeth Margaret",female,41.0,0,0,16966,134.5,E40,C
20,545,0,1,"Douglas, Mr. Walter Donald",male,50.0,1,0,PC 17761,106.425,C86,C


In [60]:
# Boolean mask flagging the passengers whose 'Age' is missing.
df.Age.isna()

0      False
1      False
2      False
3       True
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

In [67]:
# Count the missing values in each row.
# axis='columns' (the same as axis=1) adds up across the columns, giving one
# total per row; the default axis=0 would give one total per column instead.
df.isna().sum(axis='columns')


0      1
1      1
2      0
3      2
4      0
      ..
886    0
887    1
888    0
889    1
890    1
Length: 891, dtype: int64

In [79]:
# Until now the rows are labelled by their position in the file rather than
# by passenger. Use 'PassengerId' as the index so that label-based lookups
# (.loc) refer to the passenger's ID.
# The guard keeps this cell safe to re-run: once 'PassengerId' has become the
# index it is no longer a column, and a second set_index would raise KeyError.
if 'PassengerId' in df.columns:
    df = df.set_index('PassengerId')

In [81]:
# Per-row count of missing values, labelled by PassengerId:
# e.g. the passenger whose ID is 261 is missing 2 values.
df.isna().sum(axis='columns')

PassengerId
552    1
638    1
499    0
261    2
395    0
      ..
463    0
287    1
326    0
396    1
832    1
Length: 891, dtype: int64

In [87]:
# Slice the DataFrame with loc (label-based) or iloc (position-based).
# iloc excludes the upper bound: this keeps the last 10 rows and the columns
# at positions 0 and 1.

df_last = df.iloc[-10:, :2]
df_last

Unnamed: 0_level_0,Survived,Pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
556,0,1
236,0,3
224,0,3
598,0,3
258,1,1
463,0,1
287,1,3
326,1,1
396,0,3
832,1,2


In [93]:
# loc slices by label and includes both bounds: this returns the names from
# the row labelled 499 through the row labelled 395, in index order.
df_short_id = df['Name'].loc[499:395]
df_short_id

PassengerId
499      Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
261                                    Smith, Mr. Thomas
395    Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...
Name: Name, dtype: object

In [94]:
# Look up the name of the passenger whose ID is 552 (scalar access by label).
df.at[552, 'Name']

'Sharp, Mr. Percival James R'

# Group by
We would like to know the survival rate for each combination of passenger class and sex.

In [102]:
# Group the passengers by every (Pclass, Sex) combination found in the data,
# then count the passengers in each group.

agg_df = df.groupby(by=['Pclass', 'Sex'])
agg_df.size()

Pclass  Sex   
1       female     94
        male      122
2       female     76
        male      108
3       female    144
        male      347
dtype: int64

In [108]:
# 'Survived' is 1 for survivors and 0 otherwise, so its mean within each
# group is that group's survival rate.

agg_df['Survived'].agg('mean')

Pclass  Sex   
1       female    0.968085
        male      0.368852
2       female    0.921053
        male      0.157407
3       female    0.500000
        male      0.135447
Name: Survived, dtype: float64

More than 96% of first-class women survived, but fewer than 14% of third-class men did.