# EDA with Pandas

This tutorial covers only the basics of exploratory data analysis with pandas; it is not a complete reference.

In [None]:
import numpy as np
import pandas as pd
# smoke test: reaching this line means the imports above succeeded
print("Hello")

In [2]:
# load the train and test CSV files of the Titanic data into dataframes;
# the relative paths resolve against the kernel's working directory
train_df = pd.read_csv(filepath_or_buffer='../Titanic/train.csv')
test_df = pd.read_csv(filepath_or_buffer='../Titanic/test.csv')

In [3]:
# preview the first five rows (five is also what head() returns by default)
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# preview the first 10 rows instead (any row count can be passed to head)
train_df.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# data dimensionality as a (rows, columns) tuple
train_df.shape

(891, 12)

In [6]:
# feature names, i.e. the column labels of the dataframe
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
# output some general information: dtype and non-null count per column, plus memory usage
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [8]:
# describe() shows basic statistical characteristics (count, mean, std, min,
# quartiles, max) of each numerical feature; non-numeric columns are left out
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# distinct values that occur in the Sex column
train_df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [10]:
# how many times each unique value occurs
train_df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [11]:
# mean, minimum and maximum of the Age feature (missing ages are skipped)
print("mean age = ", train_df['Age'].mean())
print("min age = ", train_df['Age'].min())
print("max age = ", train_df['Age'].max())

mean age =  29.69911764705882
min age =  0.42
max age =  80.0


In [12]:
# mean age of the passengers who survived
# (one .loc lookup: boolean row mask first, column label second)
train_df.loc[train_df["Survived"] == 1, "Age"].mean()

28.343689655172415

In [13]:
# new dataframe holding only the rows where Survived equals 1
survived_df = train_df[train_df['Survived'].eq(1)]

In [14]:
# every passenger in this dataframe survived, so 1 is the only value present
survived_df['Survived'].unique()

array([1])

In [15]:
# the same row selection can be written with the label-based indexer .loc
survived2_df = train_df.loc[train_df['Survived'].eq(1)]
survived2_df['Survived'].unique()

array([1])

In [16]:
# remove both temporary dataframes from the namespace
del survived_df, survived2_df

In [17]:
# Grouping
# 1. First, groupby splits the rows by the values of the grouping columns;
#    those values become the index of the resulting dataframe.
# 2. Then the columns of interest are selected.
#    If no columns are selected, all non-grouping columns are included.
# 3. Finally, one or several functions are applied to each group for the selected columns.
columns_to_show = ['Age', 'Pclass', 'Fare']
train_df.groupby(by=['Survived'])[columns_to_show].describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Pclass,Pclass,Pclass,Pclass,Pclass,Fare,Fare,Fare,Fare,Fare,Fare,Fare,Fare
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,424.0,30.626179,14.17211,1.0,21.0,28.0,39.0,74.0,549.0,2.531876,...,3.0,3.0,549.0,22.117887,31.388207,0.0,7.8542,10.5,26.0,263.0
1,290.0,28.34369,14.950952,0.42,19.0,28.0,36.0,80.0,342.0,1.950292,...,3.0,3.0,342.0,48.395408,66.596998,0.0,12.475,26.0,57.0,512.3292


In [18]:
# the same thing can be done in another way (let's, for example, check mean() and std())
# the aggregations are given by name: passing np.mean / np.std to agg() is deprecated
# in recent pandas releases, while the strings keep selecting the groupby mean/std
train_df.groupby(['Survived'])[columns_to_show].agg(['mean', 'std'])

Unnamed: 0_level_0,Age,Age,Pclass,Pclass,Fare,Fare
Unnamed: 0_level_1,mean,std,mean,std,mean,std
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,30.626179,14.17211,2.531876,0.735805,22.117887,31.388207
1,28.34369,14.950952,1.950292,0.863321,48.395408,66.596998


In [19]:
# a new column is created by assigning to a new label;
# here every age is divided by the largest age in the data
train_df['ScaledAge'] = train_df['Age'].div(train_df['Age'].max())

In [20]:
# look at the first five rows again; ScaledAge is now the last column
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ScaledAge
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.275
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.475
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.325
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.4375
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.4375


In [21]:
# drop() removes rows or columns by label; here the "Age" column is removed
# axis=1 means the labels refer to columns
# inplace=True modifies train_df itself instead of returning a new dataframe
train_df.drop(labels=['Age'], axis=1, inplace=True)
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ScaledAge
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,,S,0.275
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C85,C,0.475
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,,S,0.325
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,C123,S,0.4375
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,,S,0.4375


In [22]:
# check the number of missing values in each column
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
ScaledAge      177
dtype: int64

In [23]:
# Fill the missing ScaledAge values with the median of the column.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train_df.ScaledAge is a chained in-place update, which pandas warns about and
# which no longer modifies train_df once Copy-on-Write is enabled.
train_df['ScaledAge'] = train_df['ScaledAge'].fillna(train_df['ScaledAge'].median(skipna=True))
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
ScaledAge        0
dtype: int64