import the libraries that you will need

In [10]:
import pandas as pd
import numpy as np


Pandas has two objects, namely series and data frames

# Object Series

Object Series has one data dimension. Does not have a column name because it only has one column, and has an index

In [11]:
data = [0.25, 0.50, 0.75, 1] 

converting data into series

In [12]:
data = pd.Series(data)
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

convert from series to array

In [13]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

# displays the index.

The index is a range, where the starting point is inclusive of the range and the stop point is exclusive of the range

In [14]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [15]:
list(range(1,10))

[1, 2, 3, 4, 5, 6, 7, 8, 9]

how to call data

In [16]:
data[2]

0.75

Implicit index is the default index.

We can define the index, this is called explicit i.e. index defined.

When defining an index, the number of indexes must be equal to the number of data.

In [17]:
data = pd.Series([0.25, 0.50, 0.75, 1], index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [18]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [19]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

call data

In [10]:
# index explicit

data['a']

0.25

this is data selection

even though we have created an explicit index, we can still call the implicit index

In [13]:
# index implicit

data[3]

1.0

when the implicit index and the explicit index are the same, when we call the data, it will rely on the explicit index

In [22]:
data_2 = pd.Series([0.25, 0.50, 0.75, 1], index=[2,5,3,7])

In [23]:
data_2[2]

0.25

In [24]:
data_2[0]

KeyError: 0

we will try slicing

In [25]:
data = pd.Series([0.25, 0.50, 0.75, 1], index=['a','b','c','d'])

In [26]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

for example we will call from data b to data c

In [27]:
data['b':'c'] # index explicit

b    0.50
c    0.75
dtype: float64

but if we slicing the implicit, then only the starting point will appear. because the explicit index is a range

In [19]:
data[1:2] # index implicit

b    0.5
dtype: float64

# loc iloc

example of data with implicit index and explicit index

In [29]:
data_2 = pd.Series([0.25, 0.50, 0.75, 1], index=[2,5,3,7])
data_2

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

when we access an index, what appears is the explicit index

In [30]:
data_2[2] # index explicit : selecting

0.25

when we call index explicit from index 2 to 3. the value that appears is precisely the index implicit

In [31]:
data_2[2:3] # index implisit : slicing

3    0.75
dtype: float64

when the explicit index and implicit index are the same, there will be inconsistencies as in the case above

To overcome this inconsistency, we will use the loc and iloc rules.

loc is to call its explicit index

iloc is to call its implicit index

In [31]:
# loc

data_2.loc[3] # selecting index explicit 

0.75

In [32]:
data_2.loc[2:3] # selecting index exsplicit 

2    0.25
5    0.50
3    0.75
dtype: float64

In [33]:
# iloc

data_2.iloc[3] # selecting index implicit 

1.0

In [34]:
data_2.iloc[2:3] # selecting index implicit 

3    0.75
dtype: float64

In [35]:
dict_populations = {"Jakarta" : 750,
                "Bogor" : 490,
                "Depok" : 350,
                "Tanggerang" : 270,
                "Bekasi" : 670}

# just an example not a real number

In [43]:
dict_populations

{'Jakarta': 750, 'Bogor': 490, 'Depok': 350, 'Tanggerang': 270, 'Bekasi': 670}

In [36]:
# transform dictionary to series
population = pd.Series(dict_populations)
population

Jakarta       750
Bogor         490
Depok         350
Tanggerang    270
Bekasi        670
dtype: int64

In [37]:
population.loc['Depok']

350

In [38]:
population.iloc[2]

350

In [39]:
dict_large = {"Jakarta" : 737,
                "Bogor" : 325,
                "Depok" : 247,
                "Tanggerang" : 302,
                "Bekasi" : 355}
# this is just an example, not a real area number
dict_large

{'Jakarta': 737, 'Bogor': 325, 'Depok': 247, 'Tanggerang': 302, 'Bekasi': 355}

In [40]:
large = pd.Series(dict_large)
large

Jakarta       737
Bogor         325
Depok         247
Tanggerang    302
Bekasi        355
dtype: int64

# Data Frame

Data Frame is a collection of series, with at least one series

In [42]:
area = pd.DataFrame({'Pop':population, 'Large':large})
area

Unnamed: 0,Pop,Large
Jakarta,750,737
Bogor,490,325
Depok,350,247
Tanggerang,270,302
Bekasi,670,355


In [44]:
area['Large']

Jakarta       737
Bogor         325
Depok         247
Tanggerang    302
Bekasi        355
Name: Large, dtype: int64

In [45]:
area['Large']['Jakarta']

737

when calling data with regional.pop syntax it will appear as below

because pop is the same as the name of the function in the data frame

In [48]:
area.pop

<bound method DataFrame.pop of             Pop  Large
Jakarta     750    737
Bogor       490    325
Depok       350    247
Tanggerang  270    302
Bekasi      670    355>

then it is safer to call the data with the syntax area['Pop']

In [49]:
area['Pop']

Jakarta       750
Bogor         490
Depok         350
Tanggerang    270
Bekasi        670
Name: Pop, dtype: int64

we rename column pop with ['Pop']

In [50]:
area.loc["Depok"]

Pop      350
Large    247
Name: Depok, dtype: int64

In [51]:
area.iloc[2]

Pop      350
Large    247
Name: Depok, dtype: int64

In [52]:
area['Pop']["Jakarta":"Depok"] # index explicit

Jakarta    750
Bogor      490
Depok      350
Name: Pop, dtype: int64

In [53]:
area["Pop"].iloc[0:3] # index implicit

Jakarta    750
Bogor      490
Depok      350
Name: Pop, dtype: int64

# Load Data Titanic

In [55]:
df = pd.read_csv('Titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [56]:
# view from top data

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [66]:
# show top 10 data

df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [57]:
# view data info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [58]:
# see non null count of data

df.notnull().sum()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [59]:
# see the number of NaN from the data

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [74]:
# see the sum of the data

df.sum()

  df.sum()


PassengerId                                               397386
Survived                                                     342
Pclass                                                      2057
Name           Braund, Mr. Owen HarrisCumings, Mrs. John Brad...
Sex            malefemalefemalefemalemalemalemalemalefemalefe...
Age                                                     21205.17
SibSp                                                        466
Parch                                                        340
Ticket         A/5 21171PC 17599STON/O2. 31012821138033734503...
Fare                                                  28693.9493
dtype: object

In [60]:
# see data from the back

df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [61]:
# number of rows and columns

df.shape

(891, 12)

In [62]:
# see the column in the data

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [63]:
# see index

df.index

RangeIndex(start=0, stop=891, step=1)

In [94]:
# displays information from columns in the form of numbers

df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [95]:
# show average in age

df['Age'].mean()

29.69911764705882

In [64]:
# show number that appears most often in age

df['Age'].mode()

0    24.0
dtype: float64

In [103]:
# see the smallest number in age

df['Age'].min()

0.42

In [104]:
# see the largest number in age

df['Age'].max()

80.0

In [65]:
# misalnya melihat NaN dari column age

df[df.Age.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [67]:
#then we mask

df[df.Age.isnull()] #eg see NaN from column Age

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [68]:
#then to form data frame

df[df.Age.isnull()] #eg see NaN from column Age

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S
