In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
pd.Series([1200, 2000, 3000, 4000])

0    1200
1    2000
2    3000
3    4000
dtype: int64

dtype menjelaskan tipe data dari nilai-nilai yang disimpan di dalam Series.

Angka 64 menunjukkan ukuran tiap nilai, yaitu 64 bit (int64 = bilangan bulat 64-bit).

In [4]:
pd.Series(["Aziz", "Dharmawan", "Apray"])

0         Aziz
1    Dharmawan
2        Apray
dtype: object

In [5]:
pd.Series([12.5, 10.1, 9.3, 8.6])

0    12.5
1    10.1
2     9.3
3     8.6
dtype: float64

In [6]:
pd.Series([5, "Aziz", 12.5])

0       5
1    Aziz
2    12.5
dtype: object

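Tipe data object memakan memori lebih besar daripada tipe numerik seperti int64 atau float64.
Secara default, data string (dan Series berisi campuran tipe seperti contoh di atas) disimpan sebagai object.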

In [7]:
revenues = pd.Series([5555, 7000, 1980])

In [8]:
revenues

0    5555
1    7000
2    1980
dtype: int64

In [9]:
revenues.index

RangeIndex(start=0, stop=3, step=1)

In [10]:
revenues.values

array([5555, 7000, 1980], dtype=int64)

Sintaks dasar: `pd.Series([values], index=[daftar label index])`

In [14]:
pd.Series([10,20,30,40,50], index=['A', 'B', 'C', 'D', 'E'])

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [11]:
# Revenue per city: the city names replace the default 0..n-1 index labels.
city_revenues = pd.Series(data=[4200, 8000, 6500], index=["Amsterdam", "Toronto", "Tokyo"])
city_revenues

Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64

In [12]:
city_revenues.index

Index(['Amsterdam', 'Toronto', 'Tokyo'], dtype='object')

In [17]:
# Built from a dictionary (key: value): the keys become the index labels
# and the values become the data.
pd.Series({'Amsterdam': 10, 'Jakarta': 20, 'Tokyo': 50})

Amsterdam    10
Jakarta      20
Tokyo        50
dtype: int64

In [15]:
city_employee_count = pd.Series({"Amsterdam": 5, "Tokyo": 8})
city_employee_count

Amsterdam    5
Tokyo        8
dtype: int64

In [19]:
city_employee_count.values

array([5, 8], dtype=int64)

In [20]:
city_employee_count.index

Index(['Amsterdam', 'Tokyo'], dtype='object')

In [21]:
city_employee_count.keys()

Index(['Amsterdam', 'Tokyo'], dtype='object')

In [22]:
"Tokyo" in city_employee_count

True

In [23]:
"Jakarta" in city_employee_count

False

## Data Frame

In [24]:
# basic syntax
# pd.DataFrame({
#         'column name': value,
#         'column name': value})

In [26]:
# Build a DataFrame from two Series: each dict key becomes a column name and
# the rows are aligned on the Series' index labels. Toronto has no entry in
# city_employee_count, so its employee_count is missing (NaN) in the result.
city_data = pd.DataFrame({
    "revenue": city_revenues,
    "employee_count": city_employee_count
})

In [27]:
city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,


In [28]:
city_data.index

Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')

In [29]:
city_data.values

array([[4.2e+03, 5.0e+00],
       [6.5e+03, 8.0e+00],
       [8.0e+03,     nan]])

In [30]:
city_data.axes

[Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'),
 Index(['revenue', 'employee_count'], dtype='object')]

In [32]:
# axes[0] = vertical axis (rows)       -> the row labels, i.e. the index
# axes[1] = horizontal axis (columns)  -> the column labels (not the values)

In [33]:
city_data.axes[0]

Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')

In [34]:
city_data.axes[1]

Index(['revenue', 'employee_count'], dtype='object')

In [35]:
city_data.keys()

Index(['revenue', 'employee_count'], dtype='object')

In [37]:
city_data.columns

Index(['revenue', 'employee_count'], dtype='object')

In [40]:
city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,


In [36]:
"Amsterdam" in city_data

False

In [41]:
"revenue" in city_data

True

In [42]:
# For a DataFrame, `in` checks the column names, not the index labels.
# For a Series, `in` checks the index labels.

## Accessing Series Elements

In [43]:
city_revenues

Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64

In [44]:
city_revenues["Amsterdam"]

4200

In [46]:
# Positional access goes through .iloc. Plain [0] on a Series with string
# labels relies on a deprecated positional fallback (FutureWarning since
# pandas 2.1), so it is not safe to keep in a notebook meant to be re-run.
city_revenues.iloc[0]

4200

In [56]:
city_revenues[:2]

Amsterdam    4200
Toronto      8000
dtype: int64

In [50]:
city_revenues[-2:]

Toronto    8000
Tokyo      6500
dtype: int64

In [54]:
city_revenues[:]

Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64

In [51]:
city_revenues[:"Tokyo"]

Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64

## .loc and .iloc

In [58]:
# Integer labels that deliberately differ from the 0..4 positions, so that
# label-based and position-based lookups give different answers.
colors = pd.Series(data=["red", "purple", "blue", "green", "yellow"], index=[1, 2, 3, 5, 8])

In [59]:
colors

1       red
2    purple
3      blue
5     green
8    yellow
dtype: object

In [62]:
colors[1]

'red'

In [65]:
colors.iloc[1]      # posisi indeks

'purple'

In [79]:
colors.loc[1]       # label

'red'

In [67]:
colors

1       red
2    purple
3      blue
5     green
8    yellow
dtype: object

In [70]:
colors.iloc[1:3]

2    purple
3      blue
dtype: object

In [71]:
colors.loc[1:3]

1       red
2    purple
3      blue
dtype: object

In [72]:
colors.iloc[:4]

1       red
2    purple
3      blue
5     green
dtype: object

In [73]:
colors.loc[:4]

1       red
2    purple
3      blue
dtype: object

In [77]:
colors.iloc[-2]

'green'

In [78]:
# .loc is label-based: -2 is not one of the labels (1, 2, 3, 5, 8), so this
# lookup raises KeyError. The error is the point of the demonstration, so it
# is caught and printed to let the notebook keep running from top to bottom.
try:
    colors.loc[-2]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: -2

In [74]:
city_revenues

Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64

In [81]:
city_revenues.loc[:"Toronto"]

Amsterdam    4200
Toronto      8000
dtype: int64

## Accessing DataFrame Elements

In [82]:
city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,


In [83]:
city_data["revenue"]

Amsterdam    4200
Tokyo        6500
Toronto      8000
Name: revenue, dtype: int64

In [84]:
city_data["employee_count"]

Amsterdam    5.0
Tokyo        8.0
Toronto      NaN
Name: employee_count, dtype: float64

In [85]:
city_data.employee_count

Amsterdam    5.0
Tokyo        8.0
Toronto      NaN
Name: employee_count, dtype: float64

In [86]:
# Two toys described column by column; note that one column is itself named
# "shape", which matters for attribute-style access on the frame.
toys = pd.DataFrame({
    "name": ["ball", "Rubik's cube"],
    "shape": ["sphere", "cube"],
})

In [87]:
toys

Unnamed: 0,name,shape
0,ball,sphere
1,Rubik's cube,cube


In [88]:
toys["shape"]

0    sphere
1      cube
Name: shape, dtype: object

In [89]:
toys.shape

(2, 2)

### .loc and .iloc

In [90]:
city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,


In [91]:
city_data.loc["Amsterdam"]

revenue           4200.0
employee_count       5.0
Name: Amsterdam, dtype: float64

In [92]:
city_data.loc["Tokyo": "Toronto"]

Unnamed: 0,revenue,employee_count
Tokyo,6500,8.0
Toronto,8000,


In [93]:
city_data.iloc[1]

revenue           6500.0
employee_count       8.0
Name: Tokyo, dtype: float64

In [96]:
city_data.loc["Amsterdam": "Tokyo", "revenue"]

Amsterdam    4200
Tokyo        6500
Name: revenue, dtype: int64

In [106]:
city_data.loc["Amsterdam": "Tokyo", ["employee_count"]]

Unnamed: 0,employee_count
Amsterdam,5.0
Tokyo,8.0


In [97]:
city_revenues

Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64

In [98]:
city_revenues.sum()

18700

In [100]:
city_revenues.min()

4200

In [101]:
city_revenues.max()

8000

In [102]:
city_revenues.mean()

6233.333333333333

## Combining Multiple Datasets

In [107]:
# Two more cities, written row by row: one [revenue, employee_count] pair each.
further_city_data = pd.DataFrame(
    [[7000, 2], [3400, 2]],
    index=["New York", "Barcelona"],
    columns=["revenue", "employee_count"],
)

In [108]:
further_city_data

Unnamed: 0,revenue,employee_count
New York,7000,2
Barcelona,3400,2


In [109]:
city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,


In [114]:
# basic syntax
# pd.concat([first_data, second_data])

In [117]:
all_city_data = pd.concat([city_data, further_city_data], sort=False)

# sort=False leaves the non-concatenation axis (here: the columns) in its
# existing order instead of sorting it; for large data this is preferable
# because the sorting step takes extra time.

In [118]:
all_city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,
New York,7000,2.0
Barcelona,3400,2.0


In [119]:
# Country of each city plus a 0/1 `capital` column.
city_countries = pd.DataFrame(
    data={
        "country": ["Holland", "Japan", "Holland", "Canada", "Spain"],
        "capital": [1, 1, 0, 0, 0],
    },
    index=["Amsterdam", "Tokyo", "Rotterdam", "Toronto", "Barcelona"],
)

In [120]:
city_countries

Unnamed: 0,country,capital
Amsterdam,Holland,1
Tokyo,Japan,1
Rotterdam,Holland,0
Toronto,Canada,0
Barcelona,Spain,0


In [121]:
all_city_data

Unnamed: 0,revenue,employee_count
Amsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,
New York,7000,2.0
Barcelona,3400,2.0


In [154]:
cities = pd.concat([all_city_data, city_countries], axis=1, sort=False)

# axis=1 joins the tables side by side (adds columns, rows matched on the index)
# axis=0 stacks the tables on top of each other (adds rows)

In [155]:
cities

Unnamed: 0,revenue,employee_count,country,capital
Amsterdam,4200.0,5.0,Holland,1.0
Tokyo,6500.0,8.0,Japan,1.0
Toronto,8000.0,,Canada,0.0
New York,7000.0,2.0,,
Barcelona,3400.0,2.0,Spain,0.0
Rotterdam,,,Holland,0.0


In [149]:
cities = pd.concat([all_city_data, city_countries], axis=1, join="inner", sort=False)

In [150]:
cities

Unnamed: 0,revenue,employee_count,country,capital
Amsterdam,4200,5.0,Holland,1
Tokyo,6500,8.0,Japan,1
Toronto,8000,,Canada,0
Barcelona,3400,2.0,Spain,0


In [151]:
# Per-country facts, indexed by country name so they can be matched against
# a country column in another table.
countries = pd.DataFrame(
    data={
        "population_millions": [17, 127, 37],
        "continent": ["Europe", "Asia", "North America"],
    },
    index=["Holland", "Japan", "Canada"],
)

In [152]:
countries

Unnamed: 0,population_millions,continent
Holland,17,Europe
Japan,127,Asia
Canada,37,North America


In [156]:
cities

Unnamed: 0,revenue,employee_count,country,capital
Amsterdam,4200.0,5.0,Holland,1.0
Tokyo,6500.0,8.0,Japan,1.0
Toronto,8000.0,,Canada,0.0
New York,7000.0,2.0,,
Barcelona,3400.0,2.0,Spain,0.0
Rotterdam,,,Holland,0.0


In [158]:
# Match each city's `country` value against the index of `countries`.
# The default join is inner, so rows without a matching country are dropped.
cities.merge(countries, left_on="country", right_index=True)

Unnamed: 0,revenue,employee_count,country,capital,population_millions,continent
Amsterdam,4200.0,5.0,Holland,1.0,17,Europe
Rotterdam,,,Holland,0.0,17,Europe
Tokyo,6500.0,8.0,Japan,1.0,127,Asia
Toronto,8000.0,,Canada,0.0,37,North America


In [160]:
# Left join: keep every row of `cities`; cities whose country has no match in
# `countries` get missing values in the added columns.
cities.merge(countries, how="left", left_on="country", right_index=True)

Unnamed: 0,revenue,employee_count,country,capital,population_millions,continent
Amsterdam,4200.0,5.0,Holland,1.0,17.0,Europe
Tokyo,6500.0,8.0,Japan,1.0,127.0,Asia
Toronto,8000.0,,Canada,0.0,37.0,North America
New York,7000.0,2.0,,,,
Barcelona,3400.0,2.0,Spain,0.0,,
Rotterdam,,,Holland,0.0,17.0,Europe
