# Series

## Making series

In [2]:
import pandas as pd

# from dictionary
dict_data = {'a': 1, 'b': 2, 'c':3}
series = pd.Series(dict_data)

print(series)

a    1
b    2
c    3
dtype: int64


In [3]:
type(series)

pandas.core.series.Series

In [4]:
series.index

Index(['a', 'b', 'c'], dtype='object')

In [5]:
series.values

array([1, 2, 3], dtype=int64)

딕셔너리가 아닌 리스트를 통해 시리즈를 만들 수도 있다.

In [6]:
list_data = ['a', 'b', 'c']
series_2 = pd.Series(list_data)

print(series_2)

0    a
1    b
2    c
dtype: object


In [7]:
series_3 = pd.Series(list_data, index=['index1', 'index2', 'index3'])
print(series_3)

index1    a
index2    b
index3    c
dtype: object


## Select from a series

In [8]:
capital = pd.Series({'Korea': 'Seoul',
                     'Japan': 'Tokyo',
                     'China': 'Beijing',
                     'India': 'New Delhi',
                     'Taiwan': 'Taipei',
                     'Singapore': 'Singapore'})

print(capital)

Korea            Seoul
Japan            Tokyo
China          Beijing
India        New Delhi
Taiwan          Taipei
Singapore    Singapore
dtype: object


In [9]:
capital['Korea']

'Seoul'

In [10]:
capital[['Korea', 'Taiwan']]

Korea      Seoul
Taiwan    Taipei
dtype: object

In [11]:
capital[0]

'Seoul'

In [12]:
capital[[0, 3]]

Korea        Seoul
India    New Delhi
dtype: object

In [13]:
capital[0:3]

Korea      Seoul
Japan      Tokyo
China    Beijing
dtype: object

## Calculate serieses

In [14]:
series_1 = pd.Series([1, 2, 3])
series_2 = pd.Series([4, 5, 6])

series_1 + series_2

0    5
1    7
2    9
dtype: int64

In [15]:
series_1 * 2

0    2
1    4
2    6
dtype: int64

# DataFrame

## Making DataFrames

In [16]:
dict_data = {'col1': [1,2,3], 'col2': [4,5,6], 'col3': [7,8,9]}
df = pd.DataFrame(dict_data)

df

Unnamed: 0,col1,col2,col3
0,1,4,7
1,2,5,8
2,3,6,9


In [17]:
type(df)

pandas.core.frame.DataFrame

In [18]:
df2 = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])

df2

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [19]:
df3 = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]],
                   index=['index1', 'index2', 'index3'],
                   columns=['col1', 'col2', 'col3'])
df3

Unnamed: 0,col1,col2,col3
index1,1,2,3
index2,4,5,6
index3,7,8,9


In [20]:
# delete a row and a column from DataFrame.
df3.drop('index3', axis=0, inplace=True)
df3.drop('col1', axis=1, inplace=True)

df3

Unnamed: 0,col2,col3
index1,2,3
index2,5,6


## Select rows and columns

In [21]:
dict_data = {'col1': [1,2,3,4], 'col2': [5,6,7,8],
             'col3': [9,10,11,12], 'col4': [13,14,15,16]}
df = pd.DataFrame(dict_data, index=['index1', 'index2', 'index3', 'index4'])

df

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14
index3,3,7,11,15
index4,4,8,12,16


In [22]:
df['col1']

index1    1
index2    2
index3    3
index4    4
Name: col1, dtype: int64

In [23]:
df.col1

index1    1
index2    2
index3    3
index4    4
Name: col1, dtype: int64

In [24]:
# make it as a dataframe
df[['col1']]

Unnamed: 0,col1
index1,1
index2,2
index3,3
index4,4


In [25]:
df[['col1', 'col2']]

Unnamed: 0,col1,col2
index1,1,5
index2,2,6
index3,3,7
index4,4,8


In [26]:
# DataFrame.loc['row index']
# DataFrame.iloc[location index]

df.loc['index1']

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [27]:
df.iloc[0]

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [28]:
# return as a dataframe type
df.loc[['index1']]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13


In [29]:
df.iloc[0]

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [30]:
df.iloc[[0]]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13


In [31]:
df.loc['index1':'index3']

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14
index3,3,7,11,15


In [32]:
df.iloc[0:2]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14


In [33]:
df.loc['index1', 'col1']

1

In [34]:
df.loc[['index1', 'index3'], ['col1', 'col4']]

Unnamed: 0,col1,col4
index1,1,13
index3,3,15


In [35]:
df.loc['index1':'index2', 'col1':'col3']

Unnamed: 0,col1,col2,col3
index1,1,5,9
index2,2,6,10


In [36]:
df.iloc[[0, 2], [0, 3]]

Unnamed: 0,col1,col4
index1,1,13
index3,3,15


# Explore and analysis data

In [37]:
import seaborn as sns

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [38]:
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [39]:
df.shape

(891, 15)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [41]:
df['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [42]:
df[['sex', 'survived']].value_counts()

sex     survived
male    0           468
female  1           233
male    1           109
female  0            81
Name: count, dtype: int64

In [43]:
# ratio of counts
df[['sex', 'survived']].value_counts(normalize=True).sort_index()

sex     survived
female  0           0.090909
        1           0.261504
male    0           0.525253
        1           0.122334
Name: proportion, dtype: float64

In [44]:
df['survived'].mean()

0.3838383838383838

In [45]:
df[['survived', 'age']].mean()

survived     0.383838
age         29.699118
dtype: float64

In [46]:
df['fare'].min()

0.0

In [47]:
df['fare'].max()

512.3292

In [48]:
df['fare'].mean()

32.204207968574636

In [49]:
df['fare'].median()

14.4542

# Manage missing data

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [51]:
df.head().isnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


## drop the missing values

In [52]:
# 결측치가 있는 경우 행을 모두 삭제
df.dropna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [53]:
# dropna() 메서드 내에 subset을 입력하면 해당 열 중에서 결측치가 있는 경우 행을 삭제
# axis=0은 행 방향으로 동작을 의미
df.dropna(subset= ['age'], axis=0)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [54]:
# 결측치가 있는 열을 삭제
df.dropna(axis=1)

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0,3,male,1,0,7.2500,Third,man,True,no,False
1,1,1,female,1,0,71.2833,First,woman,False,yes,False
2,1,3,female,0,0,7.9250,Third,woman,False,yes,True
3,1,1,female,1,0,53.1000,First,woman,False,yes,False
4,0,3,male,0,0,8.0500,Third,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Second,man,True,no,True
887,1,1,female,0,0,30.0000,First,woman,False,yes,True
888,0,3,female,1,2,23.4500,Third,woman,False,no,False
889,1,1,male,0,0,30.0000,First,man,True,yes,True


In [55]:
# thres=300 : 결측치가 300개 이상 갖는 열을 삭제
df.dropna(axis=1, thresh=300)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


## Replace the missing values

In [56]:
df_2 = df.copy()
df_2.head(6)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True


In [57]:
mean_age = df_2['age'].mean()
print(mean_age)

29.69911764705882


In [58]:
# 결측치 특정 값을 대체
df_2['age'].fillna(mean_age, inplace=True)

In [59]:
df_2['age'].head(6)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
Name: age, dtype: float64

In [60]:
df_2['embark_town'].fillna('Southampton', inplace=True)

In [61]:
# forward fill and backward fill
df_2['deck_ffill'] = df_2['deck'].fillna(method='ffill')
df_2['deck_bfill'] = df_2['deck'].fillna(method='bfill')

df_2[['deck', 'deck_ffill', 'deck_bfill']].head(12)

Unnamed: 0,deck,deck_ffill,deck_bfill
0,,,C
1,C,C,C
2,,C,C
3,C,C,C
4,,C,E
5,,C,E
6,E,E,E
7,,E,G
8,,E,G
9,,E,G


# Indexing

In [62]:
df = sns.load_dataset('mpg')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [63]:
# set 'name' column as an index
df.set_index('name', inplace=True)
df.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa


In [64]:
df.sort_index(inplace=True)
df.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
amc ambassador brougham,13.0,8,360.0,175.0,3821,11.0,73,usa
amc ambassador dpl,15.0,8,390.0,190.0,3850,8.5,70,usa
amc ambassador sst,17.0,8,304.0,150.0,3672,11.5,72,usa
amc concord,24.3,4,151.0,90.0,3003,20.1,80,usa
amc concord,19.4,6,232.0,90.0,3210,17.2,78,usa


In [65]:
df.sort_index(inplace=True, ascending=False)
df.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vw rabbit custom,31.9,4,89.0,71.0,1925,14.0,79,europe
vw rabbit c (diesel),44.3,4,90.0,48.0,2085,21.7,80,europe
vw rabbit,29.0,4,90.0,70.0,1937,14.2,76,europe
vw rabbit,41.5,4,98.0,76.0,2144,14.7,80,europe
vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe


In [66]:
# reset index
df.reset_index(inplace=True)
df

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,vw rabbit custom,31.9,4,89.0,71.0,1925,14.0,79,europe
1,vw rabbit c (diesel),44.3,4,90.0,48.0,2085,21.7,80,europe
2,vw rabbit,29.0,4,90.0,70.0,1937,14.2,76,europe
3,vw rabbit,41.5,4,98.0,76.0,2144,14.7,80,europe
4,vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe
...,...,...,...,...,...,...,...,...,...
393,amc concord,19.4,6,232.0,90.0,3210,17.2,78,usa
394,amc concord,24.3,4,151.0,90.0,3003,20.1,80,usa
395,amc ambassador sst,17.0,8,304.0,150.0,3672,11.5,72,usa
396,amc ambassador dpl,15.0,8,390.0,190.0,3850,8.5,70,usa


# Filtering

## Boolean indexing

In [67]:
df = sns.load_dataset('mpg')
df.tail(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
388,26.0,4,156.0,92.0,2585,14.5,82,usa,chrysler lebaron medallion
389,22.0,6,232.0,112.0,2835,14.7,82,usa,ford granada l
390,32.0,4,144.0,96.0,2665,13.9,82,japan,toyota celica gt
391,36.0,4,135.0,84.0,2370,13.0,82,usa,dodge charger 2.2
392,27.0,4,151.0,90.0,2950,17.3,82,usa,chevrolet camaro
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


In [68]:
# unique values
df['cylinders'].unique()

array([8, 4, 6, 3, 5], dtype=int64)

In [69]:
filter_bool = (df['cylinders'] == 4)
filter_bool.tail(10)

388     True
389    False
390     True
391     True
392     True
393     True
394     True
395     True
396     True
397     True
Name: cylinders, dtype: bool

In [70]:
# 행 인덱스에 boolean series를 입력하면 해당 조건을 만족하는 행만 선택
df.loc[filter_bool]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
14,24.0,4,113.0,95.0,2372,15.0,70,japan,toyota corona mark ii
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
19,26.0,4,97.0,46.0,1835,20.5,70,europe,volkswagen 1131 deluxe sedan
20,25.0,4,110.0,87.0,2672,17.5,70,europe,peugeot 504
21,24.0,4,107.0,90.0,2430,14.5,70,europe,audi 100 ls
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [71]:
# 여러 개 조건도 가능
filter_bool_2 = (df['cylinders'] == 4) & (df['horsepower'] >= 100)
df.loc[filter_bool_2, ['cylinders', 'horsepower', 'name']]

Unnamed: 0,cylinders,horsepower,name
23,4,113.0,bmw 2002
76,4,112.0,volvo 145e (sw)
120,4,112.0,volvo 144ea
122,4,110.0,saab 99le
180,4,115.0,saab 99le
207,4,102.0,volvo 245
242,4,110.0,bmw 320i
271,4,105.0,plymouth sapporo
276,4,115.0,saab 99gle
323,4,105.0,dodge colt


## isin() method

In [72]:
# name이 'fond maverick', 'ford mustang ii', 'chevrolet impala' 인 데이터 선택
filter_bool_3 = (df['name'] == 'ford maverick') | (df['name'] == 'ford mustang ii') | (df['name'] == 'chevrolet impala')
df.loc[filter_bool_3,]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
38,14.0,8,350.0,165.0,4209,12.0,71,usa,chevrolet impala
62,13.0,8,350.0,165.0,4274,12.0,72,usa,chevrolet impala
100,18.0,6,250.0,88.0,3021,16.5,73,usa,ford maverick
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick
155,15.0,6,250.0,72.0,3158,19.5,75,usa,ford maverick
166,13.0,8,302.0,129.0,3169,12.0,75,usa,ford mustang ii
193,24.0,6,200.0,81.0,3012,17.6,76,usa,ford maverick


In [73]:
filter_isin = df['name'].isin(
    ['ford maverick', 'ford mustang ii', 'chevrolet impala']
)

df.loc[filter_isin]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
38,14.0,8,350.0,165.0,4209,12.0,71,usa,chevrolet impala
62,13.0,8,350.0,165.0,4274,12.0,72,usa,chevrolet impala
100,18.0,6,250.0,88.0,3021,16.5,73,usa,ford maverick
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick
155,15.0,6,250.0,72.0,3158,19.5,75,usa,ford maverick
166,13.0,8,302.0,129.0,3169,12.0,75,usa,ford mustang ii
193,24.0,6,200.0,81.0,3012,17.6,76,usa,ford maverick


In [74]:
df.loc[filter_isin, ].sort_values('horsepower')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
155,15.0,6,250.0,72.0,3158,19.5,75,usa,ford maverick
193,24.0,6,200.0,81.0,3012,17.6,76,usa,ford maverick
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
100,18.0,6,250.0,88.0,3021,16.5,73,usa,ford maverick
166,13.0,8,302.0,129.0,3169,12.0,75,usa,ford mustang ii
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
38,14.0,8,350.0,165.0,4209,12.0,71,usa,chevrolet impala
62,13.0,8,350.0,165.0,4274,12.0,72,usa,chevrolet impala
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick


# Insert a new column

In [75]:
df['ratio'] = (df['mpg'] / df['weight']) * 100
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,ratio
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.513699
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.406174
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.523865
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.466065
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.492896


In [76]:
# 특정 열의 조건을 기반으로 새로운 열을 만들며, 조건문 함수 사용

import numpy as np

num = pd.Series([-2, -1, 1, 2])
np.where(num >= 0)

(array([2, 3], dtype=int64),)

In [77]:
np.where(num >=0, '양수', '음수')

array(['음수', '음수', '양수', '양수'], dtype='<U2')

In [78]:
# horsepower가 100미만, 100 이상, 200 이상인지 구분
import numpy as np

df['horse_power_div'] = np.where(
    df['horsepower'] < 100, '100 미만',
    np.where((df['horsepower'] >= 100) & (df['horsepower'] < 200), '100 이상',
             np.where(df['horsepower'] >= 200, '200 이상', '기타'))
)

df.head(8)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,ratio,horse_power_div
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.513699,100 이상
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.406174,100 이상
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.523865,100 이상
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.466065,100 이상
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.492896,100 이상
5,15.0,8,429.0,198.0,4341,10.0,70,usa,ford galaxie 500,0.345543,100 이상
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala,0.321543,200 이상
7,14.0,8,440.0,215.0,4312,8.5,70,usa,plymouth fury iii,0.324675,200 이상


# Merge dataframes

## concat()

In [79]:
import pandas as pd

df1 = pd.DataFrame({
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"]
}, index=[0, 1, 2, 3])

In [80]:
# 열 이름이 같은 경우
df2 = pd.DataFrame({
    "A": ["A4", "A5", "A6", "A7"],
    "B": ["B4", "B5", "B6", "B7"],
    "C": ["C4", "C5", "C6", "C7"],
    "D": ["D4", "D5", "D6", "D7"]
}, index=[4, 5, 6, 7])

df3 = pd.DataFrame({
    "A": ["A8", "A9", "A10", "A11"],
    "B": ["B8", "B9", "B10", "B11"],
    "C": ["C8", "C9", "C10", "C11"],
    "D": ["D8", "D9", "D10", "D11"]
}, index=[8, 9, 10, 11])

result = pd.concat([df1, df2, df3])
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [81]:
# 열 이름이 다른 경우
df4 = pd.DataFrame({
    "B": ["B2", "B3", "B6", "B7"],
    "D": ["D2", "D3", "D6", "D7"],
    "F": ["F2", "F3", "F6", "F7"]
}, index = [2, 3, 6, 7])

result = pd.concat([df1, df4])

result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [82]:
# reset index
result = pd.concat([df1, df4], ignore_index=True)

result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,,B2,,D2,F2
5,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [83]:
# 열기준 데이터 합치기
result = pd.concat([df1, df2], axis=1)

result

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5
6,,,,,A6,B6,C6,D6
7,,,,,A7,B7,C7,D7


In [84]:
# 교집합 기준으로 합치기
result = pd.concat([df1, df4], axis=1, join='inner')

result

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [85]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [86]:
# 데이터프레임에 시리즈 합치기
s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X")

result = pd.concat([df1, s1], axis=1)

result

Unnamed: 0,A,B,C,D,X
0,A0,B0,C0,D0,X0
1,A1,B1,C1,D1,X1
2,A2,B2,C2,D2,X2
3,A3,B3,C3,D3,X3


## merge()

merge() 함수는 기준이 되는 열이나 인덱스, 즉 key를 기준으로 두 데이터프레임을 합침
- inner join : `pd.merge(left, right, on='key', how='inner)` 
- left join : `pd.merge(left, right, on='key', how='left)`
- right join : `pd.merge(left, right, on='key', how='right')`
- outer join : `pd.merge(left, right, on='key', how='outer')`

In [87]:
left = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"]
})

right = pd.DataFrame({
    "key": ["K0", "K1", "K3", "K4"],
    "C": ["C0", "C1", "C3", "C4"],
    "D": ["D0", "D1", "D3", "D4"],
})

# inner join

result = pd.merge(left, right, on='key')

result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K3,A3,B3,C3,D3


In [88]:
# left join
result = pd.merge(left, right, on='key', how='left')

result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,,
3,K3,A3,B3,C3,D3


In [89]:
# right join
result = pd.merge(left, right, on='key', how='right')

result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K3,A3,B3,C3,D3
3,K4,,,C4,D4


In [90]:
# outer join
result = pd.merge(left, right, on='key', how='outer')
result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,,
3,K3,A3,B3,C3,D3
4,K4,,,C4,D4


In [91]:
# 기준되는 열의 이름이 다른 경우 `left_on`, `right_on`을 통해 키를 직접 선언
left = pd.DataFrame({
    "key_left": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"]
})

right = pd.DataFrame({
    "key_right": ["K0", "K1", "K3", "K4"],
    "C": ["C0", "C1", "C3", "C4"],
    "D": ["D0", "D1", "D3", "D4"],
})

result = pd.merge(left, right, left_on='key_left', 
                  right_on='key_right', how='inner')

result

Unnamed: 0,key_left,A,B,key_right,C,D
0,K0,A0,B0,K0,C0,D0
1,K1,A1,B1,K1,C1,D1
2,K3,A3,B3,K3,C3,D3


In [92]:
# left.merge(right) 
result = left.merge(right, left_on='key_left',
                    right_on='key_right', how='inner')
result

Unnamed: 0,key_left,A,B,key_right,C,D
0,K0,A0,B0,K0,C0,D0
1,K1,A1,B1,K1,C1,D1
2,K3,A3,B3,K3,C3,D3


## join()

`join()` 메서드는 두 데이터프레임의 행 인덱스를 기준으로 데이터를 결합

In [93]:
left = pd.DataFrame({
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
}, index=["K0", "K1", "K2", "K3"])

right = pd.DataFrame({
    "C": ["C0", "C1", "C3", "C4"],
    "D": ["D0", "D1", "D3", "D4"]
}, index=["K0", "K1", "K3", "K4"])

In [94]:
left

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,B3


In [95]:
right

Unnamed: 0,C,D
K0,C0,D0
K1,C1,D1
K3,C3,D3
K4,C4,D4


In [96]:
result = left.join(right)

result

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,C1,D1
K2,A2,B2,,
K3,A3,B3,C3,D3


# Data Restructuring

In [97]:
import seaborn as sns

df = sns.load_dataset('penguins')

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## `melt()`

`melt()` 함수는 ID 변수를 기준으로 원본 데이터프레임의 열 이름들을 variable 열에 넣고, 각 열에 있던 데이터는 value열에 넣어 아래로 긴 형태로 만들어 준다.

In [98]:
df.melt(id_vars=['species', 'island']).head(10)

Unnamed: 0,species,island,variable,value
0,Adelie,Torgersen,bill_length_mm,39.1
1,Adelie,Torgersen,bill_length_mm,39.5
2,Adelie,Torgersen,bill_length_mm,40.3
3,Adelie,Torgersen,bill_length_mm,
4,Adelie,Torgersen,bill_length_mm,36.7
5,Adelie,Torgersen,bill_length_mm,39.3
6,Adelie,Torgersen,bill_length_mm,38.9
7,Adelie,Torgersen,bill_length_mm,39.2
8,Adelie,Torgersen,bill_length_mm,34.1
9,Adelie,Torgersen,bill_length_mm,42.0


## `pivot_table()`

- index : 행 인덱스

- column : 열 인덱스

- values : 데이터 값

- aggfunc : 데이터 집계 함수

In [99]:
df_pivot_1 = df.pivot_table(index='species',
                            columns='island',
                            values='bill_length_mm',
                            aggfunc='mean')

df_pivot_1

island,Biscoe,Dream,Torgersen
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelie,38.975,38.501786,38.95098
Chinstrap,,48.833824,
Gentoo,47.504878,,


In [100]:
df_pivot_2 = df.pivot_table(index=['species', 'sex'],
                            columns='island',
                            values=['bill_length_mm', 'flipper_length_mm'],
                            aggfunc=['mean', 'count'])

df_pivot_2

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,mean,mean,count,count,count,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,bill_length_mm,bill_length_mm,bill_length_mm,flipper_length_mm,flipper_length_mm,flipper_length_mm,bill_length_mm,bill_length_mm,bill_length_mm,flipper_length_mm,flipper_length_mm,flipper_length_mm
Unnamed: 0_level_2,island,Biscoe,Dream,Torgersen,Biscoe,Dream,Torgersen,Biscoe,Dream,Torgersen,Biscoe,Dream,Torgersen
species,sex,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Adelie,Female,37.359091,36.911111,37.554167,187.181818,187.851852,188.291667,22.0,27.0,24.0,22.0,27.0,24.0
Adelie,Male,40.590909,40.071429,40.586957,190.409091,191.928571,194.913043,22.0,28.0,23.0,22.0,28.0,23.0
Chinstrap,Female,,46.573529,,,191.735294,,,34.0,,,34.0,
Chinstrap,Male,,51.094118,,,199.911765,,,34.0,,,34.0,
Gentoo,Female,45.563793,,,212.706897,,,58.0,,,58.0,,
Gentoo,Male,49.47377,,,221.540984,,,61.0,,,61.0,,


## `stack()`과 `unstack()`

- `stack()` : 열 인덱스를 행 인덱스로 변환

- `unstack()` : 행 인덱스를 열 인덱스로 변환

데이터 프레임 형태로 변환을 위해서는 `to_frame()` 메서드를 추가

In [101]:
df_pivot_4 = df.pivot_table(index=['species', 'sex'],
                            columns='island',
                            values='bill_length_mm',
                            aggfunc='mean')
df_pivot_4

Unnamed: 0_level_0,island,Biscoe,Dream,Torgersen
species,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,Female,37.359091,36.911111,37.554167
Adelie,Male,40.590909,40.071429,40.586957
Chinstrap,Female,,46.573529,
Chinstrap,Male,,51.094118,
Gentoo,Female,45.563793,,
Gentoo,Male,49.47377,,


In [102]:
# stack method
df_pivot_4.stack()

species    sex     island   
Adelie     Female  Biscoe       37.359091
                   Dream        36.911111
                   Torgersen    37.554167
           Male    Biscoe       40.590909
                   Dream        40.071429
                   Torgersen    40.586957
Chinstrap  Female  Dream        46.573529
           Male    Dream        51.094118
Gentoo     Female  Biscoe       45.563793
           Male    Biscoe       49.473770
dtype: float64

In [103]:
# to dataframe
df_pivot_4.stack().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
species,sex,island,Unnamed: 3_level_1
Adelie,Female,Biscoe,37.359091
Adelie,Female,Dream,36.911111
Adelie,Female,Torgersen,37.554167
Adelie,Male,Biscoe,40.590909
Adelie,Male,Dream,40.071429
Adelie,Male,Torgersen,40.586957
Chinstrap,Female,Dream,46.573529
Chinstrap,Male,Dream,51.094118
Gentoo,Female,Biscoe,45.563793
Gentoo,Male,Biscoe,49.47377


In [104]:
# unstack method
df_pivot_4.unstack()

island,Biscoe,Biscoe,Dream,Dream,Torgersen,Torgersen
sex,Female,Male,Female,Male,Female,Male
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Adelie,37.359091,40.590909,36.911111,40.071429,37.554167,40.586957
Chinstrap,,,46.573529,51.094118,,
Gentoo,45.563793,49.47377,,,,


# Apply function in the DataFrame

## functions into a Series

- series.apply(함수)

In [105]:
import seaborn as sns

sns.load_dataset('penguins')
bill_length_mm = df['bill_length_mm']

bill_length_mm.head()

0    39.1
1    39.5
2    40.3
3     NaN
4    36.7
Name: bill_length_mm, dtype: float64

In [106]:
import numpy as np
result = bill_length_mm.apply(np.sqrt)

result.head()

0    6.252999
1    6.284903
2    6.348228
3         NaN
4    6.058052
Name: bill_length_mm, dtype: float64

In [107]:
def mm_to_cm(num):
    return num / 10

result_2 = bill_length_mm.apply(mm_to_cm)
result_2

0      3.91
1      3.95
2      4.03
3       NaN
4      3.67
       ... 
339     NaN
340    4.68
341    5.04
342    4.52
343    4.99
Name: bill_length_mm, Length: 344, dtype: float64

## functions into a DataFrame

- 각 열에 적용 : DataFrame.apply(함수) 또는 Dataframe.apply(함수, axis=0)

- 각 행에 적용 : DataFrame.apply(함수, axis=1)

In [108]:
df_num = df[['bill_length_mm', 'bill_depth_mm',
             'flipper_length_mm', 'body_mass_g']]
df_num.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
3,,,,
4,36.7,19.3,193.0,3450.0


In [109]:
# max value in a column
df_num.apply(max)
# df_num.apply(max, axis=0)

bill_length_mm         59.6
bill_depth_mm          21.5
flipper_length_mm     231.0
body_mass_g          6300.0
dtype: float64

In [110]:
# max value in a row
df_num.apply(max, axis=1)

0      3750.0
1      3800.0
2      3250.0
3         NaN
4      3450.0
        ...  
339       NaN
340    4850.0
341    5750.0
342    5200.0
343    5400.0
Length: 344, dtype: float64

In [111]:
import pandas as pd

# custom function
def num_null(data):
    null_vec = pd.isnull(data)      # check if it is missing value
    null_count = np.sum(null_vec)   # sum all True value(1)s
    
    return null_count

df_num.apply(num_null)

bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
body_mass_g          2
dtype: int64

# Calculate grups

- split : 데이터를 특정 기준으로 분할

- apply :  데이터를 집계, 변환, 필터링

- combine : 적용의 결과를 하나로 결합

In [112]:
import seaborn as sns

df = sns.load_dataset('penguins')

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## groupby the groups

In [113]:
df_group = df.groupby(['species'])

df_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020632C2C2E0>

In [114]:
df_group.head(2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
152,Chinstrap,Dream,46.5,17.9,192.0,3500.0,Female
153,Chinstrap,Dream,50.0,19.5,196.0,3900.0,Male
220,Gentoo,Biscoe,46.1,13.2,211.0,4500.0,Female
221,Gentoo,Biscoe,50.0,16.3,230.0,5700.0,Male


`groupby()` 메서드 내에 기준이 되는 열을 입력하면 그룹 객체가 만들어진다. 현재는 분할만 이루어진 상태이므로 데이터를 출력해도 기존의 데이터프레임과는 크게 차이가 나지 않는다.

In [115]:
for key, group in df_group:
    print(key)
    display(group.head(2))

('Adelie',)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female


('Chinstrap',)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
152,Chinstrap,Dream,46.5,17.9,192.0,3500.0,Female
153,Chinstrap,Dream,50.0,19.5,196.0,3900.0,Male


('Gentoo',)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
220,Gentoo,Biscoe,46.1,13.2,211.0,4500.0,Female
221,Gentoo,Biscoe,50.0,16.3,230.0,5700.0,Male


# calculating according to groups

In [116]:
df_group[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].mean()

Unnamed: 0_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,38.791391,18.346358,189.953642,3700.662252
Chinstrap,48.833824,18.420588,195.823529,3733.088235
Gentoo,47.504878,14.982114,217.186992,5076.01626


In [117]:
df.groupby(['species', 'sex'])[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
species,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adelie,Female,37.257534,17.621918,187.794521,3368.835616
Adelie,Male,40.390411,19.072603,192.410959,4043.493151
Chinstrap,Female,46.573529,17.588235,191.735294,3527.205882
Chinstrap,Male,51.094118,19.252941,199.911765,3938.970588
Gentoo,Female,45.563793,14.237931,212.706897,4679.741379
Gentoo,Male,49.47377,15.718033,221.540984,5484.836066


In [118]:
def min_max(x):
    return x.max() - x.min()

df.groupby(['species'])['bill_length_mm'].agg(min_max)

species
Adelie       13.9
Chinstrap    17.1
Gentoo       18.7
Name: bill_length_mm, dtype: float64

In [119]:
df.groupby(['species'])[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].agg(['max', 'min'])

Unnamed: 0_level_0,bill_length_mm,bill_length_mm,bill_depth_mm,bill_depth_mm,flipper_length_mm,flipper_length_mm,body_mass_g,body_mass_g
Unnamed: 0_level_1,max,min,max,min,max,min,max,min
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Adelie,46.0,32.1,21.5,15.5,210.0,172.0,4775.0,2850.0
Chinstrap,58.0,40.9,20.8,16.4,212.0,178.0,4800.0,2700.0
Gentoo,59.6,40.9,17.3,13.1,231.0,203.0,6300.0,3950.0


`agg()` 메서드 내에 원하는 집계 연산을 리스트 행태로 입력하면 일괄적으로 적용이 된다. 각 열마다 다른 종류의 함수를 적용할 수도 있다.

In [120]:
df.groupby(['species']).agg({'bill_length_mm': ['max', 'min'],
                             'island': ['count']})

Unnamed: 0_level_0,bill_length_mm,bill_length_mm,island
Unnamed: 0_level_1,max,min,count
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Adelie,46.0,32.1,152
Chinstrap,58.0,40.9,68
Gentoo,59.6,40.9,124


`transform()` 메서드 이용할 경우 그룹별로 함수를 적용하는 것은 동일하지만, 그 결과를 본래의 행 인덱스와 열 인덱스를 기준으로 반환. 따라서 원본 데이터프레임과 같은 형태로 변형하여 정리.

In [121]:
df.groupby(['species'])['bill_length_mm'].transform('mean')

0      38.791391
1      38.791391
2      38.791391
3      38.791391
4      38.791391
         ...    
339    47.504878
340    47.504878
341    47.504878
342    47.504878
343    47.504878
Name: bill_length_mm, Length: 344, dtype: float64

`species`별로 그룹을 나눈 후 'bill_length_mm'열을 선택하였다. 그후 `transform()` 메서드를 통해 평균을 구하면, 각 species별 평균이 집계되는 것이 아닌 원래의 행 인덱스와 열 인덱스에 결과가 반환된다. 이러한 점을 응용해 z-score를 계산

$$z=\frac{x-\mu}{\sigma}$$

In [122]:
def z_score(x):
    z = (x - x.mean()) / x.std()
    return z

df.groupby(['species'])['bill_length_mm'].transform(z_score)

0      0.115870
1      0.266054
2      0.566421
3           NaN
4     -0.785232
         ...   
339         NaN
340   -0.228719
341    0.939408
342   -0.747886
343    0.777168
Name: bill_length_mm, Length: 344, dtype: float64

In [123]:
# apply() 메서드를 그룹 객체에 적용 가능
df.groupby(['species'])['bill_length_mm'].apply(min)

species
Adelie        NaN
Chinstrap    40.9
Gentoo        NaN
Name: bill_length_mm, dtype: float64

In [124]:
df.groupby(['species'])['bill_length_mm'].apply(z_score)

species     
Adelie   0      0.115870
         1      0.266054
         2      0.566421
         3           NaN
         4     -0.785232
                  ...   
Gentoo   339         NaN
         340   -0.228719
         341    0.939408
         342   -0.747886
         343    0.777168
Name: bill_length_mm, Length: 344, dtype: float64

In [125]:
df.groupby(['species'])['bill_length_mm'].mean()

species
Adelie       38.791391
Chinstrap    48.833824
Gentoo       47.504878
Name: bill_length_mm, dtype: float64

In [126]:
# 'bill_length_mm'의 평균이 40 이상인 그룹만 필터링
df.groupby(['species']).filter(lambda x: x['bill_length_mm'].mean() >= 40)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
152,Chinstrap,Dream,46.5,17.9,192.0,3500.0,Female
153,Chinstrap,Dream,50.0,19.5,196.0,3900.0,Male
154,Chinstrap,Dream,51.3,19.2,193.0,3650.0,Male
155,Chinstrap,Dream,45.4,18.7,188.0,3525.0,Female
156,Chinstrap,Dream,52.7,19.8,197.0,3725.0,Male
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


# Timeseries data

In [127]:
import seaborn as sns

df = sns.load_dataset('taxis')
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [129]:
df['pickup'] = pd.to_datetime(df['pickup'])
df['dropoff'] = pd.to_datetime(df['dropoff'])

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [131]:
# extract year

df['pickup']

0      2019-03-23 20:21:09
1      2019-03-04 16:11:55
2      2019-03-27 17:53:01
3      2019-03-10 01:23:59
4      2019-03-30 13:27:42
               ...        
6428   2019-03-31 09:51:53
6429   2019-03-31 17:38:00
6430   2019-03-23 22:55:18
6431   2019-03-04 10:09:25
6432   2019-03-13 19:31:22
Name: pickup, Length: 6433, dtype: datetime64[ns]

In [132]:
# extract year

df['pickup'][0]

Timestamp('2019-03-23 20:21:09')

In [133]:
# extract year

df['pickup'][0]

Timestamp('2019-03-23 20:21:09')

In [134]:
# extract year

df['pickup'][0]

Timestamp('2019-03-23 20:21:09')

# Series

## Making series

In [135]:
import pandas as pd

# from dictionary
dict_data = {'a': 1, 'b': 2, 'c':3}
series = pd.Series(dict_data)

print(series)

a    1
b    2
c    3
dtype: int64


In [136]:
type(series)

pandas.core.series.Series

In [137]:
series.index

Index(['a', 'b', 'c'], dtype='object')

In [138]:
series.values

array([1, 2, 3], dtype=int64)

딕셔너리가 아닌 리스트를 통해 시리즈를 만들 수도 있다.

In [139]:
list_data = ['a', 'b', 'c']
series_2 = pd.Series(list_data)

print(series_2)

0    a
1    b
2    c
dtype: object


In [140]:
series_3 = pd.Series(list_data, index=['index1', 'index2', 'index3'])
print(series_3)

index1    a
index2    b
index3    c
dtype: object


## Select from a series

In [141]:
capital = pd.Series({'Korea': 'Seoul',
                     'Japan': 'Tokyo',
                     'China': 'Beijing',
                     'India': 'New Delhi',
                     'Taiwan': 'Taipei',
                     'Singapore': 'Singapore'})

print(capital)

Korea            Seoul
Japan            Tokyo
China          Beijing
India        New Delhi
Taiwan          Taipei
Singapore    Singapore
dtype: object


In [142]:
capital['Korea']

'Seoul'

In [143]:
capital[['Korea', 'Taiwan']]

Korea      Seoul
Taiwan    Taipei
dtype: object

In [144]:
capital[0]

'Seoul'

In [145]:
capital[[0, 3]]

Korea        Seoul
India    New Delhi
dtype: object

In [146]:
capital[0:3]

Korea      Seoul
Japan      Tokyo
China    Beijing
dtype: object

## Calculate serieses

In [147]:
series_1 = pd.Series([1, 2, 3])
series_2 = pd.Series([4, 5, 6])

series_1 + series_2

0    5
1    7
2    9
dtype: int64

In [148]:
series_1 * 2

0    2
1    4
2    6
dtype: int64

# DataFrame

## Making DataFrames

In [149]:
dict_data = {'col1': [1,2,3], 'col2': [4,5,6], 'col3': [7,8,9]}
df = pd.DataFrame(dict_data)

df

Unnamed: 0,col1,col2,col3
0,1,4,7
1,2,5,8
2,3,6,9


In [150]:
type(df)

pandas.core.frame.DataFrame

In [151]:
df2 = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])

df2

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [152]:
df3 = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]],
                   index=['index1', 'index2', 'index3'],
                   columns=['col1', 'col2', 'col3'])
df3

Unnamed: 0,col1,col2,col3
index1,1,2,3
index2,4,5,6
index3,7,8,9


In [153]:
# delete a row and a column from DataFrame.
df3.drop('index3', axis=0, inplace=True)
df3.drop('col1', axis=1, inplace=True)

df3

Unnamed: 0,col2,col3
index1,2,3
index2,5,6


## Select rows and columns

In [154]:
dict_data = {'col1': [1,2,3,4], 'col2': [5,6,7,8],
             'col3': [9,10,11,12], 'col4': [13,14,15,16]}
df = pd.DataFrame(dict_data, index=['index1', 'index2', 'index3', 'index4'])

df

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14
index3,3,7,11,15
index4,4,8,12,16


In [155]:
df['col1']

index1    1
index2    2
index3    3
index4    4
Name: col1, dtype: int64

In [156]:
df.col1

index1    1
index2    2
index3    3
index4    4
Name: col1, dtype: int64

In [157]:
# make it as a dataframe
df[['col1']]

Unnamed: 0,col1
index1,1
index2,2
index3,3
index4,4


In [158]:
df[['col1', 'col2']]

Unnamed: 0,col1,col2
index1,1,5
index2,2,6
index3,3,7
index4,4,8


In [159]:
# DataFrame.loc['row index']
# DataFrame.iloc[location index]

df.loc['index1']

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [160]:
df.iloc[0]

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [161]:
# return as a dataframe type
df.loc[['index1']]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13


In [162]:
df.iloc[0]

col1     1
col2     5
col3     9
col4    13
Name: index1, dtype: int64

In [163]:
df.iloc[[0]]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13


In [164]:
df.loc['index1':'index3']

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14
index3,3,7,11,15


In [165]:
df.iloc[0:2]

Unnamed: 0,col1,col2,col3,col4
index1,1,5,9,13
index2,2,6,10,14


In [166]:
df.loc['index1', 'col1']

1

In [167]:
df.loc[['index1', 'index3'], ['col1', 'col4']]

Unnamed: 0,col1,col4
index1,1,13
index3,3,15


In [168]:
df.loc['index1':'index2', 'col1':'col3']

Unnamed: 0,col1,col2,col3
index1,1,5,9
index2,2,6,10


In [169]:
df.iloc[[0, 2], [0, 3]]

Unnamed: 0,col1,col4
index1,1,13
index3,3,15


# Explore and analysis data

In [170]:
import seaborn as sns

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [171]:
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [172]:
df.shape

(891, 15)

In [173]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [174]:
df['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [175]:
df[['sex', 'survived']].value_counts()

sex     survived
male    0           468
female  1           233
male    1           109
female  0            81
Name: count, dtype: int64

In [176]:
# ratio of counts
df[['sex', 'survived']].value_counts(normalize=True).sort_index()

sex     survived
female  0           0.090909
        1           0.261504
male    0           0.525253
        1           0.122334
Name: proportion, dtype: float64

In [177]:
df['survived'].mean()

0.3838383838383838

In [178]:
df[['survived', 'age']].mean()

survived     0.383838
age         29.699118
dtype: float64

In [179]:
df['fare'].min()

0.0

In [180]:
df['fare'].max()

512.3292

In [181]:
df['fare'].mean()

32.204207968574636

In [182]:
df['fare'].median()

14.4542

# Manage missing data

In [183]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [184]:
df.head().isnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


## drop the missing values

In [185]:
# 결측치가 있는 경우 행을 모두 삭제
df.dropna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [186]:
# dropna() 메서드 내에 subset을 입력하면 해당 열 중에서 결측치가 있는 경우 행을 삭제
# axis=0은 행 방향으로 동작을 의미
df.dropna(subset= ['age'], axis=0)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [187]:
# 결측치가 있는 열을 삭제
df.dropna(axis=1)

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0,3,male,1,0,7.2500,Third,man,True,no,False
1,1,1,female,1,0,71.2833,First,woman,False,yes,False
2,1,3,female,0,0,7.9250,Third,woman,False,yes,True
3,1,1,female,1,0,53.1000,First,woman,False,yes,False
4,0,3,male,0,0,8.0500,Third,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Second,man,True,no,True
887,1,1,female,0,0,30.0000,First,woman,False,yes,True
888,0,3,female,1,2,23.4500,Third,woman,False,no,False
889,1,1,male,0,0,30.0000,First,man,True,yes,True


In [188]:
# thres=300 : 결측치가 300개 이상 갖는 열을 삭제
df.dropna(axis=1, thresh=300)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


## Replace the missing values

In [189]:
df_2 = df.copy()
df_2.head(6)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True


In [190]:
mean_age = df_2['age'].mean()
print(mean_age)

29.69911764705882


In [191]:
# 결측치 특정 값을 대체
df_2['age'].fillna(mean_age, inplace=True)

In [192]:
df_2['age'].head(6)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
Name: age, dtype: float64

In [193]:
df_2['embark_town'].fillna('Southampton', inplace=True)

In [194]:
# forward fill and backward fill
df_2['deck_ffill'] = df_2['deck'].fillna(method='ffill')
df_2['deck_bfill'] = df_2['deck'].fillna(method='bfill')

df_2[['deck', 'deck_ffill', 'deck_bfill']].head(12)

Unnamed: 0,deck,deck_ffill,deck_bfill
0,,,C
1,C,C,C
2,,C,C
3,C,C,C
4,,C,E
5,,C,E
6,E,E,E
7,,E,G
8,,E,G
9,,E,G


# Indexing

In [195]:
df = sns.load_dataset('mpg')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [196]:
# set 'name' column as an index
df.set_index('name', inplace=True)
df.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa


In [197]:
df.sort_index(inplace=True)
df.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
amc ambassador brougham,13.0,8,360.0,175.0,3821,11.0,73,usa
amc ambassador dpl,15.0,8,390.0,190.0,3850,8.5,70,usa
amc ambassador sst,17.0,8,304.0,150.0,3672,11.5,72,usa
amc concord,24.3,4,151.0,90.0,3003,20.1,80,usa
amc concord,19.4,6,232.0,90.0,3210,17.2,78,usa


In [198]:
df.sort_index(inplace=True, ascending=False)
df.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vw rabbit custom,31.9,4,89.0,71.0,1925,14.0,79,europe
vw rabbit c (diesel),44.3,4,90.0,48.0,2085,21.7,80,europe
vw rabbit,29.0,4,90.0,70.0,1937,14.2,76,europe
vw rabbit,41.5,4,98.0,76.0,2144,14.7,80,europe
vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe


In [199]:
# reset index
df.reset_index(inplace=True)
df

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,vw rabbit custom,31.9,4,89.0,71.0,1925,14.0,79,europe
1,vw rabbit c (diesel),44.3,4,90.0,48.0,2085,21.7,80,europe
2,vw rabbit,29.0,4,90.0,70.0,1937,14.2,76,europe
3,vw rabbit,41.5,4,98.0,76.0,2144,14.7,80,europe
4,vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe
...,...,...,...,...,...,...,...,...,...
393,amc concord,19.4,6,232.0,90.0,3210,17.2,78,usa
394,amc concord,24.3,4,151.0,90.0,3003,20.1,80,usa
395,amc ambassador sst,17.0,8,304.0,150.0,3672,11.5,72,usa
396,amc ambassador dpl,15.0,8,390.0,190.0,3850,8.5,70,usa


# Filtering

## Boolean indexing

In [200]:
df = sns.load_dataset('mpg')
df.tail(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
388,26.0,4,156.0,92.0,2585,14.5,82,usa,chrysler lebaron medallion
389,22.0,6,232.0,112.0,2835,14.7,82,usa,ford granada l
390,32.0,4,144.0,96.0,2665,13.9,82,japan,toyota celica gt
391,36.0,4,135.0,84.0,2370,13.0,82,usa,dodge charger 2.2
392,27.0,4,151.0,90.0,2950,17.3,82,usa,chevrolet camaro
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


In [201]:
# unique values
df['cylinders'].unique()

array([8, 4, 6, 3, 5], dtype=int64)

In [202]:
filter_bool = (df['cylinders'] == 4)
filter_bool.tail(10)

388     True
389    False
390     True
391     True
392     True
393     True
394     True
395     True
396     True
397     True
Name: cylinders, dtype: bool

In [203]:
# 행 인덱스에 boolean series를 입력하면 해당 조건을 만족하는 행만 선택
df.loc[filter_bool]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
14,24.0,4,113.0,95.0,2372,15.0,70,japan,toyota corona mark ii
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
19,26.0,4,97.0,46.0,1835,20.5,70,europe,volkswagen 1131 deluxe sedan
20,25.0,4,110.0,87.0,2672,17.5,70,europe,peugeot 504
21,24.0,4,107.0,90.0,2430,14.5,70,europe,audi 100 ls
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [204]:
# 여러 개 조건도 가능
filter_bool_2 = (df['cylinders'] == 4) & (df['horsepower'] >= 100)
df.loc[filter_bool_2, ['cylinders', 'horsepower', 'name']]

Unnamed: 0,cylinders,horsepower,name
23,4,113.0,bmw 2002
76,4,112.0,volvo 145e (sw)
120,4,112.0,volvo 144ea
122,4,110.0,saab 99le
180,4,115.0,saab 99le
207,4,102.0,volvo 245
242,4,110.0,bmw 320i
271,4,105.0,plymouth sapporo
276,4,115.0,saab 99gle
323,4,105.0,dodge colt


## isin() method

In [205]:
# name이 'fond maverick', 'ford mustang ii', 'chevrolet impala' 인 데이터 선택
filter_bool_3 = (df['name'] == 'ford maverick') | (df['name'] == 'ford mustang ii') | (df['name'] == 'chevrolet impala')
df.loc[filter_bool_3,]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
38,14.0,8,350.0,165.0,4209,12.0,71,usa,chevrolet impala
62,13.0,8,350.0,165.0,4274,12.0,72,usa,chevrolet impala
100,18.0,6,250.0,88.0,3021,16.5,73,usa,ford maverick
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick
155,15.0,6,250.0,72.0,3158,19.5,75,usa,ford maverick
166,13.0,8,302.0,129.0,3169,12.0,75,usa,ford mustang ii
193,24.0,6,200.0,81.0,3012,17.6,76,usa,ford maverick


In [206]:
filter_isin = df['name'].isin(
    ['ford maverick', 'ford mustang ii', 'chevrolet impala']
)

df.loc[filter_isin]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
38,14.0,8,350.0,165.0,4209,12.0,71,usa,chevrolet impala
62,13.0,8,350.0,165.0,4274,12.0,72,usa,chevrolet impala
100,18.0,6,250.0,88.0,3021,16.5,73,usa,ford maverick
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick
155,15.0,6,250.0,72.0,3158,19.5,75,usa,ford maverick
166,13.0,8,302.0,129.0,3169,12.0,75,usa,ford mustang ii
193,24.0,6,200.0,81.0,3012,17.6,76,usa,ford maverick


In [207]:
df.loc[filter_isin, ].sort_values('horsepower')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
155,15.0,6,250.0,72.0,3158,19.5,75,usa,ford maverick
193,24.0,6,200.0,81.0,3012,17.6,76,usa,ford maverick
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
100,18.0,6,250.0,88.0,3021,16.5,73,usa,ford maverick
166,13.0,8,302.0,129.0,3169,12.0,75,usa,ford mustang ii
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
38,14.0,8,350.0,165.0,4209,12.0,71,usa,chevrolet impala
62,13.0,8,350.0,165.0,4274,12.0,72,usa,chevrolet impala
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick


# Insert a new column

In [208]:
df['ratio'] = (df['mpg'] / df['weight']) * 100
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,ratio
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.513699
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.406174
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.523865
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.466065
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.492896


In [209]:
# 특정 열의 조건을 기반으로 새로운 열을 만들며, 조건문 함수 사용

import numpy as np

num = pd.Series([-2, -1, 1, 2])
np.where(num >= 0)

(array([2, 3], dtype=int64),)

In [210]:
np.where(num >=0, '양수', '음수')

array(['음수', '음수', '양수', '양수'], dtype='<U2')

In [211]:
# horsepower가 100미만, 100 이상, 200 이상인지 구분
import numpy as np

df['horse_power_div'] = np.where(
    df['horsepower'] < 100, '100 미만',
    np.where((df['horsepower'] >= 100) & (df['horsepower'] < 200), '100 이상',
             np.where(df['horsepower'] >= 200, '200 이상', '기타'))
)

df.head(8)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,ratio,horse_power_div
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.513699,100 이상
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.406174,100 이상
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.523865,100 이상
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.466065,100 이상
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.492896,100 이상
5,15.0,8,429.0,198.0,4341,10.0,70,usa,ford galaxie 500,0.345543,100 이상
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala,0.321543,200 이상
7,14.0,8,440.0,215.0,4312,8.5,70,usa,plymouth fury iii,0.324675,200 이상


# Merge dataframes

## concat()

In [212]:
import pandas as pd

df1 = pd.DataFrame({
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"]
}, index=[0, 1, 2, 3])

In [213]:
# 열 이름이 같은 경우
df2 = pd.DataFrame({
    "A": ["A4", "A5", "A6", "A7"],
    "B": ["B4", "B5", "B6", "B7"],
    "C": ["C4", "C5", "C6", "C7"],
    "D": ["D4", "D5", "D6", "D7"]
}, index=[4, 5, 6, 7])

df3 = pd.DataFrame({
    "A": ["A8", "A9", "A10", "A11"],
    "B": ["B8", "B9", "B10", "B11"],
    "C": ["C8", "C9", "C10", "C11"],
    "D": ["D8", "D9", "D10", "D11"]
}, index=[8, 9, 10, 11])

result = pd.concat([df1, df2, df3])
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [214]:
# 열 이름이 다른 경우
df4 = pd.DataFrame({
    "B": ["B2", "B3", "B6", "B7"],
    "D": ["D2", "D3", "D6", "D7"],
    "F": ["F2", "F3", "F6", "F7"]
}, index = [2, 3, 6, 7])

result = pd.concat([df1, df4])

result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [215]:
# reset index
result = pd.concat([df1, df4], ignore_index=True)

result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,,B2,,D2,F2
5,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [216]:
# 열기준 데이터 합치기
result = pd.concat([df1, df2], axis=1)

result

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5
6,,,,,A6,B6,C6,D6
7,,,,,A7,B7,C7,D7


In [217]:
# 교집합 기준으로 합치기
result = pd.concat([df1, df4], axis=1, join='inner')

result

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [218]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [219]:
# 데이터프레임에 시리즈 합치기
s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X")

result = pd.concat([df1, s1], axis=1)

result

Unnamed: 0,A,B,C,D,X
0,A0,B0,C0,D0,X0
1,A1,B1,C1,D1,X1
2,A2,B2,C2,D2,X2
3,A3,B3,C3,D3,X3


## merge()

merge() 함수는 기준이 되는 열이나 인덱스, 즉 key를 기준으로 두 데이터프레임을 합침
- inner join : `pd.merge(left, right, on='key', how='inner)` 
- left join : `pd.merge(left, right, on='key', how='left)`
- right join : `pd.merge(left, right, on='key', how='right')`
- outer join : `pd.merge(left, right, on='key', how='outer')`

In [220]:
left = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"]
})

right = pd.DataFrame({
    "key": ["K0", "K1", "K3", "K4"],
    "C": ["C0", "C1", "C3", "C4"],
    "D": ["D0", "D1", "D3", "D4"],
})

# inner join

result = pd.merge(left, right, on='key')

result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K3,A3,B3,C3,D3


In [221]:
# left join
result = pd.merge(left, right, on='key', how='left')

result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,,
3,K3,A3,B3,C3,D3


In [222]:
# right join
result = pd.merge(left, right, on='key', how='right')

result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K3,A3,B3,C3,D3
3,K4,,,C4,D4


In [223]:
# outer join
result = pd.merge(left, right, on='key', how='outer')
result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,,
3,K3,A3,B3,C3,D3
4,K4,,,C4,D4


In [224]:
# 기준되는 열의 이름이 다른 경우 `left_on`, `right_on`을 통해 키를 직접 선언
left = pd.DataFrame({
    "key_left": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"]
})

right = pd.DataFrame({
    "key_right": ["K0", "K1", "K3", "K4"],
    "C": ["C0", "C1", "C3", "C4"],
    "D": ["D0", "D1", "D3", "D4"],
})

result = pd.merge(left, right, left_on='key_left', 
                  right_on='key_right', how='inner')

result

Unnamed: 0,key_left,A,B,key_right,C,D
0,K0,A0,B0,K0,C0,D0
1,K1,A1,B1,K1,C1,D1
2,K3,A3,B3,K3,C3,D3


In [225]:
# left.merge(right) 
result = left.merge(right, left_on='key_left',
                    right_on='key_right', how='inner')
result

Unnamed: 0,key_left,A,B,key_right,C,D
0,K0,A0,B0,K0,C0,D0
1,K1,A1,B1,K1,C1,D1
2,K3,A3,B3,K3,C3,D3


## join()

`join()` 메서드는 두 데이터프레임의 행 인덱스를 기준으로 데이터를 결합

In [226]:
left = pd.DataFrame({
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
}, index=["K0", "K1", "K2", "K3"])

right = pd.DataFrame({
    "C": ["C0", "C1", "C3", "C4"],
    "D": ["D0", "D1", "D3", "D4"]
}, index=["K0", "K1", "K3", "K4"])

In [227]:
left

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,B3


In [228]:
right

Unnamed: 0,C,D
K0,C0,D0
K1,C1,D1
K3,C3,D3
K4,C4,D4


In [229]:
result = left.join(right)

result

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,C1,D1
K2,A2,B2,,
K3,A3,B3,C3,D3


# Data Restructuring

In [230]:
import seaborn as sns

df = sns.load_dataset('penguins')

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## `melt()`

`melt()` 함수는 ID 변수를 기준으로 원본 데이터프레임의 열 이름들을 variable 열에 넣고, 각 열에 있던 데이터는 value열에 넣어 아래로 긴 형태로 만들어 준다.

In [231]:
df.melt(id_vars=['species', 'island']).head(10)

Unnamed: 0,species,island,variable,value
0,Adelie,Torgersen,bill_length_mm,39.1
1,Adelie,Torgersen,bill_length_mm,39.5
2,Adelie,Torgersen,bill_length_mm,40.3
3,Adelie,Torgersen,bill_length_mm,
4,Adelie,Torgersen,bill_length_mm,36.7
5,Adelie,Torgersen,bill_length_mm,39.3
6,Adelie,Torgersen,bill_length_mm,38.9
7,Adelie,Torgersen,bill_length_mm,39.2
8,Adelie,Torgersen,bill_length_mm,34.1
9,Adelie,Torgersen,bill_length_mm,42.0


## `pivot_table()`

- index : 행 인덱스

- column : 열 인덱스

- values : 데이터 값

- aggfunc : 데이터 집계 함수

In [232]:
df_pivot_1 = df.pivot_table(index='species',
                            columns='island',
                            values='bill_length_mm',
                            aggfunc='mean')

df_pivot_1

island,Biscoe,Dream,Torgersen
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelie,38.975,38.501786,38.95098
Chinstrap,,48.833824,
Gentoo,47.504878,,


In [233]:
df_pivot_2 = df.pivot_table(index=['species', 'sex'],
                            columns='island',
                            values=['bill_length_mm', 'flipper_length_mm'],
                            aggfunc=['mean', 'count'])

df_pivot_2

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,mean,mean,count,count,count,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,bill_length_mm,bill_length_mm,bill_length_mm,flipper_length_mm,flipper_length_mm,flipper_length_mm,bill_length_mm,bill_length_mm,bill_length_mm,flipper_length_mm,flipper_length_mm,flipper_length_mm
Unnamed: 0_level_2,island,Biscoe,Dream,Torgersen,Biscoe,Dream,Torgersen,Biscoe,Dream,Torgersen,Biscoe,Dream,Torgersen
species,sex,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Adelie,Female,37.359091,36.911111,37.554167,187.181818,187.851852,188.291667,22.0,27.0,24.0,22.0,27.0,24.0
Adelie,Male,40.590909,40.071429,40.586957,190.409091,191.928571,194.913043,22.0,28.0,23.0,22.0,28.0,23.0
Chinstrap,Female,,46.573529,,,191.735294,,,34.0,,,34.0,
Chinstrap,Male,,51.094118,,,199.911765,,,34.0,,,34.0,
Gentoo,Female,45.563793,,,212.706897,,,58.0,,,58.0,,
Gentoo,Male,49.47377,,,221.540984,,,61.0,,,61.0,,


## `stack()`과 `unstack()`

- `stack()` : 열 인덱스를 행 인덱스로 변환

- `unstack()` : 행 인덱스를 열 인덱스로 변환

데이터 프레임 형태로 변환을 위해서는 `to_frame()` 메서드를 추가

In [234]:
df_pivot_4 = df.pivot_table(index=['species', 'sex'],
                            columns='island',
                            values='bill_length_mm',
                            aggfunc='mean')
df_pivot_4

Unnamed: 0_level_0,island,Biscoe,Dream,Torgersen
species,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,Female,37.359091,36.911111,37.554167
Adelie,Male,40.590909,40.071429,40.586957
Chinstrap,Female,,46.573529,
Chinstrap,Male,,51.094118,
Gentoo,Female,45.563793,,
Gentoo,Male,49.47377,,


In [235]:
# stack method
df_pivot_4.stack()

species    sex     island   
Adelie     Female  Biscoe       37.359091
                   Dream        36.911111
                   Torgersen    37.554167
           Male    Biscoe       40.590909
                   Dream        40.071429
                   Torgersen    40.586957
Chinstrap  Female  Dream        46.573529
           Male    Dream        51.094118
Gentoo     Female  Biscoe       45.563793
           Male    Biscoe       49.473770
dtype: float64

In [236]:
# to dataframe
df_pivot_4.stack().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
species,sex,island,Unnamed: 3_level_1
Adelie,Female,Biscoe,37.359091
Adelie,Female,Dream,36.911111
Adelie,Female,Torgersen,37.554167
Adelie,Male,Biscoe,40.590909
Adelie,Male,Dream,40.071429
Adelie,Male,Torgersen,40.586957
Chinstrap,Female,Dream,46.573529
Chinstrap,Male,Dream,51.094118
Gentoo,Female,Biscoe,45.563793
Gentoo,Male,Biscoe,49.47377


In [237]:
# unstack method
df_pivot_4.unstack()

island,Biscoe,Biscoe,Dream,Dream,Torgersen,Torgersen
sex,Female,Male,Female,Male,Female,Male
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Adelie,37.359091,40.590909,36.911111,40.071429,37.554167,40.586957
Chinstrap,,,46.573529,51.094118,,
Gentoo,45.563793,49.47377,,,,


# Apply function in the DataFrame

## functions into a Series

- series.apply(함수)

In [238]:
import seaborn as sns

sns.load_dataset('penguins')
bill_length_mm = df['bill_length_mm']

bill_length_mm.head()

0    39.1
1    39.5
2    40.3
3     NaN
4    36.7
Name: bill_length_mm, dtype: float64

In [239]:
import numpy as np
result = bill_length_mm.apply(np.sqrt)

result.head()

0    6.252999
1    6.284903
2    6.348228
3         NaN
4    6.058052
Name: bill_length_mm, dtype: float64

In [240]:
def mm_to_cm(num):
    return num / 10

result_2 = bill_length_mm.apply(mm_to_cm)
result_2

0      3.91
1      3.95
2      4.03
3       NaN
4      3.67
       ... 
339     NaN
340    4.68
341    5.04
342    4.52
343    4.99
Name: bill_length_mm, Length: 344, dtype: float64

## functions into a DataFrame

- 각 열에 적용 : DataFrame.apply(함수) 또는 Dataframe.apply(함수, axis=0)

- 각 행에 적용 : DataFrame.apply(함수, axis=1)

In [241]:
df_num = df[['bill_length_mm', 'bill_depth_mm',
             'flipper_length_mm', 'body_mass_g']]
df_num.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
3,,,,
4,36.7,19.3,193.0,3450.0


In [242]:
# max value in a column
df_num.apply(max)
# df_num.apply(max, axis=0)

bill_length_mm         59.6
bill_depth_mm          21.5
flipper_length_mm     231.0
body_mass_g          6300.0
dtype: float64

In [243]:
# max value in a row
df_num.apply(max, axis=1)

0      3750.0
1      3800.0
2      3250.0
3         NaN
4      3450.0
        ...  
339       NaN
340    4850.0
341    5750.0
342    5200.0
343    5400.0
Length: 344, dtype: float64

In [244]:
import pandas as pd

# custom function
def num_null(data):
    null_vec = pd.isnull(data)      # check if it is missing value
    null_count = np.sum(null_vec)   # sum all True value(1)s
    
    return null_count

df_num.apply(num_null)

bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
body_mass_g          2
dtype: int64

# Calculate grups

- split : 데이터를 특정 기준으로 분할

- apply :  데이터를 집계, 변환, 필터링

- combine : 적용의 결과를 하나로 결합

In [245]:
import seaborn as sns

df = sns.load_dataset('penguins')

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## groupby the groups

In [246]:
df_group = df.groupby(['species'])

df_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020632E52B50>

In [247]:
df_group.head(2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
152,Chinstrap,Dream,46.5,17.9,192.0,3500.0,Female
153,Chinstrap,Dream,50.0,19.5,196.0,3900.0,Male
220,Gentoo,Biscoe,46.1,13.2,211.0,4500.0,Female
221,Gentoo,Biscoe,50.0,16.3,230.0,5700.0,Male


`groupby()` 메서드 내에 기준이 되는 열을 입력하면 그룹 객체가 만들어진다. 현재는 분할만 이루어진 상태이므로 데이터를 출력해도 기존의 데이터프레임과는 크게 차이가 나지 않는다.

In [248]:
for key, group in df_group:
    print(key)
    display(group.head(2))

('Adelie',)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female


('Chinstrap',)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
152,Chinstrap,Dream,46.5,17.9,192.0,3500.0,Female
153,Chinstrap,Dream,50.0,19.5,196.0,3900.0,Male


('Gentoo',)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
220,Gentoo,Biscoe,46.1,13.2,211.0,4500.0,Female
221,Gentoo,Biscoe,50.0,16.3,230.0,5700.0,Male


# calculating according to groups

In [249]:
df_group[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].mean()

Unnamed: 0_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,38.791391,18.346358,189.953642,3700.662252
Chinstrap,48.833824,18.420588,195.823529,3733.088235
Gentoo,47.504878,14.982114,217.186992,5076.01626


In [250]:
df.groupby(['species', 'sex'])[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
species,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adelie,Female,37.257534,17.621918,187.794521,3368.835616
Adelie,Male,40.390411,19.072603,192.410959,4043.493151
Chinstrap,Female,46.573529,17.588235,191.735294,3527.205882
Chinstrap,Male,51.094118,19.252941,199.911765,3938.970588
Gentoo,Female,45.563793,14.237931,212.706897,4679.741379
Gentoo,Male,49.47377,15.718033,221.540984,5484.836066


In [251]:
def min_max(x):
    return x.max() - x.min()

df.groupby(['species'])['bill_length_mm'].agg(min_max)

species
Adelie       13.9
Chinstrap    17.1
Gentoo       18.7
Name: bill_length_mm, dtype: float64

In [252]:
df.groupby(['species'])[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].agg(['max', 'min'])

Unnamed: 0_level_0,bill_length_mm,bill_length_mm,bill_depth_mm,bill_depth_mm,flipper_length_mm,flipper_length_mm,body_mass_g,body_mass_g
Unnamed: 0_level_1,max,min,max,min,max,min,max,min
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Adelie,46.0,32.1,21.5,15.5,210.0,172.0,4775.0,2850.0
Chinstrap,58.0,40.9,20.8,16.4,212.0,178.0,4800.0,2700.0
Gentoo,59.6,40.9,17.3,13.1,231.0,203.0,6300.0,3950.0


`agg()` 메서드 내에 원하는 집계 연산을 리스트 행태로 입력하면 일괄적으로 적용이 된다. 각 열마다 다른 종류의 함수를 적용할 수도 있다.

In [253]:
df.groupby(['species']).agg({'bill_length_mm': ['max', 'min'],
                             'island': ['count']})

Unnamed: 0_level_0,bill_length_mm,bill_length_mm,island
Unnamed: 0_level_1,max,min,count
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Adelie,46.0,32.1,152
Chinstrap,58.0,40.9,68
Gentoo,59.6,40.9,124


`transform()` 메서드 이용할 경우 그룹별로 함수를 적용하는 것은 동일하지만, 그 결과를 본래의 행 인덱스와 열 인덱스를 기준으로 반환. 따라서 원본 데이터프레임과 같은 형태로 변형하여 정리.

In [254]:
df.groupby(['species'])['bill_length_mm'].transform('mean')

0      38.791391
1      38.791391
2      38.791391
3      38.791391
4      38.791391
         ...    
339    47.504878
340    47.504878
341    47.504878
342    47.504878
343    47.504878
Name: bill_length_mm, Length: 344, dtype: float64

`species`별로 그룹을 나눈 후 'bill_length_mm'열을 선택하였다. 그후 `transform()` 메서드를 통해 평균을 구하면, 각 species별 평균이 집계되는 것이 아닌 원래의 행 인덱스와 열 인덱스에 결과가 반환된다. 이러한 점을 응용해 z-score를 계산

$$z=\frac{x-\mu}{\sigma}$$

In [255]:
def z_score(x):
    z = (x - x.mean()) / x.std()
    return z

df.groupby(['species'])['bill_length_mm'].transform(z_score)

0      0.115870
1      0.266054
2      0.566421
3           NaN
4     -0.785232
         ...   
339         NaN
340   -0.228719
341    0.939408
342   -0.747886
343    0.777168
Name: bill_length_mm, Length: 344, dtype: float64

In [256]:
# apply() 메서드를 그룹 객체에 적용 가능
df.groupby(['species'])['bill_length_mm'].apply(min)

species
Adelie        NaN
Chinstrap    40.9
Gentoo        NaN
Name: bill_length_mm, dtype: float64

In [257]:
df.groupby(['species'])['bill_length_mm'].apply(z_score)

species     
Adelie   0      0.115870
         1      0.266054
         2      0.566421
         3           NaN
         4     -0.785232
                  ...   
Gentoo   339         NaN
         340   -0.228719
         341    0.939408
         342   -0.747886
         343    0.777168
Name: bill_length_mm, Length: 344, dtype: float64

In [258]:
df.groupby(['species'])['bill_length_mm'].mean()

species
Adelie       38.791391
Chinstrap    48.833824
Gentoo       47.504878
Name: bill_length_mm, dtype: float64

In [259]:
# 'bill_length_mm'의 평균이 40 이상인 그룹만 필터링
df.groupby(['species']).filter(lambda x: x['bill_length_mm'].mean() >= 40)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
152,Chinstrap,Dream,46.5,17.9,192.0,3500.0,Female
153,Chinstrap,Dream,50.0,19.5,196.0,3900.0,Male
154,Chinstrap,Dream,51.3,19.2,193.0,3650.0,Male
155,Chinstrap,Dream,45.4,18.7,188.0,3525.0,Female
156,Chinstrap,Dream,52.7,19.8,197.0,3725.0,Male
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


# Timeseries data

In [260]:
import seaborn as sns

df = sns.load_dataset('taxis')
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [261]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [262]:
df['pickup'] = pd.to_datetime(df['pickup'])
df['dropoff'] = pd.to_datetime(df['dropoff'])

In [263]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [264]:
# extract year

df['pickup']

0      2019-03-23 20:21:09
1      2019-03-04 16:11:55
2      2019-03-27 17:53:01
3      2019-03-10 01:23:59
4      2019-03-30 13:27:42
               ...        
6428   2019-03-31 09:51:53
6429   2019-03-31 17:38:00
6430   2019-03-23 22:55:18
6431   2019-03-04 10:09:25
6432   2019-03-13 19:31:22
Name: pickup, Length: 6433, dtype: datetime64[ns]

In [265]:
# extract year

df['pickup'][0]

Timestamp('2019-03-23 20:21:09')

In [266]:
# extract year

df['pickup'][0]

Timestamp('2019-03-23 20:21:09')

In [267]:
# extract year

df['pickup'][0].year

2019

dt 접근자를 사용하면 datetime 타입의 열에 한번에 접근 가능

In [268]:
df['year'] = df['pickup'].dt.year
df['month'] = df['pickup'].dt.month
df['day'] = df['pickup'].dt.day

df[['pickup', 'year', 'month', 'day']].head()

Unnamed: 0,pickup,year,month,day
0,2019-03-23 20:21:09,2019,3,23
1,2019-03-04 16:11:55,2019,3,4
2,2019-03-27 17:53:01,2019,3,27
3,2019-03-10 01:23:59,2019,3,10
4,2019-03-30 13:27:42,2019,3,30


In [269]:
# sort table
df.sort_values('pickup', inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough,year,month,day
0,2019-02-28 23:29:03,2019-02-28 23:32:35,1,0.9,5.0,0.0,0.0,6.3,green,cash,Old Astoria,Long Island City/Queens Plaza,Queens,Queens,2019,2,28
1,2019-03-01 00:03:29,2019-03-01 00:13:32,3,2.16,10.0,2.0,0.0,15.8,yellow,credit card,Lincoln Square East,Upper East Side North,Manhattan,Manhattan,2019,3,1
2,2019-03-01 00:08:32,2019-03-01 00:29:47,3,7.35,22.5,1.0,0.0,27.3,yellow,credit card,East Chelsea,Mott Haven/Port Morris,Manhattan,Bronx,2019,3,1
3,2019-03-01 00:15:53,2019-03-01 00:47:58,1,7.0,25.5,7.3,0.0,36.6,yellow,credit card,West Village,Astoria,Manhattan,Queens,2019,3,1
4,2019-03-01 00:29:22,2019-03-01 00:32:48,4,0.74,4.5,1.0,0.0,9.3,yellow,credit card,Meatpacking/West Village West,Meatpacking/West Village West,Manhattan,Manhattan,2019,3,1


In [270]:
# 운행 시간 pickup - dropoff

df['dropoff'] - df['pickup']

0      0 days 00:03:32
1      0 days 00:10:03
2      0 days 00:21:15
3      0 days 00:32:05
4      0 days 00:03:26
             ...      
6428   0 days 00:09:13
6429   0 days 00:02:18
6430   0 days 00:03:17
6431   0 days 00:12:09
6432   0 days 00:30:13
Length: 6433, dtype: timedelta64[ns]

In [271]:
# set 'pickup' column as an index
df.set_index('pickup', inplace=True)
df.head()

Unnamed: 0_level_0,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough,year,month,day
pickup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-02-28 23:29:03,2019-02-28 23:32:35,1,0.9,5.0,0.0,0.0,6.3,green,cash,Old Astoria,Long Island City/Queens Plaza,Queens,Queens,2019,2,28
2019-03-01 00:03:29,2019-03-01 00:13:32,3,2.16,10.0,2.0,0.0,15.8,yellow,credit card,Lincoln Square East,Upper East Side North,Manhattan,Manhattan,2019,3,1
2019-03-01 00:08:32,2019-03-01 00:29:47,3,7.35,22.5,1.0,0.0,27.3,yellow,credit card,East Chelsea,Mott Haven/Port Morris,Manhattan,Bronx,2019,3,1
2019-03-01 00:15:53,2019-03-01 00:47:58,1,7.0,25.5,7.3,0.0,36.6,yellow,credit card,West Village,Astoria,Manhattan,Queens,2019,3,1
2019-03-01 00:29:22,2019-03-01 00:32:48,4,0.74,4.5,1.0,0.0,9.3,yellow,credit card,Meatpacking/West Village West,Meatpacking/West Village West,Manhattan,Manhattan,2019,3,1


In [272]:
# check index type
df.index

DatetimeIndex(['2019-02-28 23:29:03', '2019-03-01 00:03:29',
               '2019-03-01 00:08:32', '2019-03-01 00:15:53',
               '2019-03-01 00:29:22', '2019-03-01 00:30:59',
               '2019-03-01 00:32:49', '2019-03-01 00:53:00',
               '2019-03-01 00:56:50', '2019-03-01 01:25:30',
               ...
               '2019-03-31 21:27:22', '2019-03-31 21:35:29',
               '2019-03-31 21:40:28', '2019-03-31 21:55:23',
               '2019-03-31 22:07:15', '2019-03-31 22:13:37',
               '2019-03-31 22:32:27', '2019-03-31 22:51:53',
               '2019-03-31 23:15:03', '2019-03-31 23:43:45'],
              dtype='datetime64[ns]', name='pickup', length=6433, freq=None)

In [273]:
df.loc['2019-02']

Unnamed: 0_level_0,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough,year,month,day
pickup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-02-28 23:29:03,2019-02-28 23:32:35,1,0.9,5.0,0.0,0.0,6.3,green,cash,Old Astoria,Long Island City/Queens Plaza,Queens,Queens,2019,2,28


In [274]:
df.loc['2019-03-01':'2019-03-02']

Unnamed: 0_level_0,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough,year,month,day
pickup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-03-01 00:03:29,2019-03-01 00:13:32,3,2.16,10.0,2.00,0.0,15.80,yellow,credit card,Lincoln Square East,Upper East Side North,Manhattan,Manhattan,2019,3,1
2019-03-01 00:08:32,2019-03-01 00:29:47,3,7.35,22.5,1.00,0.0,27.30,yellow,credit card,East Chelsea,Mott Haven/Port Morris,Manhattan,Bronx,2019,3,1
2019-03-01 00:15:53,2019-03-01 00:47:58,1,7.00,25.5,7.30,0.0,36.60,yellow,credit card,West Village,Astoria,Manhattan,Queens,2019,3,1
2019-03-01 00:29:22,2019-03-01 00:32:48,4,0.74,4.5,1.00,0.0,9.30,yellow,credit card,Meatpacking/West Village West,Meatpacking/West Village West,Manhattan,Manhattan,2019,3,1
2019-03-01 00:30:59,2019-03-01 00:37:39,2,1.35,7.0,0.00,0.0,8.30,green,cash,Astoria,Queensbridge/Ravenswood,Queens,Queens,2019,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-03-02 23:51:46,2019-03-02 23:51:49,1,0.00,14.0,2.86,0.0,17.16,yellow,credit card,Lenox Hill East,Lenox Hill East,Manhattan,Manhattan,2019,3,2
2019-03-02 23:52:13,2019-03-03 00:03:03,2,1.30,8.5,2.45,0.0,14.75,yellow,credit card,Greenwich Village North,Gramercy,Manhattan,Manhattan,2019,3,2
2019-03-02 23:52:21,2019-03-03 00:06:59,1,3.53,13.0,0.00,0.0,16.80,yellow,cash,Manhattan Valley,Garment District,Manhattan,Manhattan,2019,3,2
2019-03-02 23:55:24,2019-03-03 00:04:02,1,1.60,8.0,2.36,0.0,14.16,yellow,credit card,Central Park,Upper West Side North,Manhattan,Manhattan,2019,3,2


## create timeseries data

In [275]:
pd.date_range(start='2021-01-01',
              end='2021-12-31',
              freq='M')

DatetimeIndex(['2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30',
               '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31'],
              dtype='datetime64[ns]', freq='M')

In [276]:
pd.date_range(start='2021-01-01',
              end='2021-01-31',
              freq='3D')

DatetimeIndex(['2021-01-01', '2021-01-04', '2021-01-07', '2021-01-10',
               '2021-01-13', '2021-01-16', '2021-01-19', '2021-01-22',
               '2021-01-25', '2021-01-28', '2021-01-31'],
              dtype='datetime64[ns]', freq='3D')

In [277]:
# 매주 월요일
pd.date_range(start='2021-01-01',
              end='2021-01-31',
              freq='W-MON')

DatetimeIndex(['2021-01-04', '2021-01-11', '2021-01-18', '2021-01-25'], dtype='datetime64[ns]', freq='W-MON')

In [279]:
# WOM : week of month
# 2THU : 둘째주 목요일
# WOM-2THU : 매월 둘째주 목요일

pd.date_range(start='2021-01-01',
              end='2021-12-31',
              freq='WOM-2THU')

DatetimeIndex(['2021-01-14', '2021-02-11', '2021-03-11', '2021-04-08',
               '2021-05-13', '2021-06-10', '2021-07-08', '2021-08-12',
               '2021-09-09', '2021-10-14', '2021-11-11', '2021-12-09'],
              dtype='datetime64[ns]', freq='WOM-2THU')