# Pandas 란?

- 파이썬에서 데이터를 간편하게 다룰 수 있도록 만들어진 라이브러리
- **2차원 데이터**에 대한 집계, 전처리 등을 쉽게 할 수 있게 해준다.
- **정형 데이터**를 다루는 라이브러리


# pandas의 데이터 구조
- DataFrame : 2차원 구조로 되어있는 형식 (행렬 데이터)
- Series : 1차원 구조로 되어 있는 한 종류의 데이터 (열 방향 데이터)
- Series가 여러 개 모이면 DataFrame이 된다.

In [None]:
import pandas as pd

- DataFrame 만들기


In [None]:
# Row-oriented input: each inner list is one record, given in column order
# (name, age, height).
data = [["A군", 30, 170], ["B군", 25, 180]]
df = pd.DataFrame(data=data, columns=["이름", "나이", "키"])
df

Unnamed: 0,이름,나이,키
0,A군,30,170
1,B군,25,180


In [None]:
type(df)

pandas.core.frame.DataFrame

- Series

In [None]:
df["이름"]

0    A군
1    B군
Name: 이름, dtype: object

In [None]:
type(df["이름"])

pandas.core.series.Series

In [None]:
# Column-oriented input: every key is a column name and every value is the
# list of entries that fills that column from top to bottom.
data = {}
data["이름"] = ["A군", "B군", "C군"]
data["나이"] = [35, 33, 28]
data["키"] = [180.1, 175.5, 163.3]
pd.DataFrame.from_dict(data)

Unnamed: 0,이름,나이,키
0,A군,35,180.1
1,B군,33,175.5
2,C군,28,163.3


# CSV 파일 불러오기
- https://drive.google.com/file/d/1UCwmBEjjzS8D5VeLvLXNR81lkWtVNrEo/view?usp=sharing

- read_csv 함수
    - csv 파일 읽어서 dataframe 객체로 변환하기

In [None]:
df = pd.read_csv("titanic.csv")
df

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## 타이타닉 호 침몰 당시의 승객 명단 데이터
- passengerid: 승객번호
- survived: 생존 여부
- pclass: 티켓 등급
- name : 이름
- gender: 성별
- age: 나이
- sibsp: 함께 탑승한 형제자매, 배우자의 수
- parch: 함께 탑승한 부모, 자식의 수
- ticket: 티켓 번호
- fare: 운임
- cabin: 객실 번호
- embarked: 탑승 항구

# 데이터 프레임 기초 정보 확인하기

- 컬럼명 확인

In [None]:
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

- 데이터 프레임 정보 확인하기

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  1309 non-null   int64  
 1   survived     1309 non-null   int64  
 2   pclass       1309 non-null   int64  
 3   name         1309 non-null   object 
 4   gender       1309 non-null   object 
 5   age          1046 non-null   float64
 6   sibsp        1309 non-null   int64  
 7   parch        1309 non-null   int64  
 8   ticket       1309 non-null   object 
 9   fare         1308 non-null   float64
 10  cabin        295 non-null    object 
 11  embarked     1307 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


- 수치형 컬럼에 대한 요약 통계 보기

In [None]:
df.describe()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.377387,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.484918,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [None]:
df.shape

(1309, 12)

In [None]:
df.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
df.tail()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


# 데이터프레임 다루기

- 컬럼명 변경하기

In [None]:
# 한번에 변경하기
cols = ["승객번호","생존여부","티켓등급","이름","성별","나이","형제자매_배우자의수","부모_자식의수","티켓번호","운임","객실번호","탑승항구"]
df.columns = cols
df.head()

Unnamed: 0,승객번호,생존여부,티켓등급,이름,성별,나이,형제자매_배우자의수,부모_자식의수,티켓번호,운임,객실번호,탑승항구
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- rename 메소드
    - 지정한 컬럼명들을 변경할 수 있다.
    - 딕셔너리 형태로 전달하면 된다.
    - key 는 변경전 열이름, value는 변경후 열이름

In [None]:
cols_rename = {
    "생존여부" : "생존"
}
df = df.rename(columns= cols_rename)
df.head()

Unnamed: 0,승객번호,생존,티켓등급,이름,성별,나이,형제자매_배우자의수,부모_자식의수,티켓번호,운임,객실번호,탑승항구
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- add_prefix
    - 컬럼명 앞부분에 공통된 문자열 붙여주기

In [None]:
df = df.add_prefix("타이타닉_")
df.head()

Unnamed: 0,타이타닉_승객번호,타이타닉_생존,타이타닉_티켓등급,타이타닉_이름,타이타닉_성별,타이타닉_나이,타이타닉_형제자매_배우자의수,타이타닉_부모_자식의수,타이타닉_티켓번호,타이타닉_운임,타이타닉_객실번호,타이타닉_탑승항구
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- add_suffix
    - 컬럼명 뒷부분에 공통된 문자열 붙여주기

In [None]:
df = df.add_suffix("_컬럼")
df.head()

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- 특정 컬럼들만 선택하기

In [None]:
cols = ["타이타닉_생존_컬럼","타이타닉_나이_컬럼","타이타닉_탑승항구_컬럼"]
df[cols].head()

Unnamed: 0,타이타닉_생존_컬럼,타이타닉_나이_컬럼,타이타닉_탑승항구_컬럼
0,0,22.0,S
1,1,38.0,C
2,1,26.0,S
3,1,35.0,S
4,0,35.0,S


- 컬럼 삭제하기

In [None]:
df.drop("타이타닉_승객번호_컬럼",axis=1) # 열방향

Unnamed: 0,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
df.drop(["타이타닉_승객번호_컬럼","타이타닉_생존_컬럼"],axis=1) # 여러개

Unnamed: 0,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
df.drop(columns=["타이타닉_승객번호_컬럼","타이타닉_생존_컬럼"]) # 키워드 아규먼트를 이용

Unnamed: 0,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


- 컬럼을 기준으로 데이터 정렬하기(기본은 오름차순)

In [None]:
df.sort_values(by="타이타닉_나이_컬럼") # ascending sort; NaN ages are not dropped, they are placed at the end (na_position="last" is the default)

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
1245,1246,1,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.5750,,S
1092,1093,0,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4000,,S
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5000,,S
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
1299,1300,1,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
1301,1302,1,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
df2 = df.sort_values(by="타이타닉_나이_컬럼" , ascending=False) # 내림차순
df2

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
987,988,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1,0,19877,78.8500,C46,S
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
...,...,...,...,...,...,...,...,...,...,...,...,...
1299,1300,1,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
1301,1302,1,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# 행 열 다루기
- 데이터프레임에서는 행에 대한 이름을 index라고 하고, 열에 대한 이름을 column이라고 한다.
- DataFrame도 numpy 기반으로 동작하기 때문에 이름과 별개로 0부터 시작하는 행 번호, 열 번호가 있다.


In [None]:
import numpy as np
np.arange(1,7).reshape([2,3])

array([[1, 2, 3],
       [4, 5, 6]])

- `iloc`
    - 행번호 , 열번호 를 이용한 행, 열 슬라이싱


In [None]:
df2.iloc[:5,1:5]

Unnamed: 0,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼
630,1,1,"Barkworth, Mr. Algernon Henry Wilson",male
987,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female
851,0,3,"Svensson, Mr. Johan",male
493,0,1,"Artagaveytia, Mr. Ramon",male
96,0,1,"Goldschmidt, Mr. George B",male


- 인덱스를 담은 리스트를 이용하여 행열 선택이 가능

In [None]:
num_rows = [1,3]
num_cols = [0,4,5]
df2.iloc[num_rows,num_cols]

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼
987,988,female,76.0
493,494,male,71.0


- `loc`
    - index명과 column명을 이용한 행, 열 슬라이싱 (슬라이스의 끝 라벨도 결과에 포함된다)
    - 마스킹을 이용한 행과 열 선택이 가능하다.

In [None]:
df

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
df.loc[:3,:"타이타닉_성별_컬럼"]

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼
0,1,0,3,"Braund, Mr. Owen Harris",male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female
2,3,1,3,"Heikkinen, Miss. Laina",female
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female


- 인덱스명과 컬럼명을 이용한 선택

In [None]:
idx = [0,2,3]
cols = ["타이타닉_승객번호_컬럼" ,"타이타닉_티켓등급_컬럼", "타이타닉_이름_컬럼"]

df.loc[idx,cols]

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼
0,1,3,"Braund, Mr. Owen Harris"
2,3,3,"Heikkinen, Miss. Laina"
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"


- 인덱스 새로 생성하기

In [None]:
df2.reset_index() # 기존 인덱스가 컬럼이 되고 새로운 인덱스가 생성된다.

Unnamed: 0,index,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
1,987,988,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1,0,19877,78.8500,C46,S
2,851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
3,493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
4,96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1299,1300,1,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
1305,1301,1302,1,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q
1306,1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1307,1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
df2 = df2.reset_index(drop=True) # drop=True 옵션을 주면 기존 인덱스를 삭제하고 새로운 인덱스를 생성한다.
df2

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
1,988,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1,0,19877,78.8500,C46,S
2,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
3,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
4,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1300,1,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
1305,1302,1,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q
1306,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


- 마스킹을 이용한 방법
    - `iloc`는 불리언 Series를 마스크로 받지 못하므로 `loc`(또는 `df[mask]`)를 사용해야 한다.

In [None]:
mask = df["타이타닉_생존_컬럼"] == 1 # 생존자만 찾아보기
mask

0       False
1        True
2        True
3        True
4       False
        ...  
1304    False
1305     True
1306    False
1307    False
1308    False
Name: 타이타닉_생존_컬럼, Length: 1309, dtype: bool

In [None]:
df.loc[mask]

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
1300,1301,1,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.7750,,S
1301,1302,1,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q
1302,1303,1,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,C78,Q
1303,1304,1,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.7750,,S


# 데이터 형식에 기반한 열선택

In [None]:
df.select_dtypes(include="float64")

Unnamed: 0,타이타닉_나이_컬럼,타이타닉_운임_컬럼
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
1304,,8.0500
1305,39.0,108.9000
1306,38.5,7.2500
1307,,8.0500


In [None]:
df.select_dtypes(include="number")

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_운임_컬럼
0,1,0,3,22.0,1,0,7.2500
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.9250
3,4,1,1,35.0,1,0,53.1000
4,5,0,3,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
1304,1305,0,3,,0,0,8.0500
1305,1306,1,1,39.0,0,0,108.9000
1306,1307,0,3,38.5,0,0,7.2500
1307,1308,0,3,,0,0,8.0500


In [None]:
df.select_dtypes(include="object")

Unnamed: 0,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_티켓번호_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
...,...,...,...,...,...
1304,"Spector, Mr. Woolf",male,A.5. 3236,,S
1305,"Oliva y Ocana, Dona. Fermina",female,PC 17758,C105,C
1306,"Saether, Mr. Simon Sivertsen",male,SOTON/O.Q. 3101262,,S
1307,"Ware, Mr. Frederick",male,359309,,S


# 데이터 집계 하기

In [None]:
df["타이타닉_운임_컬럼"].mean() # 평균

33.29547928134557

In [None]:
df["타이타닉_운임_컬럼"].median() # 중앙값

14.4542

In [None]:
df["타이타닉_운임_컬럼"].var() # 분산

2678.959737892891

In [None]:
df["타이타닉_운임_컬럼"].std() # 표준편차

51.75866823917411

In [None]:
df["타이타닉_운임_컬럼"].max()  # 최대값

512.3292

In [None]:
df["타이타닉_운임_컬럼"].min()  # 최소값

0.0

In [None]:
df["타이타닉_운임_컬럼"].quantile([0.25,0.5,0.75]) # 분위수 계산

0.25     7.8958
0.50    14.4542
0.75    31.2750
Name: 타이타닉_운임_컬럼, dtype: float64

In [None]:
df["타이타닉_탑승항구_컬럼"].nunique() # 고유값 개수

3

In [None]:
df["타이타닉_탑승항구_컬럼"].unique() # 고유값 목록

array(['S', 'C', 'Q', nan], dtype=object)

In [None]:
df["타이타닉_탑승항구_컬럼"].mode() # 최빈값

0    S
dtype: object

In [None]:
df["타이타닉_탑승항구_컬럼"].value_counts() # 고유값 별로 개수 세기

S    914
C    270
Q    123
Name: 타이타닉_탑승항구_컬럼, dtype: int64

# 결측치 다루기

In [None]:
df.isnull() # 해당값에 결측치가 있을경우 True 없을경우 False

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,False,False,False,False,False,True,False,False,False,False,True,False
1305,False,False,False,False,False,False,False,False,False,False,False,False
1306,False,False,False,False,False,False,False,False,False,False,True,False
1307,False,False,False,False,False,True,False,False,False,False,True,False


In [None]:
df.isnull().sum() # 컬럼별 결측치 개수 확인

타이타닉_승객번호_컬럼             0
타이타닉_생존_컬럼               0
타이타닉_티켓등급_컬럼             0
타이타닉_이름_컬럼               0
타이타닉_성별_컬럼               0
타이타닉_나이_컬럼             263
타이타닉_형제자매_배우자의수_컬럼       0
타이타닉_부모_자식의수_컬럼          0
타이타닉_티켓번호_컬럼             0
타이타닉_운임_컬럼               1
타이타닉_객실번호_컬럼          1014
타이타닉_탑승항구_컬럼             2
dtype: int64

In [None]:
df.dropna(how="any") # 행의 결측치가 하나라도 있으면 제거(기본옵션)

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1295,1296,0,1,"Frauenthal, Mr. Isaac Gerald",male,43.0,1,0,17765,27.7208,D40,C
1296,1297,0,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20.0,0,0,SC/PARIS 2166,13.8625,D38,C
1298,1299,0,1,"Widener, Mr. George Dunton",male,50.0,1,1,113503,211.5000,C80,C
1302,1303,1,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,C78,Q


In [None]:
df.dropna(how="all") # 행에 모든 값이 결측치 일경우 제거

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# 문자열 다루기
- object 타입 Series의 `str` 접근자(accessor)를 통해 다양한 문자열 메소드를 사용할 수 있다.

In [None]:
df["타이타닉_이름_컬럼"].str.strip() # 앞뒤 공백제거

0                                 Braund, Mr. Owen Harris
1       Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                  Heikkinen, Miss. Laina
3            Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                                Allen, Mr. William Henry
                              ...                        
1304                                   Spector, Mr. Woolf
1305                         Oliva y Ocana, Dona. Fermina
1306                         Saether, Mr. Simon Sivertsen
1307                                  Ware, Mr. Frederick
1308                             Peter, Master. Michael J
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: object

In [None]:
df["타이타닉_이름_컬럼"].str.replace(",","!") # 문자열 바꾸기

0                                 Braund! Mr. Owen Harris
1       Cumings! Mrs. John Bradley (Florence Briggs Th...
2                                  Heikkinen! Miss. Laina
3            Futrelle! Mrs. Jacques Heath (Lily May Peel)
4                                Allen! Mr. William Henry
                              ...                        
1304                                   Spector! Mr. Woolf
1305                         Oliva y Ocana! Dona. Fermina
1306                         Saether! Mr. Simon Sivertsen
1307                                  Ware! Mr. Frederick
1308                             Peter! Master. Michael J
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: object

In [None]:
df["타이타닉_이름_컬럼"].str.split() # 문자열 나누기

0                            [Braund,, Mr., Owen, Harris]
1       [Cumings,, Mrs., John, Bradley, (Florence, Bri...
2                              [Heikkinen,, Miss., Laina]
3       [Futrelle,, Mrs., Jacques, Heath, (Lily, May, ...
4                           [Allen,, Mr., William, Henry]
                              ...                        
1304                               [Spector,, Mr., Woolf]
1305                   [Oliva, y, Ocana,, Dona., Fermina]
1306                    [Saether,, Mr., Simon, Sivertsen]
1307                              [Ware,, Mr., Frederick]
1308                        [Peter,, Master., Michael, J]
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: object

In [None]:
df["타이타닉_이름_컬럼"].str.split(",")

0                              [Braund,  Mr. Owen Harris]
1       [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                               [Heikkinen,  Miss. Laina]
3         [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                             [Allen,  Mr. William Henry]
                              ...                        
1304                                [Spector,  Mr. Woolf]
1305                      [Oliva y Ocana,  Dona. Fermina]
1306                      [Saether,  Mr. Simon Sivertsen]
1307                               [Ware,  Mr. Frederick]
1308                          [Peter,  Master. Michael J]
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: object

In [None]:
df["타이타닉_이름_컬럼"].str.len() # 문자열 길이측정

0       23
1       51
2       22
3       44
4       24
        ..
1304    18
1305    28
1306    28
1307    19
1308    24
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: int64

In [None]:
df["타이타닉_이름_컬럼"].str[:10] # 문자열 슬라이싱

0       Braund, Mr
1       Cumings, M
2       Heikkinen,
3       Futrelle, 
4       Allen, Mr.
           ...    
1304    Spector, M
1305    Oliva y Oc
1306    Saether, M
1307    Ware, Mr. 
1308    Peter, Mas
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: object

In [None]:
df["타이타닉_이름_컬럼"].str.lower() # 소문자 변환

0                                 braund, mr. owen harris
1       cumings, mrs. john bradley (florence briggs th...
2                                  heikkinen, miss. laina
3            futrelle, mrs. jacques heath (lily may peel)
4                                allen, mr. william henry
                              ...                        
1304                                   spector, mr. woolf
1305                         oliva y ocana, dona. fermina
1306                         saether, mr. simon sivertsen
1307                                  ware, mr. frederick
1308                             peter, master. michael j
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: object

In [None]:
df["타이타닉_이름_컬럼"].str.upper() # 대문자 변환

0                                 BRAUND, MR. OWEN HARRIS
1       CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
2                                  HEIKKINEN, MISS. LAINA
3            FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)
4                                ALLEN, MR. WILLIAM HENRY
                              ...                        
1304                                   SPECTOR, MR. WOOLF
1305                         OLIVA Y OCANA, DONA. FERMINA
1306                         SAETHER, MR. SIMON SIVERTSEN
1307                                  WARE, MR. FREDERICK
1308                             PETER, MASTER. MICHAEL J
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: object

In [None]:
# Pattern test on every name. NOTE: str.contains interprets the pattern as a
# regular expression by default (regex=True), so the "." in "Mr." matches any
# character and names containing "Mrs" are reported as True as well (rows 1
# and 3 in the recorded output). Pass regex=False to test for the literal
# text "Mr." only.
df["타이타닉_이름_컬럼"].str.contains("Mr.")  # True where the pattern is found

0        True
1        True
2       False
3        True
4        True
        ...  
1304     True
1305    False
1306     True
1307     True
1308    False
Name: 타이타닉_이름_컬럼, Length: 1309, dtype: bool

# groupby 사용하기
- 데이터를 그룹별로 묶어 집계하고 가공할 때 꼭 필요한 기능이다.


In [None]:
df.groupby("타이타닉_탑승항구_컬럼")["타이타닉_운임_컬럼"].mean()

타이타닉_탑승항구_컬럼
C    62.336267
Q    12.409012
S    27.418824
Name: 타이타닉_운임_컬럼, dtype: float64

In [None]:
df.groupby("타이타닉_탑승항구_컬럼")["타이타닉_운임_컬럼"].agg("mean")

타이타닉_탑승항구_컬럼
C    62.336267
Q    12.409012
S    27.418824
Name: 타이타닉_운임_컬럼, dtype: float64

In [None]:
agg_dict = {"타이타닉_운임_컬럼":"mean"}
df.groupby("타이타닉_탑승항구_컬럼").agg(agg_dict)

Unnamed: 0_level_0,타이타닉_운임_컬럼
타이타닉_탑승항구_컬럼,Unnamed: 1_level_1
C,62.336267
Q,12.409012
S,27.418824


In [None]:
agg_dict = {"타이타닉_운임_컬럼":"mean","타이타닉_나이_컬럼":"median"}
df.groupby("타이타닉_탑승항구_컬럼").agg(agg_dict)

Unnamed: 0_level_0,타이타닉_운임_컬럼,타이타닉_나이_컬럼
타이타닉_탑승항구_컬럼,Unnamed: 1_level_1,Unnamed: 2_level_1
C,62.336267,30.0
Q,12.409012,26.5
S,27.418824,28.0


# 데이터 프레임 병합하기

- concat
    - 수직 또는 수평으로 병합
    - `pd.concat` 함수에 데이터프레임을 리스트에 담아서 전달

In [None]:
pd.concat([df,df]) # 수직 병합

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
pd.concat([df,df],axis=1) # 수평 병합

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,...,타이타닉_티켓등급_컬럼.1,타이타닉_이름_컬럼.1,타이타닉_성별_컬럼.1,타이타닉_나이_컬럼.1,타이타닉_형제자매_배우자의수_컬럼.1,타이타닉_부모_자식의수_컬럼.1,타이타닉_티켓번호_컬럼.1,타이타닉_운임_컬럼.1,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,...,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,...,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,...,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,...,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


- merge
```python
pd.merge(left, right, on='기준 컬럼', how='left')
```

In [None]:
agg_dict = {"타이타닉_운임_컬럼":"mean"}
right_df = df.groupby("타이타닉_탑승항구_컬럼").agg(agg_dict)
right_df

Unnamed: 0_level_0,타이타닉_운임_컬럼
타이타닉_탑승항구_컬럼,Unnamed: 1_level_1
C,62.336267
Q,12.409012
S,27.418824


In [None]:
right_df = right_df.reset_index()
right_df

Unnamed: 0,타이타닉_탑승항구_컬럼,타이타닉_운임_컬럼
0,C,62.336267
1,Q,12.409012
2,S,27.418824


In [None]:
right_df = right_df.rename(columns = {"타이타닉_운임_컬럼":"항구별_운임평균"})
right_df

Unnamed: 0,타이타닉_탑승항구_컬럼,항구별_운임평균
0,C,62.336267
1,Q,12.409012
2,S,27.418824


In [None]:
pd.merge(df, right_df, on='타이타닉_탑승항구_컬럼', how='left')

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼,항구별_운임평균
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,27.418824
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,62.336267
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,27.418824
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,27.418824
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,27.418824
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,27.418824
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,62.336267
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,27.418824
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,27.418824


# apply 메소드 사용해보기
- Series, DataFrame에 단순한 집계가 아닌 **구체적인 로직**을 적용하고 싶을 때 활용
- 단순 집계뿐 아니라 각 데이터에 대한 조건 검사 등 복잡한 처리가 가능
- apply는 대상이 되는 데이터 구조(Series, DataFrame, groupby)에 따라 함수에 전달되는 값이 달라지며, DataFrame에서는 축(axis) 개념이 있다.

- Series apply의 경우 축 개념이 없고, 값을 하나씩 함수에 전달하여 처리한다.

In [None]:
# Encode gender as an integer flag: 1 for "male", 0 for every other value.
df["타이타닉_성별_컬럼"].apply(lambda gender: int(gender == "male"))

0       1
1       0
2       0
3       0
4       1
       ..
1304    1
1305    0
1306    1
1307    1
1308    1
Name: 타이타닉_성별_컬럼, Length: 1309, dtype: int64

- DataFrame apply의 경우 축 개념이 있다.
    - `axis = 0` : 행 방향으로 진행하며, 컬럼(열)을 하나씩 Series로 전달한다. (기본값)
    - `axis = 1` : 열 방향으로 진행하며, 행을 하나씩 Series로 전달한다.

In [None]:
def do_apply(x):
    """Print the shape of the object handed in by ``apply`` and return it unchanged.

    Used to inspect what pandas passes to the callback: the printed shape
    shows whether ``x`` is a column, a row, or a whole sub-frame.
    """
    dims = x.shape
    print(dims)
    return x
tmp = df.apply(do_apply) # 행방향

(1309,)
(1309,)
(1309,)
(1309,)
(1309,)
(1309,)
(1309,)
(1309,)
(1309,)
(1309,)
(1309,)
(1309,)


In [None]:
df2 = df.iloc[:5]
df2

Unnamed: 0,타이타닉_승객번호_컬럼,타이타닉_생존_컬럼,타이타닉_티켓등급_컬럼,타이타닉_이름_컬럼,타이타닉_성별_컬럼,타이타닉_나이_컬럼,타이타닉_형제자매_배우자의수_컬럼,타이타닉_부모_자식의수_컬럼,타이타닉_티켓번호_컬럼,타이타닉_운임_컬럼,타이타닉_객실번호_컬럼,타이타닉_탑승항구_컬럼
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
def do_apply(x):
    """Report ``x.shape`` on stdout, then pass ``x`` through untouched."""
    received_shape = x.shape
    print(received_shape)
    return x
tmp = df2.apply(do_apply,axis=1)

(12,)
(12,)
(12,)
(12,)
(12,)


- groupby의 경우 그룹별로 나뉜 데이터프레임이 함수에 전달된다.

In [None]:
def do_apply(x):
    """Show the shape of the received object and return that same object."""
    shape_seen = x.shape
    print(shape_seen)
    return x
# Each call to do_apply receives the rows of one embarkation-port group as a
# DataFrame; the printed shapes below are (rows in group, columns).
# NOTE(review): newer pandas releases (2.2+) deprecate passing the grouping
# column to the applied function (see the include_groups parameter) — confirm
# against the installed version before relying on the 12-column shape.
tmp = df.groupby("타이타닉_탑승항구_컬럼").apply(do_apply)

(270, 12)
(123, 12)
(914, 12)
