## 넘파이 이론

In [None]:
import numpy as np

# 1차원 배열 생성
array1 = np.array([1, 2, 3])

# 2차원 배열 생성
array2 = np.array([[1, 2, 3],
                   [4, 5, 6]])


In [None]:
sequence_array = np.arange(10)  ## 얘는 10행 0열??
print(sequence_array)
print(sequence_array.dtype, sequence_array.shape)


[0 1 2 3 4 5 6 7 8 9]
int64 (10,)


In [3]:
list1 = [1, 2, 3]
print(type(list1))

array1 = np.array(list1)
print(type(array1))
print(array1, array1.dtype)


<class 'list'>
<class 'numpy.ndarray'>
[1 2 3] int64


In [4]:
zero_array = np.zeros((3,2), dtype = 'int32')
zero_array

array([[0, 0],
       [0, 0],
       [0, 0]], dtype=int32)

In [5]:
array2 = np.array([[1, 2, 3],
                   [4, 5, 6]])

print("array2:")
print(array2)
print("Shape:", array2.shape)
print("차원 수 (ndim):", array2.ndim)


array2:
[[1 2 3]
 [4 5 6]]
Shape: (2, 3)
차원 수 (ndim): 2


axis = 0은 가장 바깥쪽이다.
따라서 2차원의 경우 axis = 0은 컬럼을 의미한다. -> [0]을 찍으면 첫번째 열의 로우라는 의미
(1차원일 때는 로우) -> 인덱싱[0]을 찍으면 로우의 첫번째 값이 나옴

shape을 찍을 때 나오는 (2,3)의 [0]인 2는 로우가 몇 줄인지와 열의 길이를 의미한다

In [None]:
array1 = np.arange(20)
print('array1:\n', array1)

# (2, 5)로 reshape
array2 = array1.reshape(4, 5)
print('array2:\n', array2)

# (5, 2)로 reshape
array3 = array1.reshape(2, 10)
print('array3:\n', array3)

# (4, 3)으로 reshape 시도 - 오류 발생
# array1.reshape(4, 3)  # ValueError 발생

array1:
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
array2:
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]
array3:
 [[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]]


In [18]:
array1 = np.arange(10)

# 자동 계산 가능한 경우
array2 = array1.reshape(-1, 5)
print('array2 shape:', array2.shape)

array3 = array1.reshape(5, -1)
print('array3 shape:', array3.shape)

# 자동 계산 불가능한 경우 - 오류 발생
# array4 = array1.reshape(-1, 4)  # ValueError 발생

array2 shape: (2, 5)
array3 shape: (5, 2)


## Pandas와 Numpy

In [3]:
import numpy as np
import pandas as pd

s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [4]:
dates = pd.date_range("20250912", periods=6)
print(dates)

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
print(df)


DatetimeIndex(['2025-09-12', '2025-09-13', '2025-09-14', '2025-09-15',
               '2025-09-16', '2025-09-17'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2025-09-12  1.210527  0.765174  2.179116 -0.815430
2025-09-13 -0.899673 -1.217391 -0.862354  1.187439
2025-09-14  0.326929  1.170700  1.490924 -1.387816
2025-09-15 -2.799538 -0.539676  1.456105  0.438374
2025-09-16 -2.178622  0.075266  0.507859 -0.734511
2025-09-17  1.585036  1.934895  0.442659 -0.381014


In [5]:
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="int32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo"
})
print(df2)


     A          B  C  D      E    F
0  1.0 2013-01-02  1  3   test  foo
1  1.0 2013-01-02  1  3  train  foo
2  1.0 2013-01-02  1  3   test  foo
3  1.0 2013-01-02  1  3  train  foo


In [6]:
print(df2.dtypes)


A          float64
B    datetime64[s]
C            int32
D            int32
E         category
F           object
dtype: object


In [7]:
print(df.head())     # 상위 5개 행
print(df.tail(3))    # 하위 3개 행


                   A         B         C         D
2025-09-12  1.210527  0.765174  2.179116 -0.815430
2025-09-13 -0.899673 -1.217391 -0.862354  1.187439
2025-09-14  0.326929  1.170700  1.490924 -1.387816
2025-09-15 -2.799538 -0.539676  1.456105  0.438374
2025-09-16 -2.178622  0.075266  0.507859 -0.734511
                   A         B         C         D
2025-09-15 -2.799538 -0.539676  1.456105  0.438374
2025-09-16 -2.178622  0.075266  0.507859 -0.734511
2025-09-17  1.585036  1.934895  0.442659 -0.381014


In [8]:
print(df.index)
print(df.columns)


DatetimeIndex(['2025-09-12', '2025-09-13', '2025-09-14', '2025-09-15',
               '2025-09-16', '2025-09-17'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [9]:
print(df.to_numpy())


[[ 1.21052675  0.76517359  2.17911643 -0.81542992]
 [-0.89967338 -1.21739147 -0.86235369  1.18743871]
 [ 0.32692948  1.17069958  1.4909244  -1.38781569]
 [-2.79953822 -0.5396762   1.45610516  0.4383741 ]
 [-2.1786216   0.07526634  0.50785875 -0.73451102]
 [ 1.58503601  1.93489496  0.44265886 -0.38101402]]


In [10]:
print(df.describe())


              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.459223  0.364828  0.869052 -0.282160
std    1.800694  1.155873  1.073095  0.938098
min   -2.799538 -1.217391 -0.862354 -1.387816
25%   -1.858885 -0.385941  0.458959 -0.795200
50%   -0.286372  0.420220  0.981982 -0.557763
75%    0.989627  1.069318  1.482220  0.233527
max    1.585036  1.934895  2.179116  1.187439


In [11]:
print(df.T)


   2025-09-12  2025-09-13  2025-09-14  2025-09-15  2025-09-16  2025-09-17
A    1.210527   -0.899673    0.326929   -2.799538   -2.178622    1.585036
B    0.765174   -1.217391    1.170700   -0.539676    0.075266    1.934895
C    2.179116   -0.862354    1.490924    1.456105    0.507859    0.442659
D   -0.815430    1.187439   -1.387816    0.438374   -0.734511   -0.381014


In [12]:
print(df["A"])

2025-09-12    1.210527
2025-09-13   -0.899673
2025-09-14    0.326929
2025-09-15   -2.799538
2025-09-16   -2.178622
2025-09-17    1.585036
Freq: D, Name: A, dtype: float64


In [13]:
df[0:3]

Unnamed: 0,A,B,C,D
2025-09-12,1.210527,0.765174,2.179116,-0.81543
2025-09-13,-0.899673,-1.217391,-0.862354,1.187439
2025-09-14,0.326929,1.1707,1.490924,-1.387816


In [14]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D


In [15]:
df.loc["20130102":"20130104", ["A", "B"]]


Unnamed: 0,A,B


In [20]:

df.at[dates[0], "A"]        # 가장 빠른 방식


np.float64(1.2105267469191692)

In [21]:
df.loc[dates[0], "A"]       # 위치 기반 접근

np.float64(1.2105267469191692)

In [18]:
df.iloc[3]                  # 4번째 행 전체

df.iloc[[1, 2, 4], [0, 2]]  # 특정 행/열 조합
df.iloc[1:3, :]             # 2~3번째 행, 전체 열
df.iloc[:, 1:3]             # 전체 행, 2~3번째 열
df.iloc[1, 1]               # 2행 2열 값
df.iat[1, 1]                # 빠른 2행 2열 접근


np.float64(-1.2173914652342623)

In [19]:
df.iloc[3:5, 0:2]           # 4~5번째 행, 1~2번째 열

Unnamed: 0,A,B
2025-09-15,-2.799538,-0.539676
2025-09-16,-2.178622,0.075266


In [22]:
df[df["A"] > 0]             # A 열이 0보다 큰 행만

df[df > 0]                 # 전체 DataFrame에서 양수만 표시, 나머지는 NaN


Unnamed: 0,A,B,C,D
2025-09-12,1.210527,0.765174,2.179116,
2025-09-13,,,,1.187439
2025-09-14,0.326929,1.1707,1.490924,
2025-09-15,,,1.456105,0.438374
2025-09-16,,0.075266,0.507859,
2025-09-17,1.585036,1.934895,0.442659,


In [23]:
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]

df2[df2["E"].isin(["two", "four"])]


Unnamed: 0,A,B,C,D,E
2025-09-14,0.326929,1.1707,1.490924,-1.387816,two
2025-09-16,-2.178622,0.075266,0.507859,-0.734511,four


In [24]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1], "E"] = 1
print(df1)


                   A         B         C         D    E
2025-09-12  1.210527  0.765174  2.179116 -0.815430  1.0
2025-09-13 -0.899673 -1.217391 -0.862354  1.187439  1.0
2025-09-14  0.326929  1.170700  1.490924 -1.387816  NaN
2025-09-15 -2.799538 -0.539676  1.456105  0.438374  NaN


In [25]:
# 결측치가 있는 모든 행 제거
df1.dropna(how="any")

# 결측치를 5로 채우기
df1.fillna(value=5)


Unnamed: 0,A,B,C,D,E
2025-09-12,1.210527,0.765174,2.179116,-0.81543,1.0
2025-09-13,-0.899673,-1.217391,-0.862354,1.187439,1.0
2025-09-14,0.326929,1.1707,1.490924,-1.387816,5.0
2025-09-15,-2.799538,-0.539676,1.456105,0.438374,5.0


In [26]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2025-09-12,False,False,False,False,False
2025-09-13,False,False,False,False,False
2025-09-14,False,False,False,False,True
2025-09-15,False,False,False,False,True


## 데이터 처리와 집계

In [1]:
import pandas as pd

# Titanic 데이터 CSV 파일 로드
titanic_df = pd.read_csv("../1_pandas_basic/data/titanic.csv")

# DataFrame 출력
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [2]:
# 이름 기준 오름차순 정렬
titanic_sorted = titanic_df.sort_values(by=['Name'])
titanic_sorted.head(3)

# Pclass와 Name 기준 내림차순 정렬
titanic_sorted = titanic_df.sort_values(by=['Pclass', 'Name'], ascending=False)
titanic_sorted.head(3)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S
282,283,0,3,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5,,S


In [4]:
print(titanic_df['Pclass'].nunique())
print(titanic_df['Survived'].nunique())
print(titanic_df['Name'].nunique())

print(titanic_df['Pclass'].unique())

3
2
891
[3 1 2]


In [5]:
titanic_df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [None]:


# 합계
titanic_df[['Age', 'Fare']].sum()

# 최솟값
titanic_df[['Age', 'Fare']].min()


Age     0.42
Fare    0.00
dtype: float64

In [8]:
# 평균
titanic_df[['Age', 'Fare']].mean()

Age     29.699118
Fare    32.204208
dtype: float64

In [10]:
# groupby 객체 생성
titanic_groupby = titanic_df.groupby('Pclass')

# Age와 Fare에 대해 count
titanic_groupby[['Age', 'Fare']].mean()


Unnamed: 0_level_0,Age,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.233441,84.154687
2,29.87763,20.662183
3,25.14062,13.67555


In [11]:
# 최대값과 최소값을 나란히 출력
titanic_df.groupby('Pclass')['Age'].max(), titanic_df.groupby('Pclass')['Age'].min()


(Pclass
 1    80.0
 2    70.0
 3    74.0
 Name: Age, dtype: float64,
 Pclass
 1    0.92
 2    0.67
 3    0.42
 Name: Age, dtype: float64)

In [12]:
# max, min 함께 보기
titanic_df.groupby('Pclass')['Age'].agg([max, min])


  titanic_df.groupby('Pclass')['Age'].agg([max, min])
  titanic_df.groupby('Pclass')['Age'].agg([max, min])


Unnamed: 0_level_0,max,min
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80.0,0.92
2,70.0,0.67
3,74.0,0.42


In [13]:
titanic_df.groupby(['Pclass']).agg(
    age_max=('Age', 'max'),
    age_mean=('Age', 'mean'),
    fare_mean=('Fare', 'mean')
)


Unnamed: 0_level_0,age_max,age_mean,fare_mean
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.0,38.233441,84.154687
2,70.0,29.87763,20.662183
3,74.0,25.14062,13.67555


In [18]:
agg_format = {
    'Age': 'max',
    'SibSp': 'sum',
    'Fare': 'mean'
}
titanic_df.groupby('Pclass').agg(agg_format)


Unnamed: 0_level_0,Age,SibSp,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.0,90,84.154687
2,70.0,74,20.662183
3,74.0,302,13.67555


In [19]:
# 이름의 길이 계산
titanic_df['Name_len'] = titanic_df['Name'].apply(lambda x: len(x))
titanic_df[['Name', 'Name_len']].head(3)


Unnamed: 0,Name,Name_len
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22


In [32]:
# 나이 기준으로 아동/성인 구분
titanic_df['Child_Adult'] = titanic_df['Age'].apply(lambda x: 'Child' if x <= 15 else 'Adult')
titanic_df[['Age', 'Child_Adult']].sample(8)


Unnamed: 0,Age,Child_Adult
453,49.0,Adult
469,0.75,Child
748,19.0,Adult
883,28.0,Adult
637,31.0,Adult
381,1.0,Child
712,48.0,Adult
209,40.0,Adult


In [33]:
def categorize_age(age):
    """
    나이에 따라 연령대를 분류하는 함수
    """
    if age <= 5:
        return 'Baby'
    elif age <= 12:
        return 'Child'
    elif age <= 18:
        return 'Teenager'
    elif age <= 25:
        return 'Student'
    elif age <= 35:
        return 'Young Adult'
    elif age <= 60:
        return 'Adult'
    else:
        return 'Elderly'

# 적용 및 확인
titanic_df['Age_cate'] = titanic_df['Age'].apply(categorize_age)
titanic_df[['Age', 'Age_cate']].head()


Unnamed: 0,Age,Age_cate
0,22.0,Student
1,38.0,Adult
2,26.0,Young Adult
3,35.0,Young Adult
4,35.0,Young Adult


## 데이터 병합 및 변환 이론

In [34]:
import pandas as pd

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                    'B': ['B0', 'B1', 'B2']})

df2 = pd.DataFrame({'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df3 = pd.concat([df1, df2], axis=0)  # 행 병합
df4 = pd.concat([df1, df2], axis=1)  # 열 병합

print(df1)
print('\n')
print(df2)
print('\n')
print(df3)
print('\n')
print(df4)


    A   B
0  A0  B0
1  A1  B1
2  A2  B2


    C   D
0  C0  D0
1  C1  D1
2  C2  D2


     A    B    C    D
0   A0   B0  NaN  NaN
1   A1   B1  NaN  NaN
2   A2   B2  NaN  NaN
0  NaN  NaN   C0   D0
1  NaN  NaN   C1   D1
2  NaN  NaN   C2   D2


    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2


In [35]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='inner')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3


  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2


  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2


In [36]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='outer')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3


  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2


  key   A   B    C    D
0  K0  A0  B0   C0   D0
1  K1  A1  B1   C1   D1
2  K2  A2  B2   C2   D2
3  K3  A3  B3  NaN  NaN


In [37]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='left')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3


  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2


  key   A   B    C    D
0  K0  A0  B0   C0   D0
1  K1  A1  B1   C1   D1
2  K2  A2  B2   C2   D2
3  K3  A3  B3  NaN  NaN


In [38]:
import pandas as pd

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})

df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'C': ['C0', 'C1', 'C2'],
                    'D': ['D0', 'D1', 'D2']})

df_merged = pd.merge(df1, df2, on='key', how='right')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3


  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2


  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2


In [39]:
import pandas as pd

df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2']
}, index=['K0', 'K1', 'K2'])

df2 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
}, index=['K0', 'K2', 'K3'])

df_merged = df1.join(df2, how='left')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2


     C   D
K0  C0  D0
K2  C1  D1
K3  C2  D2


     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C1   D1


In [40]:
sales = pd.DataFrame({
    'customer_id': [1, 2, 3, 4],
    'product_id': [101, 102, 103, 104],
    'quantity': [5, 2, 3, 1]
})

customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'city': ['Seoul', 'Busan', 'Daegu', 'Incheon']
})

sales = sales.set_index('customer_id')
customers = customers.set_index('customer_id')

merged_data = sales.join(customers, how='left')

print(sales)
print('\n')
print(customers)
print('\n')
print(merged_data)


             product_id  quantity
customer_id                      
1                   101         5
2                   102         2
3                   103         3
4                   104         1


                name     city
customer_id                  
1              Alice    Seoul
2                Bob    Busan
3            Charlie    Daegu
5              David  Incheon


             product_id  quantity     name   city
customer_id                                      
1                   101         5    Alice  Seoul
2                   102         2      Bob  Busan
3                   103         3  Charlie  Daegu
4                   104         1      NaN    NaN


In [41]:
df1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'C': ['C0', 'C1', 'C2', 'C3']
})

df2 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'C': ['C4', 'C5', 'C6'],
    'D': ['D0', 'D1', 'D2']
})

df_merged = pd.merge(df1, df2, on='key', how='inner', suffixes=('_left', '_right'))

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   C
0  K0  A0  C0
1  K1  A1  C1
2  K2  A2  C2
3  K3  A3  C3


  key   C   D
0  K0  C4  D0
1  K1  C5  D1
2  K2  C6  D2


  key   A C_left C_right   D
0  K0  A0     C0      C4  D0
1  K1  A1     C1      C5  D1
2  K2  A2     C2      C6  D2


In [42]:
df1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})

df2 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
})

df_merged = pd.merge(df1, df2, on='key', how='inner')
df_merged = df_merged.drop('B', axis=1)

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3


  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2


  key   A   C   D
0  K0  A0  C0  D0
1  K1  A1  C1  D1
2  K2  A2  C2  D2


In [43]:
df1 = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'key2': ['K4', 'K5', 'K6', 'K7'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})

df2 = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2'],
    'key2': ['K4', 'K5', 'K6'],
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
})

df_merged = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')

print(df1)
print('\n')
print(df2)
print('\n')
print(df_merged)


  key1 key2   A   B
0   K0   K4  A0  B0
1   K1   K5  A1  B1
2   K2   K6  A2  B2
3   K3   K7  A3  B3


  key1 key2   C   D
0   K0   K4  C0  D0
1   K1   K5  C1  D1
2   K2   K6  C2  D2


  key1 key2   A   B   C   D
0   K0   K4  A0  B0  C0  D0
1   K1   K5  A1  B1  C1  D1
2   K2   K6  A2  B2  C2  D2


In [44]:
import pandas as pd

# Titanic 데이터 CSV 파일 로드
replace_test_df = pd.read_csv("../1_pandas_basic/data/titanic.csv")

In [45]:
import numpy as np

# 예: Titanic 데이터
replace_test_df['Sex'] = replace_test_df['Sex'].replace({'male': 'Man', 'female': 'Woman'})
print(replace_test_df.head(10))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name    Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    Man  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  Woman  38.0      1   
2                             Heikkinen, Miss. Laina  Woman  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  Woman  35.0      1   
4                           Allen, Mr. William Henry    Man  35.0      0   
5                                   Moran, Mr. James    Man   NaN      0   
6                            McCarthy, Mr. Timothy J    Man  54.0      

In [46]:
# NaN을 특정 값으로 대체
replace_test_df['Cabin'] = replace_test_df['Cabin'].replace(np.nan, 'CXXX')

# Cabin별 값 개수 확인
print(replace_test_df['Cabin'].value_counts(dropna=False))

Cabin
CXXX           687
G6               4
C23 C25 C27      4
B96 B98          4
F2               3
              ... 
E17              1
A24              1
C50              1
B42              1
C148             1
Name: count, Length: 148, dtype: int64
