# TEXT MINING for PRACTICE: Pandas
---

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Initial data setup: a 5x5 table of random integers drawn from [0, 100)
columns = ['A', 'B', 'C', 'D', 'E']
data = np.random.randint(0, 100, size=(5, 5))
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,A,B,C,D,E
0,42,13,95,2,3
1,15,25,86,36,14
2,15,29,3,49,97
3,7,43,70,52,96
4,13,79,96,29,67


## 1. 데이터프레임 (Dataframe)

### 1-1. 값 접근

In [3]:
# Select a single column by name (the result is a Series)
df['A']

0    42
1    15
2    15
3     7
4    13
Name: A, dtype: int64

In [4]:
# Select several columns at once by passing a list of column names
df[['A','D','E']]

Unnamed: 0,A,D,E
0,42,2,3
1,15,36,14
2,15,49,97
3,7,52,96
4,13,29,67


In [5]:
# Select a single row by its index label
df.loc[3]


A     7
B    43
C    70
D    52
E    96
Name: 3, dtype: int64

In [7]:
# Select the single element at a given row label and column name
df.loc[3,'A']

7

In [8]:
# Select the elements at the given lists of row labels and column names
df.loc[[3,4],['A','B']]

Unnamed: 0,A,B
3,7,43
4,13,79


In [9]:
# Show only the first n rows
df.head(n=3)

Unnamed: 0,A,B,C,D,E
0,42,13,95,2,3
1,15,25,86,36,14
2,15,29,3,49,97


In [10]:
# Show only the last n rows
df.tail(n=3)

Unnamed: 0,A,B,C,D,E
2,15,29,3,49,97
3,7,43,70,52,96
4,13,79,96,29,67


### 1-2. 조건부 선택

In [11]:
## Which cells hold a value greater than 50? (element-wise True/False result)
df>50


Unnamed: 0,A,B,C,D,E
0,False,False,True,False,False
1,False,False,True,False,False
2,False,False,False,False,True
3,False,False,True,True,True
4,False,True,True,False,True


In [12]:
## Which values are greater than 50? (cells that fail the condition show as NaN)
df[df>50]

Unnamed: 0,A,B,C,D,E
0,,,95.0,,
1,,,86.0,,
2,,,,,97.0
3,,,70.0,52.0,96.0
4,,79.0,96.0,,67.0


In [13]:
## Which rows have a value greater than 50 in a specific column?
df[df['B']>50]

Unnamed: 0,A,B,C,D,E
4,13,79,96,29,67


In [15]:
## Which rows satisfy conditions on several columns at the same time?
## (each condition is parenthesised and combined with &)
df[(df['B']>50)&(df['C']<70)]

Unnamed: 0,A,B,C,D,E


### 1-3. 데이터 생성 삭제 갱신

In [16]:
# Add a new column
df['X'] = [0,0,0,0,0]

In [17]:
df

Unnamed: 0,A,B,C,D,E,X
0,42,13,95,2,3,0
1,15,25,86,36,14,0
2,15,29,3,49,97,0
3,7,43,70,52,96,0
4,13,79,96,29,67,0


In [18]:
# Drop a column (returns a new DataFrame; df itself is not modified)
df.drop('X', axis=1)

Unnamed: 0,A,B,C,D,E
0,42,13,95,2,3
1,15,25,86,36,14
2,15,29,3,49,97
3,7,43,70,52,96
4,13,79,96,29,67


In [19]:
df

Unnamed: 0,A,B,C,D,E,X
0,42,13,95,2,3,0
1,15,25,86,36,14,0
2,15,29,3,49,97,0
3,7,43,70,52,96,0
4,13,79,96,29,67,0


In [20]:
## To delete from the original data, pass inplace=True
df.drop('X', axis=1, inplace=True)

In [21]:
df

Unnamed: 0,A,B,C,D,E
0,42,13,95,2,3
1,15,25,86,36,14
2,15,29,3,49,97
3,7,43,70,52,96
4,13,79,96,29,67


In [22]:
# Add a column of currency codes
df['Index'] = ["NOK","USD","EUR","GBP","KRW"]

In [23]:
df

Unnamed: 0,A,B,C,D,E,Index
0,42,13,95,2,3,NOK
1,15,25,86,36,14,USD
2,15,29,3,49,97,EUR
3,7,43,70,52,96,GBP
4,13,79,96,29,67,KRW


In [24]:
## Replace the default index (0,1,2,3,4) with the currency codes
df.set_index('Index',inplace=True)

In [25]:
df

Unnamed: 0_level_0,A,B,C,D,E
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NOK,42,13,95,2,3
USD,15,25,86,36,14
EUR,15,29,3,49,97
GBP,7,43,70,52,96
KRW,13,79,96,29,67


In [26]:
df.loc['KRW']

A    13
B    79
C    96
D    29
E    67
Name: KRW, dtype: int64

In [27]:
# As with NumPy, basic arithmetic can be applied to the whole table at once
df/10

Unnamed: 0_level_0,A,B,C,D,E
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NOK,4.2,1.3,9.5,0.2,0.3
USD,1.5,2.5,8.6,3.6,1.4
EUR,1.5,2.9,0.3,4.9,9.7
GBP,0.7,4.3,7.0,5.2,9.6
KRW,1.3,7.9,9.6,2.9,6.7


In [28]:
# As with other data structures, a value can be accessed and updated in place.
# Use a single .loc[row, column] call: chained indexing
# (df.loc[row][column] = ...) may assign to a temporary copy and leave df
# unchanged.
df.loc['KRW', 'A'] = 1000

In [29]:
df

Unnamed: 0_level_0,A,B,C,D,E
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NOK,42,13,95,2,3
USD,15,25,86,36,14
EUR,15,29,3,49,97
GBP,7,43,70,52,96
KRW,1000,79,96,29,67


## 2. 데이터 다루기

### 2-1. 빈 데이터 다루기

In [30]:
# Small sample with missing values (NaN) for the examples that follow.
# Keys: '나이' = age, '성별' = gender, '키' = height.
# Height is stored as the number 175 rather than the string '175' so the
# column stays numeric and statistics such as mean() can be computed on it.
sample_data ={
    '나이':[23,np.nan,28],
    '성별':['여성',np.nan, '남성'],
    '키':[np.nan,np.nan,175]
}
df = pd.DataFrame(sample_data)
df

Unnamed: 0,나이,성별,키
0,23.0,여성,
1,,,
2,28.0,남성,175.0


In [31]:
# Drop every row that contains at least one NaN (missing value)
df.dropna()

Unnamed: 0,나이,성별,키
2,28.0,남성,175


In [32]:
# Drop every column that contains at least one NaN (missing value)
df.dropna(axis=1)

0
1
2


In [33]:
# Drop rows with two or more NaN values.
# thresh=2 keeps only rows holding at least 2 non-NaN values; with the three
# columns here that is the same as dropping rows with 2 or more NaN.
df.dropna(thresh=2)

Unnamed: 0,나이,성별,키
0,23.0,여성,
2,28.0,남성,175.0


In [34]:
# Fill every missing value in the gender column ('성별') at once.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['성별'] operates on an intermediate object and may leave df unchanged.
df['성별'] = df['성별'].fillna('기타')
df

Unnamed: 0,나이,성별,키
0,23.0,여성,
1,,기타,
2,28.0,남성,175.0


In [35]:
# Fill missing ages ('나이') with the mean age.
# mean() skips NaN by default, so no explicit dropna() is needed; the result
# is assigned back instead of using a chained fillna(inplace=True), which may
# leave df unchanged.
df['나이'] = df['나이'].fillna(df['나이'].mean())
df

Unnamed: 0,나이,성별,키
0,23.0,여성,
1,25.5,기타,
2,28.0,남성,175.0


In [36]:
# Fill missing heights ('키') with the mean height.
# The column may hold numbers stored as text (e.g. '175'), so convert it to a
# numeric dtype first; otherwise mean() fails on the string values.
df['키'] = pd.to_numeric(df['키'])
# Assign the result back rather than using a chained fillna(inplace=True),
# which may leave df unchanged.
df['키'] = df['키'].fillna(df['키'].mean())
df

Unnamed: 0,나이,성별,키
0,23.0,여성,175.0
1,25.5,기타,175.0
2,28.0,남성,175.0


### 2-2. 고급 선택: GroupBy

In [38]:
## A DataFrame can be created from a dictionary
## Each dictionary key becomes a column
## Keys: '나이' = age, '전공' = major, '성별' = gender, '학년' = school year,
##       '키' = height, '자퇴' = dropped out
sample_data ={
    '나이':[23,25,28,27,23,24],
    '전공':['빅데이터','빅데이터', '경영','경영','경제','기계'],
    '성별':['남','여', '여','남','여','남'],
    '학년':[1,1,2,1,3,2],
    '키':[170,165,175,178,170,176],
    '자퇴':[True,True,False,False,False,True]
}
df = pd.DataFrame(sample_data)
df

Unnamed: 0,나이,전공,성별,학년,키,자퇴
0,23,빅데이터,남,1,170,True
1,25,빅데이터,여,1,165,True
2,28,경영,여,2,175,False
3,27,경영,남,1,178,False
4,23,경제,여,3,170,False
5,24,기계,남,2,176,True


In [39]:
## Median school year, grouped by dropout status
df[['학년','자퇴']].groupby('자퇴').median()

Unnamed: 0_level_0,학년
자퇴,Unnamed: 1_level_1
False,2.0
True,1.0


In [40]:
## Dropout rate per school year
## (the mean of a True/False column is the fraction of True values)
df[['학년','자퇴']].groupby('학년').mean()


Unnamed: 0_level_0,자퇴
학년,Unnamed: 1_level_1
1,0.666667
2,0.5
3,0.0


In [41]:
## An example of a meaningless statistic: mean height by dropout status
df[['키','자퇴']].groupby('자퇴').mean()

Unnamed: 0_level_0,키
자퇴,Unnamed: 1_level_1
False,174.333333
True,170.333333


In [42]:
## The maximum of every column can be taken per group as well:
## maximum within each dropout-status group (text is compared in alphabetical order)
df.groupby('자퇴').max()

Unnamed: 0_level_0,나이,전공,성별,학년,키
자퇴,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,28,경제,여,3,178
True,25,빅데이터,여,2,176


In [43]:
## Minimum of every column within each dropout-status group
df.groupby('자퇴').min()

Unnamed: 0_level_0,나이,전공,성별,학년,키
자퇴,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,23,경영,남,1,170
True,23,기계,남,1,165


In [44]:
## Want several summary statistics at once?
## describe() per group, transposed so that each group becomes a column
df.groupby('자퇴').describe().transpose()

Unnamed: 0,자퇴,False,True
나이,count,3.0,3.0
나이,mean,26.0,24.0
나이,std,2.645751,1.0
나이,min,23.0,23.0
나이,25%,25.0,23.5
나이,50%,27.0,24.0
나이,75%,27.5,24.5
나이,max,28.0,25.0
학년,count,3.0,3.0
학년,mean,2.0,1.333333


### 2-3. 데이터 프레임간의 조작

In [45]:
# Two frames with the same columns (A, B, C) but disjoint row labels:
# df1 holds items 1-4 on rows 0-3, df2 holds items 5-8 on rows 4-7.
dataset_A = {col: [f'{col}{i}' for i in range(1, 5)] for col in 'ABC'}
dataset_B = {col: [f'{col}{i}' for i in range(5, 9)] for col in 'ABC'}
df1 = pd.DataFrame(dataset_A, index=[0, 1, 2, 3])
df2 = pd.DataFrame(dataset_B, index=[4, 5, 6, 7])

In [46]:
df1

Unnamed: 0,A,B,C
0,A1,B1,C1
1,A2,B2,C2
2,A3,B3,C3
3,A4,B4,C4


In [47]:
df2

Unnamed: 0,A,B,C
4,A5,B5,C5
5,A6,B6,C6
6,A7,B7,C7
7,A8,B8,C8


In [48]:
# Concatenate by matching column names (rows of df2 are appended below df1)
pd.concat([df1,df2])

Unnamed: 0,A,B,C
0,A1,B1,C1
1,A2,B2,C2
2,A3,B3,C3
3,A4,B4,C4
4,A5,B5,C5
5,A6,B6,C6
6,A7,B7,C7
7,A8,B8,C8


In [49]:
# Concatenate by matching row labels (columns are placed side by side;
# row labels missing from one frame are filled with NaN)
pd.concat([df1,df2],axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,A1,B1,C1,,,
1,A2,B2,C2,,,
2,A3,B3,C3,,,
3,A4,B4,C4,,,
4,,,,A5,B5,C5
5,,,,A6,B6,C6
6,,,,A7,B7,C7
7,,,,A8,B8,C8


In [53]:
# Rebuild df2 with different columns (D, E, F) on the same row labels 0-3
dataset_B = {col: [f'{col}{i}' for i in range(5, 9)] for col in 'DEF'}
df2 = pd.DataFrame(dataset_B, index=[0, 1, 2, 3])

df2

Unnamed: 0,D,E,F
0,D5,E5,F5
1,D6,E6,F6
2,D7,E7,F7
3,D8,E8,F8


In [54]:
# Concatenate by matching row labels (both frames now share labels 0-3)
pd.concat([df1,df2],axis=1)

Unnamed: 0,A,B,C,D,E,F
0,A1,B1,C1,D5,E5,F5
1,A2,B2,C2,D6,E6,F6
2,A3,B3,C3,D7,E7,F7
3,A4,B4,C4,D8,E8,F8


In [55]:
# Two frames sharing a student-ID column ('학번'):
# df1 holds grades ('평점') for two students,
# df2 holds age ('나이') and gender ('성별') for four students.
dataset_A = {
    '학번':['201701','201702'],
    '평점':['A','B'],
}
dataset_B ={
    '학번':['201701','201702','201801','201802'],
    '나이':[20,19,21,20],
    '성별':['F','M','F','M'],
}
df1 = pd.DataFrame(dataset_A)
df2 = pd.DataFrame(dataset_B)

In [56]:
df1

Unnamed: 0,학번,평점
0,201701,A
1,201702,B


In [57]:
df2

Unnamed: 0,학번,나이,성별
0,201701,20,F
1,201702,19,M
2,201801,21,F
3,201802,20,M


In [58]:
# Intersection: keep only student IDs present in both frames
pd.merge(df1,df2,how='inner',on='학번')

Unnamed: 0,학번,평점,나이,성별
0,201701,A,20,F
1,201702,B,19,M


In [59]:
# Union: keep student IDs present in either frame
pd.merge(df1,df2,how='outer',on='학번')

Unnamed: 0,학번,평점,나이,성별
0,201701,A,20,F
1,201702,B,19,M
2,201801,,21,F
3,201802,,20,M


In [60]:
# Based on the left DataFrame (df1)
pd.merge(df1,df2,how='left',on='학번')

Unnamed: 0,학번,평점,나이,성별
0,201701,A,20,F
1,201702,B,19,M


In [61]:
# Based on the right DataFrame (df2)
pd.merge(df1,df2,how='right',on='학번')

Unnamed: 0,학번,평점,나이,성별
0,201701,A,20,F
1,201702,B,19,M
2,201801,,21,F
3,201802,,20,M


In [62]:
# Using join, with df1 as the calling (left) frame.
# Rows are matched on the index rather than on '학번', so both '학번'
# columns are kept and told apart by the suffixes.
df1.join(df2,lsuffix='_df1', rsuffix='_df2')

Unnamed: 0,학번_df1,평점,학번_df2,나이,성별
0,201701,A,201701,20,F
1,201702,B,201702,19,M


In [63]:
# Using join, with df2 as the calling (left) frame: every row of df2 is kept
# and rows without a matching index in df1 are filled with NaN.
df2.join(df1,lsuffix='_df2', rsuffix='_df1')

Unnamed: 0,학번_df2,나이,성별,학번_df1,평점
0,201701,20,F,201701.0,A
1,201702,19,M,201702.0,B
2,201801,21,F,,
3,201802,20,M,,


### 2-4. 그외 유용한 기능

In [64]:
# Sample table of ten people: name, net worth, age, nationality and a list of
# companies (the last column holds Python lists, one per row)
df = pd.DataFrame([
['Jeff Bezos',112.4,55, 'United States',['Amazon']],
['Bill Gates',103.3,64, 'United States',['Microsoft']],
['Bernard Arnault',95.4,70, 'France',['LVMH']],
['Warren Buffett',79.5,88, 'United States',['Berkshire Hathaway']],
['Amancio Ortega',67.7 ,83, 'Spain', ['Zara']],
['Mark Zuckerberg',67.6,35, 'United States',['Facebook']],
['Larry Ellison',64.8,75, 'United States',['Oracle Corporation']],
['Michael Bloomberg',56.1,76, 'United States',['Bloomberg']],
['Larry Page',54.8,46,'United States',['Google']],
['Carlos Slim',54.3,79, 'Mexico',['América Móvil', 'Grupo Carso']],
], columns=['Name',"Net Worth","Age","Nationality","Companies"])

In [65]:
df

Unnamed: 0,Name,Net Worth,Age,Nationality,Companies
0,Jeff Bezos,112.4,55,United States,[Amazon]
1,Bill Gates,103.3,64,United States,[Microsoft]
2,Bernard Arnault,95.4,70,France,[LVMH]
3,Warren Buffett,79.5,88,United States,[Berkshire Hathaway]
4,Amancio Ortega,67.7,83,Spain,[Zara]
5,Mark Zuckerberg,67.6,35,United States,[Facebook]
6,Larry Ellison,64.8,75,United States,[Oracle Corporation]
7,Michael Bloomberg,56.1,76,United States,[Bloomberg]
8,Larry Page,54.8,46,United States,[Google]
9,Carlos Slim,54.3,79,Mexico,"[América Móvil, Grupo Carso]"


In [66]:
# Check which distinct nationalities appear
df['Nationality'].unique()

array(['United States', 'France', 'Spain', 'Mexico'], dtype=object)

In [70]:
df['Nationality'].values

array(['United States', 'United States', 'France', 'United States',
       'Spain', 'United States', 'United States', 'United States',
       'United States', 'Mexico'], dtype=object)

In [71]:
set(df['Nationality'].values)

{'France', 'Mexico', 'Spain', 'United States'}

In [72]:
# How many people are there per country?
df['Nationality'].value_counts()

United States    7
France           1
Spain            1
Mexico           1
Name: Nationality, dtype: int64

In [75]:
df

Unnamed: 0,Name,Net Worth,Age,Nationality,Companies
0,Jeff Bezos,112.4,55,United States,[Amazon]
1,Bill Gates,103.3,64,United States,[Microsoft]
2,Bernard Arnault,95.4,70,France,[LVMH]
3,Warren Buffett,79.5,88,United States,[Berkshire Hathaway]
4,Amancio Ortega,67.7,83,Spain,[Zara]
5,Mark Zuckerberg,67.6,35,United States,[Facebook]
6,Larry Ellison,64.8,75,United States,[Oracle Corporation]
7,Michael Bloomberg,56.1,76,United States,[Bloomberg]
8,Larry Page,54.8,46,United States,[Google]
9,Carlos Slim,54.3,79,Mexico,"[América Móvil, Grupo Carso]"


In [73]:
# New values can be derived from existing ones by applying a function
def billion(x):
    """Return *x* multiplied by one billion (10**9)."""
    one_billion = 10 ** 9
    return x * one_billion

df['Net Worth'].apply(billion)

0    1.124000e+11
1    1.033000e+11
2    9.540000e+10
3    7.950000e+10
4    6.770000e+10
5    6.760000e+10
6    6.480000e+10
7    5.610000e+10
8    5.480000e+10
9    5.430000e+10
Name: Net Worth, dtype: float64

In [76]:
# Same computation written inline with a lambda
df['Net Worth'].apply(lambda x : x*10**9)

0    1.124000e+11
1    1.033000e+11
2    9.540000e+10
3    7.950000e+10
4    6.770000e+10
5    6.760000e+10
6    6.480000e+10
7    5.610000e+10
8    5.480000e+10
9    5.430000e+10
Name: Net Worth, dtype: float64

In [74]:
# Number of companies per person (length of each list in the column)
df['Companies'].apply(len)

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    2
Name: Companies, dtype: int64

In [77]:
## Mean age
df['Age'].mean()

67.1

In [78]:
## Total net worth
df['Net Worth'].sum()

755.9

In [79]:
## Sort by age (ascending, as the default)
df.sort_values(by='Age')

Unnamed: 0,Name,Net Worth,Age,Nationality,Companies
5,Mark Zuckerberg,67.6,35,United States,[Facebook]
8,Larry Page,54.8,46,United States,[Google]
0,Jeff Bezos,112.4,55,United States,[Amazon]
1,Bill Gates,103.3,64,United States,[Microsoft]
2,Bernard Arnault,95.4,70,France,[LVMH]
6,Larry Ellison,64.8,75,United States,[Oracle Corporation]
7,Michael Bloomberg,56.1,76,United States,[Bloomberg]
9,Carlos Slim,54.3,79,Mexico,"[América Móvil, Grupo Carso]"
4,Amancio Ortega,67.7,83,Spain,[Zara]
3,Warren Buffett,79.5,88,United States,[Berkshire Hathaway]


## 3. 데이터 읽기/쓰기

In [80]:
%pwd

'/content'

In [81]:
# Read an Excel file: sheet 'Sheet1', using the first column as the row index
df = pd.read_excel('sample.xlsx',sheet_name='Sheet1', index_col=[0])
df

Unnamed: 0,한국,중국,독일
1월,22000,44000,23000
2월,15000,30000,25000
3월,16000,32000,26000
4월,14000,28000,24000
5월,13000,26000,43000
6월,19000,40000,59000
7월,12000,50000,32000
8월,13000,66000,23000
9월,10000,99000,40000
10월,11000,111000,31000


In [82]:
# Convert to a JSON string
# To save to a file instead: df.to_json("df.json")
df.to_json()

'{"\\ud55c\\uad6d":{"1\\uc6d4":22000,"2\\uc6d4":15000,"3\\uc6d4":16000,"4\\uc6d4":14000,"5\\uc6d4":13000,"6\\uc6d4":19000,"7\\uc6d4":12000,"8\\uc6d4":13000,"9\\uc6d4":10000,"10\\uc6d4":11000,"11\\uc6d4":12000,"12\\uc6d4":22000,"\\ucd1d":179000},"\\uc911\\uad6d":{"1\\uc6d4":44000,"2\\uc6d4":30000,"3\\uc6d4":32000,"4\\uc6d4":28000,"5\\uc6d4":26000,"6\\uc6d4":40000,"7\\uc6d4":50000,"8\\uc6d4":66000,"9\\uc6d4":99000,"10\\uc6d4":111000,"11\\uc6d4":122000,"12\\uc6d4":122000,"\\ucd1d":770000},"\\ub3c5\\uc77c":{"1\\uc6d4":23000,"2\\uc6d4":25000,"3\\uc6d4":26000,"4\\uc6d4":24000,"5\\uc6d4":43000,"6\\uc6d4":59000,"7\\uc6d4":32000,"8\\uc6d4":23000,"9\\uc6d4":40000,"10\\uc6d4":31000,"11\\uc6d4":22000,"12\\uc6d4":42000,"\\ucd1d":390000}}'

In [83]:
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>한국</th>\n      <th>중국</th>\n      <th>독일</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1월</th>\n      <td>22000</td>\n      <td>44000</td>\n      <td>23000</td>\n    </tr>\n    <tr>\n      <th>2월</th>\n      <td>15000</td>\n      <td>30000</td>\n      <td>25000</td>\n    </tr>\n    <tr>\n      <th>3월</th>\n      <td>16000</td>\n      <td>32000</td>\n      <td>26000</td>\n    </tr>\n    <tr>\n      <th>4월</th>\n      <td>14000</td>\n      <td>28000</td>\n      <td>24000</td>\n    </tr>\n    <tr>\n      <th>5월</th>\n      <td>13000</td>\n      <td>26000</td>\n      <td>43000</td>\n    </tr>\n    <tr>\n      <th>6월</th>\n      <td>19000</td>\n      <td>40000</td>\n      <td>59000</td>\n    </tr>\n    <tr>\n      <th>7월</th>\n      <td>12000</td>\n      <td>50000</td>\n      <td>32000</td>\n    </tr>\n    <tr>\n      <th>8월</th>\n      <td>13000</td>\n      <td>66

In [84]:
df.to_latex()

  df.to_latex()


'\\begin{tabular}{lrrr}\n\\toprule\n{} &      한국 &      중국 &      독일 \\\\\n\\midrule\n1월  &   22000 &   44000 &   23000 \\\\\n2월  &   15000 &   30000 &   25000 \\\\\n3월  &   16000 &   32000 &   26000 \\\\\n4월  &   14000 &   28000 &   24000 \\\\\n5월  &   13000 &   26000 &   43000 \\\\\n6월  &   19000 &   40000 &   59000 \\\\\n7월  &   12000 &   50000 &   32000 \\\\\n8월  &   13000 &   66000 &   23000 \\\\\n9월  &   10000 &   99000 &   40000 \\\\\n10월 &   11000 &  111000 &   31000 \\\\\n11월 &   12000 &  122000 &   22000 \\\\\n12월 &   22000 &  122000 &   42000 \\\\\n총   &  179000 &  770000 &  390000 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [85]:
df = pd.read_html("https://ko.wikipedia.org/wiki/%EC%98%81%ED%99%94_%EB%A7%A4%EC%B6%9C_%EC%88%9C%EC%9C%84_%EB%AA%A9%EB%A1%9D")

In [None]:
df

In [87]:
import pandas as pd
# read_html returns a list of DataFrames, one per table found on the page,
# so df here is a list rather than a single DataFrame
df = pd.read_html('https://ko.wikipedia.org/wiki/%EC%98%81%ED%99%94_%EB%A7%A4%EC%B6%9C_%EC%88%9C%EC%9C%84_%EB%AA%A9%EB%A1%9D')
# Show the first table
df[0]

Unnamed: 0,순위,최고,영화 제목,전 세계 수익($),연도,국가,출처
0,1,1,아바타,"$2,923,706,026",2009,,[2][# 1]
1,2,1,어벤져스: 엔드게임,"$2,797,501,328",2019,,[# 2][# 3]
2,3,3,아바타: 물의 길,"$2,318,567,513",2022,,[# 4]
3,4,1,타이타닉,"T$2,257,263,602",1997,,[3][# 5]
4,5,3,스타워즈: 깨어난 포스,"$2,068,223,624",2015,,[# 6][# 7]
5,6,4,어벤져스: 인피니티 워,"$2,048,359,754",2018,,[# 8][# 9]
6,7,6,스파이더맨: 노 웨이 홈,"$1,921,847,111",2021,,[# 10][# 11]
7,8,3,쥬라기 월드,"$1,671,537,444",2015,,[# 12][# 13]
8,9,7,라이온 킹,"$1,656,943,394",2019,,[# 14][# 3]
9,10,3,어벤져스,"$1,518,815,515",2012,,[# 15][# 16]
