### 판다스(Pandas)
- 데이터 처리 라이브러리 중 가장 인기 있는 라이브러리이다.
- 2차원 데이터(테이블, 엑셀, CSV 등)를 효율적으로 가공 및 처리할 수 있다.

### 판다스 구성 요소
- DataFrame: 행과 열로 구성된 2차원 Dataset을 의미한다.
- Series: 1개의 열로만 구성된 열벡터 Dataset을 의미한다.
- Index: DataFrame과 Series에서 각 행을 식별하는 레이블을 의미한다. 기본값은 0부터 시작하는 행 번호이며, 보통 중복 없이 사용한다.

In [25]:
import pandas as pd

# Show the installed pandas version; the notebook echoes the value of the
# cell's last expression.
pd.__version__

'2.3.3'

### DataFrame()
- 딕셔너리 등의 데이터로 DataFrame 객체를 생성한다.

In [26]:
import pandas as pd

# Column-oriented source data: every key becomes a column label and every
# list holds that column's values, one entry per row.
film = dict(
    title=['명량', '극한 직업', '범죄 도시3', '국제 시장'],
    audience=[17_615_919, 16_266_480, 10_682_674, 14_265_222],
    country=['한국'] * 4,
)

film_df = pd.DataFrame(data=film)

# display() renders the DataFrame as a table in the notebook output,
# which is why it is used here instead of print().
display(film_df)

Unnamed: 0,title,audience,country
0,명량,17615919,한국
1,극한 직업,16266480,한국
2,범죄 도시3,10682674,한국
3,국제 시장,14265222,한국


In [27]:
# Add a new feature (column): assigning a list to a column label that does
# not exist yet appends it as the last column, one value per existing row.
film_df['income'] = [135_758_658_810, 139_657_105_516, 104_686_489_632, 110_951_970_230]
display(film_df)

Unnamed: 0,title,audience,country,income
0,명량,17615919,한국,135758658810
1,극한 직업,16266480,한국,139657105516
2,범죄 도시3,10682674,한국,104686489632
3,국제 시장,14265222,한국,110951970230


In [28]:
# Replace the row labels (index) with custom ones, one label per row.
film_df.index = ['one', 'two', 'three', 'four']
display(film_df)

Unnamed: 0,title,audience,country,income
one,명량,17615919,한국,135758658810
two,극한 직업,16266480,한국,139657105516
three,범죄 도시3,10682674,한국,104686489632
four,국제 시장,14265222,한국,110951970230


In [29]:
# Reset the index back to consecutive integers starting at 0.
# drop: whether to discard the old index; with drop=False it is kept as a
#       regular column named 'index' (visible in the output of this cell).
# inplace: whether to apply the change to the original DataFrame itself.
film_df.reset_index(drop=False, inplace=True)
film_df

Unnamed: 0,index,title,audience,country,income
0,one,명량,17615919,한국,135758658810
1,two,극한 직업,16266480,한국,139657105516
2,three,범죄 도시3,10682674,한국,104686489632
3,four,국제 시장,14265222,한국,110951970230


In [34]:
# Drop a feature (column): axis=1 targets columns and inplace=True mutates
# film_df directly.
film_df.drop(labels=['index'], axis=1, inplace=True)

# Dropping a row instead (left disabled here):
# film_df.drop(index=[0], axis=0)

film_df

Unnamed: 0,title,audience,country,income
0,명량,17615919,한국,135758658810
1,극한 직업,16266480,한국,139657105516
2,범죄 도시3,10682674,한국,104686489632
3,국제 시장,14265222,한국,110951970230


In [35]:
# Rename a feature (column).
# Mapping format: {'old_name': 'new_name'}
# Without inplace=True, rename() returns a renamed copy; film_df itself
# keeps the 'title' column.
film_df.rename(columns={'title': 'name'})

Unnamed: 0,name,audience,country,income
0,명량,17615919,한국,135758658810
1,극한 직업,16266480,한국,139657105516
2,범죄 도시3,10682674,한국,104686489632
3,국제 시장,14265222,한국,110951970230


### read_csv()
- csv 파일을 DataFrame으로 읽어온다.

In [36]:
import pandas as pd

# Load the CSV into a DataFrame; the path is relative to the current
# working directory.
happiness_df = pd.read_csv('./datasets/happiness_report_2022.csv')
display(happiness_df)

Unnamed: 0,country,score,income
0,Finland,7.821,High income
1,Denmark,7.636,High income
2,Iceland,7.557,High income
3,Switzerland,7.512,High income
4,Netherlands,7.415,High income
...,...,...,...
141,Botswana,3.471,Upper middle income
142,Rwanda,3.268,Low income
143,Zimbabwe,2.995,Lower middle income
144,Lebanon,2.955,Lower middle income


### head()
- 전체 데이터 중 앞부분 일부를 가져온다.

In [38]:
# Return the first 10 rows.
happiness_df.head(10)

Unnamed: 0,country,score,income
0,Finland,7.821,High income
1,Denmark,7.636,High income
2,Iceland,7.557,High income
3,Switzerland,7.512,High income
4,Netherlands,7.415,High income
5,Luxembourg,7.404,High income
6,Sweden,7.384,High income
7,Norway,7.365,High income
8,Israel,7.364,High income
9,New Zealand,7.2,High income


### tail()
- 전체 데이터 중 뒷부분 일부를 가져온다.

In [39]:
# Return the last 10 rows.
happiness_df.tail(10)

Unnamed: 0,country,score,income
136,Zambia,3.76,Low income
137,Malawi,3.75,Low income
138,Tanzania,3.702,Lower middle income
139,Sierra Leone,3.574,Low income
140,Lesotho,3.512,Lower middle income
141,Botswana,3.471,Upper middle income
142,Rwanda,3.268,Low income
143,Zimbabwe,2.995,Lower middle income
144,Lebanon,2.955,Lower middle income
145,Afghanistan,2.404,Low income


### iloc[], loc[]
- 원하는 행 또는 열을 가져온다.
- iloc은 0부터 시작하는 위치(정수 번호)로 가져오고, loc은 인덱스 레이블 또는 컬럼명으로 가져온다.

In [40]:
# Shift every row label up by one so the index starts at 1 instead of 0.
# Positions (iloc) and labels (loc) therefore no longer coincide.
# NOTE: re-running this cell shifts the labels by one again.
happiness_df.index += 1
display(happiness_df)

Unnamed: 0,country,score,income
1,Finland,7.821,High income
2,Denmark,7.636,High income
3,Iceland,7.557,High income
4,Switzerland,7.512,High income
5,Netherlands,7.415,High income
...,...,...,...
142,Botswana,3.471,Upper middle income
143,Rwanda,3.268,Low income
144,Zimbabwe,2.995,Lower middle income
145,Lebanon,2.955,Lower middle income


In [42]:
# iloc is position-based: position 0 is the first row.
print(happiness_df.iloc[0])
# loc is label-based: with the index starting at 1, label 1 is that same row.
print(happiness_df.loc[1])

country        Finland
score            7.821
income     High income
Name: 1, dtype: object
country        Finland
score            7.821
income     High income
Name: 1, dtype: object


In [47]:
# Selecting a single feature (column) yields a Series.
# to_frame() converts that Series back into a DataFrame.

# Print only the last column for every row (position-based selection).
print(happiness_df.iloc[:, -1])
print(type(happiness_df.iloc[:, -1].to_frame()))

# The same column selected by its label.
print(happiness_df.loc[:, 'income'])
print(type(happiness_df.loc[:, 'income'].to_frame()))

# With plain brackets, a single label gives a Series, while a list of labels
# (double brackets) gives a DataFrame.
print(happiness_df['income'])
print(type(happiness_df[['income']]))    # list-of-labels selection; loosely comparable to NumPy fancy indexing

1              High income
2              High income
3              High income
4              High income
5              High income
              ...         
142    Upper middle income
143             Low income
144    Lower middle income
145    Lower middle income
146             Low income
Name: income, Length: 146, dtype: object
<class 'pandas.core.frame.DataFrame'>
1              High income
2              High income
3              High income
4              High income
5              High income
              ...         
142    Upper middle income
143             Low income
144    Lower middle income
145    Lower middle income
146             Low income
Name: income, Length: 146, dtype: object
<class 'pandas.core.frame.DataFrame'>
1              High income
2              High income
3              High income
4              High income
5              High income
              ...         
142    Upper middle income
143             Low income
144    Lower middle income
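145    Lower middle income
146             Low income
Name: income, Length: 146, dtype: object
<class 'pandas.core.frame.DataFrame'>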

In [49]:
# Select several columns at once by passing a list of labels; the result is
# a DataFrame.
happiness_df[['score', 'income']]

Unnamed: 0,score,income
1,7.821,High income
2,7.636,High income
3,7.557,High income
4,7.512,High income
5,7.415,High income
...,...,...
142,3.471,Upper middle income
143,3.268,Low income
144,2.995,Lower middle income
145,2.955,Lower middle income


In [50]:
# Attribute-style access returns the 'score' column as a Series; to_frame()
# wraps it in a one-column DataFrame.
happiness_df.score.to_frame()

Unnamed: 0,score
1,7.821
2,7.636
3,7.557
4,7.512
5,7.415
...,...
142,3.471
143,3.268
144,2.995
145,2.955


In [57]:
# Keep only the rows whose happiness score is strictly below 3, using a
# boolean mask built from the 'score' column.
happiness_df_lt_3 = happiness_df.loc[happiness_df['score'] < 3]
display(happiness_df_lt_3)
# (rows, columns) of the filtered result.
happiness_df_lt_3.shape

Unnamed: 0,country,score,income
144,Zimbabwe,2.995,Lower middle income
145,Lebanon,2.955,Lower middle income
146,Afghanistan,2.404,Low income


(3, 3)

In [63]:
import pandas as pd

# Reload the CSV so happiness_df is a fresh DataFrame read from disk.
happiness_df = pd.read_csv('./datasets/happiness_report_2022.csv')

# Other ways to inspect the structure (left disabled):
# print(happiness_df.columns)
# print(happiness_df.index)
# print(happiness_df.index.values)
# info() prints the index range, per-column non-null counts, dtypes and
# memory usage.
happiness_df.info()
# dtypes is the cell's last expression, so the notebook echoes it after the
# info() printout.
happiness_df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   country  146 non-null    object 
 1   score    146 non-null    float64
 2   income   140 non-null    object 
dtypes: float64(1), object(2)
memory usage: 3.5+ KB


country     object
score      float64
income      object
dtype: object

In [65]:
import numpy as np

# Rounding alternative (left disabled):
# happiness_df.score.apply(lambda x: round(x))

# Cast 'score' to int32; the fractional part is cut off rather than rounded
# (e.g. 7.821 becomes 7). The result is not assigned back, so happiness_df
# itself is left as it was.
happiness_df.astype({'score': np.int32})

Unnamed: 0,country,score,income
0,Finland,7,High income
1,Denmark,7,High income
2,Iceland,7,High income
3,Switzerland,7,High income
4,Netherlands,7,High income
...,...,...,...
141,Botswana,3,Upper middle income
142,Rwanda,3,Low income
143,Zimbabwe,2,Lower middle income
144,Lebanon,2,Lower middle income
