## Learn Pandas
1. 데이터 분석을 위한 모듈
- 데이터 분석 및 가공에 사용되는 활용 빈도가 높은 라이브러리
- Numpy 모듈 기반으로 생성된다.

학습 내용

1. 기초 익히기
- 이미 존재하는 파일의 내용으로 DataFrame 생성하기
- 중복 데이터 제거
- 측정 오류 데이터 제거

2. DataFream Structure
> https://pandas.pydata.org/docs/getting_started/index.html <br>
<img src = "img/DataFrameStructure.png" width = '50%'>


In [4]:
# 필요한 라이브러리 import

import numpy as np
import pandas as pd

In [5]:
!pip show pandas

Name: pandas
Version: 1.0.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: c:\pyth\lib\site-packages
Requires: python-dateutil, pytz, numpy
Required-by: statsmodels, seaborn


기초 익히기

In [6]:
# 시리즈 - 정수값으로 생성 시 자동으로 int64
s = pd.Series([1,2,3])
print(s)
type(s)

0    1
1    2
2    3
dtype: int64


pandas.core.series.Series

In [8]:
print(s.values)

[1 2 3]


In [9]:
s.index

RangeIndex(start=0, stop=3, step=1)

# 결측치를 의미하는 표현
# Java - null // Javascript - undefined, NaN // sql - null // python - NaN
# Python에서 결측치 구성하는 API - np.nan


In [10]:
# 결측치가 포함된 데이터 타입은 float으로 자동 변경
s = pd.Series([1,np.nan,3])
s

0    1.0
1    NaN
2    3.0
dtype: float64

In [11]:
# 결측치는 카운트 되지 않는다
s.count()

2

In [12]:
#존재하는 Row 수만큼 count해야 하는 상황이라 가정
#해결책 : 결측치 값을 유효하지만 무용한 데이터로 변경
s = s.fillna(0)
s

0    1.0
1    0.0
2    3.0
dtype: float64

In [13]:
# 결측치 자체를 삭제
s = pd.Series([1,np.nan,3])
s = s.dropna()
s

0    1.0
2    3.0
dtype: float64

특정 날짜를 기준으로 자동으로 날짜 증가시켜 DataFrame 생성

In [14]:
datas = pd.date_range('20200915',periods = 6)
datas
dates = ['Mon','Tue','Thur','Fri']

In [15]:
# 난수 발생 API
# 정수 난수 혹은 실수 난수 발생 여부 선택
np.random.randint(3)

0

In [16]:
np.random.randn(2,3)

array([[ 0.72425161, -1.12818486,  1.01036888],
       [-0.54391756,  0.35028119, -0.21331236]])

In [17]:
df = pd.DataFrame(np.random.randn(6,4), index= datas, columns= dates)
df

Unnamed: 0,Mon,Tue,Thur,Fri
2020-09-15,0.083914,1.417194,-1.454125,0.392567
2020-09-16,-0.157102,2.308499,-0.429491,0.410539
2020-09-17,-0.126515,0.422358,-0.469602,1.105152
2020-09-18,1.422575,1.712835,-1.645058,1.200675
2020-09-19,0.614926,-0.173148,-0.789946,-0.567605
2020-09-20,0.176233,0.795591,1.346524,0.768018


In [18]:
df.values

array([[ 0.08391423,  1.41719395, -1.45412537,  0.39256689],
       [-0.15710182,  2.30849884, -0.42949123,  0.4105391 ],
       [-0.12651527,  0.42235809, -0.46960247,  1.10515181],
       [ 1.42257475,  1.71283458, -1.6450578 ,  1.20067455],
       [ 0.61492577, -0.17314786, -0.78994596, -0.56760538],
       [ 0.17623305,  0.7955911 ,  1.34652362,  0.76801795]])

In [224]:
df.columns

Index(['이름', '중간고사점수', '기말고사점수'], dtype='object')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2020-09-15 to 2020-09-20
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     6 non-null      float64
 1   Tue     6 non-null      float64
 2   Thur    6 non-null      float64
 3   Fri     6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [21]:
df.describe()

Unnamed: 0,Mon,Tue,Thur,Fri
count,6.0,6.0,6.0,6.0
mean,0.335672,1.080555,-0.573617,0.551557
std,0.600764,0.906328,1.066641,0.643796
min,-0.157102,-0.173148,-1.645058,-0.567605
25%,-0.073908,0.515666,-1.288081,0.39706
50%,0.130074,1.106393,-0.629774,0.589279
75%,0.505253,1.638924,-0.439519,1.020868
max,1.422575,2.308499,1.346524,1.200675


In [22]:
df.sort_values(by="Tue", ascending= True)

Unnamed: 0,Mon,Tue,Thur,Fri
2020-09-19,0.614926,-0.173148,-0.789946,-0.567605
2020-09-17,-0.126515,0.422358,-0.469602,1.105152
2020-09-20,0.176233,0.795591,1.346524,0.768018
2020-09-15,0.083914,1.417194,-1.454125,0.392567
2020-09-18,1.422575,1.712835,-1.645058,1.200675
2020-09-16,-0.157102,2.308499,-0.429491,0.410539


In [23]:
df["Mon"]

2020-09-15    0.083914
2020-09-16   -0.157102
2020-09-17   -0.126515
2020-09-18    1.422575
2020-09-19    0.614926
2020-09-20    0.176233
Freq: D, Name: Mon, dtype: float64

In [24]:
df.Mon

2020-09-15    0.083914
2020-09-16   -0.157102
2020-09-17   -0.126515
2020-09-18    1.422575
2020-09-19    0.614926
2020-09-20    0.176233
Freq: D, Name: Mon, dtype: float64

In [25]:
# A, B
df[['Tue','Mon']]

Unnamed: 0,Tue,Mon
2020-09-15,1.417194,0.083914
2020-09-16,2.308499,-0.157102
2020-09-17,0.422358,-0.126515
2020-09-18,1.712835,1.422575
2020-09-19,-0.173148,0.614926
2020-09-20,0.795591,0.176233


In [26]:
# 인덱스와 : 표기로 범위 설정하여 출력
df['2020-09-15':'2020-09-17']

Unnamed: 0,Mon,Tue,Thur,Fri
2020-09-15,0.083914,1.417194,-1.454125,0.392567
2020-09-16,-0.157102,2.308499,-0.429491,0.410539
2020-09-17,-0.126515,0.422358,-0.469602,1.105152


In [27]:
df[0:3]

Unnamed: 0,Mon,Tue,Thur,Fri
2020-09-15,0.083914,1.417194,-1.454125,0.392567
2020-09-16,-0.157102,2.308499,-0.429491,0.410539
2020-09-17,-0.126515,0.422358,-0.469602,1.105152


# loc 속성 
# 데이터들을 슬라이싱 하는 기술
# - loc[index, columns]

In [28]:
df.loc[:, ['Mon', 'Tue']]

Unnamed: 0,Mon,Tue
2020-09-15,0.083914,1.417194
2020-09-16,-0.157102,2.308499
2020-09-17,-0.126515,0.422358
2020-09-18,1.422575,1.712835
2020-09-19,0.614926,-0.173148
2020-09-20,0.176233,0.795591


In [29]:
df.iloc[:, [1]]
    

Unnamed: 0,Tue
2020-09-15,1.417194
2020-09-16,2.308499
2020-09-17,0.422358
2020-09-18,1.712835
2020-09-19,-0.173148
2020-09-20,0.795591


In [30]:
df2 = df.copy()
df2

Unnamed: 0,Mon,Tue,Thur,Fri
2020-09-15,0.083914,1.417194,-1.454125,0.392567
2020-09-16,-0.157102,2.308499,-0.429491,0.410539
2020-09-17,-0.126515,0.422358,-0.469602,1.105152
2020-09-18,1.422575,1.712835,-1.645058,1.200675
2020-09-19,0.614926,-0.173148,-0.789946,-0.567605
2020-09-20,0.176233,0.795591,1.346524,0.768018


In [31]:
# 이미 존재하는 DataFrame에 새로운 Column 추가
df['Wed'] = [1,2,3,4,5,6]
df

Unnamed: 0,Mon,Tue,Thur,Fri,Wed
2020-09-15,0.083914,1.417194,-1.454125,0.392567,1
2020-09-16,-0.157102,2.308499,-0.429491,0.410539,2
2020-09-17,-0.126515,0.422358,-0.469602,1.105152,3
2020-09-18,1.422575,1.712835,-1.645058,1.200675,4
2020-09-19,0.614926,-0.173148,-0.789946,-0.567605,5
2020-09-20,0.176233,0.795591,1.346524,0.768018,6


In [32]:
# 존재하는 시리즈 삭제
del df['Wed']
df

Unnamed: 0,Mon,Tue,Thur,Fri
2020-09-15,0.083914,1.417194,-1.454125,0.392567
2020-09-16,-0.157102,2.308499,-0.429491,0.410539
2020-09-17,-0.126515,0.422358,-0.469602,1.105152
2020-09-18,1.422575,1.712835,-1.645058,1.200675
2020-09-19,0.614926,-0.173148,-0.789946,-0.567605
2020-09-20,0.176233,0.795591,1.346524,0.768018


In [33]:
df['Wed'] = [1,2,3,np.nan,5,6]
df

Unnamed: 0,Mon,Tue,Thur,Fri,Wed
2020-09-15,0.083914,1.417194,-1.454125,0.392567,1.0
2020-09-16,-0.157102,2.308499,-0.429491,0.410539,2.0
2020-09-17,-0.126515,0.422358,-0.469602,1.105152,3.0
2020-09-18,1.422575,1.712835,-1.645058,1.200675,
2020-09-19,0.614926,-0.173148,-0.789946,-0.567605,5.0
2020-09-20,0.176233,0.795591,1.346524,0.768018,6.0


In [34]:
# 해당 시리즈에 데이터 존재 여부 확인 
df['Wed'].isin([1,3,5,7])

2020-09-15     True
2020-09-16    False
2020-09-17     True
2020-09-18    False
2020-09-19     True
2020-09-20    False
Freq: D, Name: Wed, dtype: bool

In [35]:
df.iloc[3, 4] = 4
df

Unnamed: 0,Mon,Tue,Thur,Fri,Wed
2020-09-15,0.083914,1.417194,-1.454125,0.392567,1.0
2020-09-16,-0.157102,2.308499,-0.429491,0.410539,2.0
2020-09-17,-0.126515,0.422358,-0.469602,1.105152,3.0
2020-09-18,1.422575,1.712835,-1.645058,1.200675,4.0
2020-09-19,0.614926,-0.173148,-0.789946,-0.567605,5.0
2020-09-20,0.176233,0.795591,1.346524,0.768018,6.0


In [36]:
df['Wed'].min()
df['Wed'].max()

6.0

# 이미 존재하는 파일의 내용을 기반으로 DataFrame 생성하기

In [37]:
df = pd.read_csv('0.dataset/friends.csv')
df

Unnamed: 0,이름,나이,직업,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [38]:
df.iloc[3]
df

Unnamed: 0,이름,나이,직업,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [39]:
df.iloc[3,0] = "Lee"
df

Unnamed: 0,이름,나이,직업,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,Lee,45,상담사,talk
4,강호동,38,연예인,talk


In [40]:
# Tab으로 구분된 데이터 Read해서 DataFrame 생성하기
df = pd.read_table('0.dataset/friendsTab.txt')
df

Unnamed: 0,이름,나이,직업,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [41]:
df = pd.read_csv('0.dataset/friendsTab.txt' ,delimiter = "\t")
df

Unnamed: 0,이름,나이,직업,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [42]:
df = pd.read_csv('0.dataset/friendsTabNoHead.txt' ,delimiter = "\t")
df

Unnamed: 0,신동엽,20,연예인,music
0,유재석,41,교수,art
1,김새롬,18,학생,study
2,이영자,45,상담사,talk
3,강호동,38,연예인,talk


In [43]:
df.columns = ['name', 'age', 'job', 'hobby']
df

Unnamed: 0,name,age,job,hobby
0,유재석,41,교수,art
1,김새롬,18,학생,study
2,이영자,45,상담사,talk
3,강호동,38,연예인,talk


In [44]:
df = pd.read_csv('0.dataset/friendsTabNoHead.txt' ,delimiter = "\t" , header = None)
df.columns = ['name', 'age', 'job', 'hobby']
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [45]:
df = pd.read_csv('0.dataset/friendsTabNoHead.txt' ,delimiter = "\t" , header = None, names = ['name', 'age', 'job', 'hobby'])
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [50]:
df['salary'] = 0
df

Unnamed: 0,name,age,job,hobby,salary
0,신동엽,20,연예인,music,0
1,유재석,41,교수,art,0
2,김새롬,18,학생,study,0
3,이영자,45,상담사,talk,0
4,강호동,38,연예인,talk,0


In [57]:
# np.where() - 조건식에 맞추어 적용 가능한 유용한 함수
# 조건식 df['job'] != '학생' then salary = yes, else salary = no
df['salary'] = np.where(df['job'] != '학생', 'Yes', 'No')
df

Unnamed: 0,name,age,job,hobby,salary
0,신동엽,20,연예인,music,Yes
1,유재석,41,교수,art,Yes
2,김새롬,18,학생,study,No
3,이영자,45,상담사,talk,Yes
4,강호동,38,연예인,talk,Yes


In [67]:
friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'}]
friend_dict_list

[{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby': 'music'},
 {'name': '유재석', 'age': 41, 'job': '교수', 'hobby': 'art'},
 {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby': 'study'},
 {'name': '이영자', 'age': 45, 'job': '상담사', 'hobby': 'talk'},
 {'name': '강호동', 'age': 38, 'job': '연예인', 'hobby': 'talk'}]

In [68]:
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk


In [72]:
# 컬럼 순서 변경
df = df[['hobby','job','age','name']]
df

Unnamed: 0,hobby,job,age,name
0,music,연예인,20,신동엽
1,art,교수,41,유재석
2,study,학생,18,김새롬
3,talk,상담사,45,이영자
4,talk,연예인,38,강호동


In [73]:
friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'} ]
friend_dict_list

[{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby': 'music'},
 {'name': '유재석', 'age': 41, 'job': '교수', 'hobby': 'art'},
 {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby': 'study'},
 {'name': '이영자', 'age': 45, 'job': '상담사', 'hobby': 'talk'},
 {'name': '강호동', 'age': 38, 'job': '연예인', 'hobby': 'talk'},
 {'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby': 'music'}]

In [82]:
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,20,연예인,music


In [83]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool

In [94]:
# 중복 데이터 삭제 중복 데이터 중 어떤 것을 남길지 선택할 수 있다.
df = df.drop_duplicates(keep = 'last')
df

Unnamed: 0,name,age,job,hobby
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,20,연예인,music


In [103]:
friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': 23, 'job': '연예인', 'hobby':'music'} ]
friend_dict_list
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,23,연예인,music


In [104]:
df = df.drop_duplicates(subset = 'name', keep = 'last')
df

Unnamed: 0,name,age,job,hobby
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,23,연예인,music


In [105]:
friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': 23, 'job': '연예인', 'hobby':'music'} ]
friend_dict_list
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20,연예인,music
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,23,연예인,music


In [106]:
df.drop_duplicates('name', keep ='last', inplace = True)
df

Unnamed: 0,name,age,job,hobby
1,유재석,41,교수,art
2,김새롬,18,학생,study
3,이영자,45,상담사,talk
4,강호동,38,연예인,talk
5,신동엽,23,연예인,music


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    5 non-null      object
 1   age     5 non-null      int64 
 2   job     5 non-null      object
 3   hobby   5 non-null      object
dtypes: int64(1), object(3)
memory usage: 200.0+ bytes


In [108]:
friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': None, 'job': '연예인', 'hobby':'music'} ]
friend_dict_list
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,,연예인,music


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    6 non-null      object 
 1   age     5 non-null      float64
 2   job     6 non-null      object 
 3   hobby   6 non-null      object 
dtypes: float64(1), object(3)
memory usage: 320.0+ bytes


In [111]:
df.isna()


Unnamed: 0,name,age,job,hobby
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,True,False,False


In [112]:
df['age']

0    20.0
1    41.0
2    18.0
3    45.0
4    38.0
5     NaN
Name: age, dtype: float64

In [114]:
df['age'].sum()

162.0

In [115]:
df['age'].mean()

32.4

In [117]:
#생성된 데이터 프레임 재대입 없이 현 데이터 프레임에 직접 수정하고 싶은 경우
df['age'].fillna(0,inplace = True)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,0.0,연예인,music


In [118]:
df['age'].mean()

27.0

In [119]:
df['job']

0    연예인
1     교수
2     학생
3    상담사
4    연예인
5    연예인
Name: job, dtype: object

In [120]:
v = df.groupby('job')
v

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EA4A938A08>

In [121]:
len(v)

4

In [122]:
v.groups

{'교수': Int64Index([1], dtype='int64'),
 '상담사': Int64Index([3], dtype='int64'),
 '연예인': Int64Index([0, 4, 5], dtype='int64'),
 '학생': Int64Index([2], dtype='int64')}

In [161]:
friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' :  '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                    {'name': '신동엽', 'age': None, 'job': '연예인', 'hobby':'music'} ]
friend_dict_list
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,,연예인,music


In [162]:
v = df.groupby('job')
v

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EA4B263948>

In [163]:
v['age'].mean()['연예인']

29.0

In [164]:
df.fillna(v['age'].mean()['연예인'],inplace = True)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,29.0,연예인,music


In [165]:
df.iloc[5,1] = None
print(df)
df['age'] = df.groupby("job")['age'].transform('median')


  name   age  job  hobby
0  신동엽  20.0  연예인  music
1  유재석  41.0   교수    art
2  김새롬  18.0   학생  study
3  이영자  45.0  상담사   talk
4  강호동  38.0  연예인   talk
5  신동엽   NaN  연예인  music


In [166]:
df

Unnamed: 0,name,age,job,hobby
0,신동엽,29.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,29.0,연예인,talk
5,신동엽,29.0,연예인,music


In [167]:
friend_dict_list = [{'name': '신동엽', 'age': 20, 'job': '연예인', 'hobby':'music'},
                     {'name': '유재석', 'age': 41, 'job': '교수', 'hobby':'art'},
                     {'name': '김새롬', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '이영자', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' : '강호동', 'age' : 38, 'job' : '연예인', 'hobby' : 'talk'},
                     {'name': '신동엽', 'age': None, 'job': '연예인', 'hobby':'music'},
                     {'name': '고현정', 'age': 44, 'job': '가수', 'hobby':'music'},
                     {'name': '박민영', 'age': 22, 'job': '학생', 'hobby':'art'},
                     {'name': '박서준', 'age': 18, 'job': '학생', 'hobby':'study'},
                     {'name': '박보검', 'age' : 45, 'job': '상담사', 'hobby' : 'talk'},
                     {'name' : '이효리', 'age' : 28, 'job' : '교수', 'hobby' : 'talk'},
                    {'name': '이상순', 'age': 29, 'job': '주부', 'hobby':'music'}]
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job,hobby
0,신동엽,20.0,연예인,music
1,유재석,41.0,교수,art
2,김새롬,18.0,학생,study
3,이영자,45.0,상담사,talk
4,강호동,38.0,연예인,talk
5,신동엽,,연예인,music
6,고현정,44.0,가수,music
7,박민영,22.0,학생,art
8,박서준,18.0,학생,study
9,박보검,45.0,상담사,talk


In [174]:
df.job.unique()
print(df.job.value_counts())

연예인
교수
학생
상담사
가수
주부
학생     3
연예인    3
교수     2
상담사    2
가수     1
주부     1
Name: job, dtype: int64


In [176]:
# hobby 종류에 대한 개수 출력
df.hobby.value_counts()

music    4
talk     4
study    2
art      2
Name: hobby, dtype: int64

In [194]:
l1 = [{'name': '이효리', 'job': "교수"},
      {'name': '이상순', 'job': "학생"},
      {'name': '박보검', 'job': "개발자"}]

l2 = [{'name': '신동엽', 'job': "치과의사"},
      {'name': '이영자', 'job': "농부"},
      {'name': '정찬우', 'job': "연예인"}]
         
df1 = pd.DataFrame(l1, columns = ['name', 'job'])
df2 = pd.DataFrame(l2, columns = ['name', 'job'])


In [195]:
df1

Unnamed: 0,name,job
0,이효리,교수
1,이상순,학생
2,박보검,개발자


In [196]:
df2

Unnamed: 0,name,job
0,신동엽,치과의사
1,이영자,농부
2,정찬우,연예인


In [204]:
df3 = [df1,df2]
df3

[  name  job
 0  이효리   교수
 1  이상순   학생
 2  박보검  개발자,
   name   job
 0  신동엽  치과의사
 1  이영자    농부
 2  정찬우   연예인]

In [205]:
df3[0]

Unnamed: 0,name,job
0,이효리,교수
1,이상순,학생
2,박보검,개발자


In [207]:
df3[1]

Unnamed: 0,name,job
0,신동엽,치과의사
1,이영자,농부
2,정찬우,연예인


In [210]:
df4 = pd.concat(df3, ignore_index=True)
df4

Unnamed: 0,name,job
0,이효리,교수
1,이상순,학생
2,박보검,개발자
3,신동엽,치과의사
4,이영자,농부
5,정찬우,연예인


In [220]:
df5 = pd.concat(df3,axis = 1,ignore_index=True)
df5

Unnamed: 0,0,1,2,3
0,이효리,교수,신동엽,치과의사
1,이상순,학생,이영자,농부
2,박보검,개발자,정찬우,연예인


In [222]:
df5.columns = ['name','job','name','job']
df5

Unnamed: 0,name,job,name.1,job.1
0,이효리,교수,신동엽,치과의사
1,이상순,학생,이영자,농부
2,박보검,개발자,정찬우,연예인
