# DataFrame 정제
: NaN으로 빠진 값(결측치, missing value)이나 정상적이지 않은 값(이상치)의 정제

In [5]:
import numpy as np
import pandas as pd

In [6]:
# Build a 6x4 DataFrame filled with random numbers generated by NumPy.
df = pd.DataFrame(np.random.rand(6,4)) # uniform samples from the half-open interval [0, 1): 0.0 can occur, 1.0 cannot
df

Unnamed: 0,0,1,2,3
0,0.888056,0.830123,0.629305,0.473251
1,0.631434,0.484542,0.368093,0.81759
2,0.345881,0.286498,0.914134,0.929642
3,0.975457,0.998185,0.944509,0.604923
4,0.851535,0.864722,0.33716,0.893976
5,0.666183,0.865552,0.852561,0.480167


In [7]:
# Give the columns names and replace the default integer index with dates.
df.columns = ['A','B','C','D']
df.index = pd.date_range("20220701",periods=6) # pandas date helper: 6 consecutive daily timestamps starting 2022-07-01
df

Unnamed: 0,A,B,C,D
2022-07-01,0.888056,0.830123,0.629305,0.473251
2022-07-02,0.631434,0.484542,0.368093,0.81759
2022-07-03,0.345881,0.286498,0.914134,0.929642
2022-07-04,0.975457,0.998185,0.944509,0.604923
2022-07-05,0.851535,0.864722,0.33716,0.893976
2022-07-06,0.666183,0.865552,0.852561,0.480167


In [8]:
# Check the index type (a DatetimeIndex with daily frequency).
df.index

DatetimeIndex(['2022-07-01', '2022-07-02', '2022-07-03', '2022-07-04',
               '2022-07-05', '2022-07-06'],
              dtype='datetime64[ns]', freq='D')

In [55]:
# Create a new column F and fill in its values; np.nan marks the missing entries.
df['F'] = [1.0,np.nan,3.5,6.1,np.nan,7.0]
df

Unnamed: 0,A,B,C,D,F
2022-07-01,0.888056,0.830123,0.629305,0.473251,1.0
2022-07-02,0.631434,0.484542,0.368093,0.81759,
2022-07-03,0.345881,0.286498,0.914134,0.929642,3.5
2022-07-04,0.975457,0.998185,0.944509,0.604923,6.1
2022-07-05,0.851535,0.864722,0.33716,0.893976,
2022-07-06,0.666183,0.865552,0.852561,0.480167,7.0


In [10]:
# Drop every row that contains at least one NaN.
# The result is a new frame; df itself keeps all six rows.
df.dropna(how='any') # 'any' is the default for how: a single NaN is enough to drop the row

Unnamed: 0,A,B,C,D,F
2022-07-01,0.888056,0.830123,0.629305,0.473251,1.0
2022-07-03,0.345881,0.286498,0.914134,0.929642,3.5
2022-07-04,0.975457,0.998185,0.944509,0.604923,6.1
2022-07-06,0.666183,0.865552,0.852561,0.480167,7.0


In [12]:
# Drop a row only when every value in it is NaN (no such row here, so all rows remain).
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2022-07-01,0.888056,0.830123,0.629305,0.473251,1.0
2022-07-02,0.631434,0.484542,0.368093,0.81759,
2022-07-03,0.345881,0.286498,0.914134,0.929642,3.5
2022-07-04,0.975457,0.998185,0.944509,0.604923,6.1
2022-07-05,0.851535,0.864722,0.33716,0.893976,
2022-07-06,0.666183,0.865552,0.852561,0.480167,7.0


In [15]:
# Replace NaN with a specific value.
# Neither call modifies df: each returns a new object, and only the last
# expression (column F with NaN replaced by 2.0) is displayed.
df.fillna(value=5.0)
df.F.fillna(value=2.0)

2022-07-01    1.0
2022-07-02    2.0
2022-07-03    3.5
2022-07-04    6.1
2022-07-05    2.0
2022-07-06    7.0
Freq: D, Name: F, dtype: float64

In [31]:
# Boolean mask of the missing values (True where a cell is NaN); usable for lookups and replacement.
df.isnull()

Unnamed: 0,A,B,C,D,F
2022-07-01,False,False,False,False,False
2022-07-02,False,False,False,False,True
2022-07-03,False,False,False,False,False
2022-07-04,False,False,False,False,False
2022-07-05,False,False,False,False,True
2022-07-06,False,False,False,False,False


In [29]:
# Select the rows whose value in column F is NaN, using the F column of the mask as a row filter.
df.loc[df.isnull()['F'],:]

Unnamed: 0,A,B,C,D,F
2022-07-02,0.631434,0.484542,0.368093,0.81759,
2022-07-05,0.851535,0.864722,0.33716,0.893976,


In [39]:
# Remove a row by its index label.
# NOTE(review): the recorded output below shows 2022-07-02 removed and 2022-07-01 kept,
# so it appears to come from the string-label variant; re-run the cell to refresh it.
df.drop(pd.to_datetime("20220701")) # passing a Timestamp is the more precise way to name the label
# Alternative: pass the label as a date string.
# df.drop("2022-07-02" )
# Alternative: pass a list of Timestamps to drop several rows at once.
# df.drop([pd.to_datetime("20220701"),pd.to_datetime("20220702")])


Unnamed: 0,A,B,C,D,F
2022-07-01,0.888056,0.830123,0.629305,0.473251,1.0
2022-07-03,0.345881,0.286498,0.914134,0.929642,3.5
2022-07-04,0.975457,0.998185,0.944509,0.604923,6.1
2022-07-05,0.851535,0.864722,0.33716,0.893976,
2022-07-06,0.666183,0.865552,0.852561,0.480167,7.0


In [57]:
# Keep a separate copy of the current frame under the name df2.
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D,F
2022-07-01,0.888056,0.830123,0.629305,0.473251,1.0
2022-07-02,0.631434,0.484542,0.368093,0.81759,
2022-07-03,0.345881,0.286498,0.914134,0.929642,3.5
2022-07-04,0.975457,0.998185,0.944509,0.604923,6.1
2022-07-05,0.851535,0.864722,0.33716,0.893976,
2022-07-06,0.666183,0.865552,0.852561,0.480167,7.0


In [60]:
# Remove a column.
# In-place alternative:
# del df['F']
# drop() returns a new frame without column F.
df.drop("F",axis="columns")



Unnamed: 0,A,B,C,D
2022-07-01,0.888056,0.830123,0.629305,0.473251
2022-07-02,0.631434,0.484542,0.368093,0.81759
2022-07-03,0.345881,0.286498,0.914134,0.929642
2022-07-04,0.975457,0.998185,0.944509,0.604923
2022-07-05,0.851535,0.864722,0.33716,0.893976
2022-07-06,0.666183,0.865552,0.852561,0.480167


## 분석용 함수 사용하기

In [74]:
# Deliberately failing example: 'one' holds 5 values but 'two' holds only 4,
# so the DataFrame constructor raises
# "ValueError: All arrays must be of the same length".
dic01 = {'one':[1.40,7,10,np.nan,0.75],'two':[np.nan,-4.5,np.nan,-1.3]}

pd.DataFrame(dic01)

ValueError: All arrays must be of the same length

In [182]:
# Small frame with deliberately missing values for the aggregation examples:
# four rows labelled a-d and two columns, 'one' and 'two'.
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(data, index=list('abcd'), columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [183]:
# Sum down the rows: one total per column, with NaN entries skipped.
df.sum(axis='rows')

one    9.25
two   -5.80
dtype: float64

In [184]:
# Sum across the columns: one total per row; a row made up only of NaN sums to 0.0.
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [185]:
# A positional slice df[1:2] keeps row 'b' as a one-row DataFrame,
# so the row-wise sum is still a Series.
print(df[1:2].sum(axis=1))
print(type(df[1:2]))

b    2.6
dtype: float64
<class 'pandas.core.frame.DataFrame'>


In [186]:
# Selecting a single row with .loc (by label) or .iloc (by position) yields a Series,
# so sum() returns a plain scalar.
print(type(df.loc['b',:]), df.loc['b',:].sum())
print(df.iloc[1].sum())

<class 'pandas.core.series.Series'> 2.5999999999999996
2.5999999999999996


In [187]:
# Total of a single column, returned as a scalar (NaN skipped).
df['one'].sum()

9.25

In [188]:
# Mean of each column; NaN entries are left out of both the sum and the count.
df.mean(axis=0)

one    3.083333
two   -2.900000
dtype: float64

In [189]:
# Variance of each column over its non-NaN values (sample variance, i.e. divided by n - 1).
df.var(axis=0)

one    12.205833
two     5.120000
dtype: float64

In [190]:
# Row means without skipping NaN.
df.mean(axis='columns', skipna=False) # skipna=False: any NaN in a row makes that row's mean NaN

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [168]:
### Fill the NaN in 'one' with the mean of its remaining values and the NaN in
### 'two' with the column's smallest value.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# df['one']: that chained in-place call acts on an intermediate Series, which
# recent pandas versions warn about and which no longer updates df once
# Copy-on-Write is active.
df['one'] = df['one'].fillna(df['one'].mean())
df['two'] = df['two'].fillna(df['two'].min())

In [191]:
# Compute the fill values first, then write the filled columns back:
# 'one' gets its own mean, 'two' gets its own minimum.
one_mean = df['one'].mean()
two_min = df['two'].min()
df['one'] = df['one'].fillna(one_mean)
df['two'] = df['two'].fillna(two_min)
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.083333,-4.5
d,0.75,-1.3


---
# DataFrame Merging(병합)

In [207]:
# Left-hand frame for the merge examples: repeated key labels plus a running counter.
df1 = pd.DataFrame({
    "key": ["b", "b", "a", "c", "a", "a", "b"],
    "data1": range(7),
})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [208]:
# Right-hand frame for the merge examples: one row per key, including a key ('d') absent from df1.
df2 = pd.DataFrame({
    "key": ["a", "b", "d"],
    "data2": range(3),
})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [209]:
# Merge the two frames on their shared 'key' column; only keys present in both appear in the result.
pd.merge(df1, df2, on= "key")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


> key c와 d는 서로 매칭되지 않으므로 출력되지 않음 (양쪽 모두에 있는 키에 대해서만 행이 만들어짐)

In [210]:
# Outer join: keys from either frame are kept, and the side without a match is filled with NaN.
pd.merge(df1,df2, on="key",how = "outer") 

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [211]:
# Left join: keep every row of df1; keys missing from df2 get NaN in data2.
pd.merge(df1,df2,on = "key",how="left")

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [212]:
# Right join: keep every key of df2; keys missing from df1 get NaN in data1.
pd.merge(df1,df2,on = "key",how="right")

Unnamed: 0,key,data1,data2
0,a,2.0,0
1,a,4.0,0
2,a,5.0,0
3,b,0.0,1
4,b,1.0,1
5,b,6.0,1
6,d,,2


In [216]:
### Both frames now contain repeated key values.
df1 = pd.DataFrame({
    "key": ["b", "b", "a", "c", "a", "b"],
    "data1": range(6),
})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [217]:
# Right-hand frame in which keys 'a' and 'b' each occur twice.
df2 = pd.DataFrame({
    'key': ['a', 'b', 'a', 'b', 'd'],
    'data2': range(5),
})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [219]:
# With duplicated keys on both sides, every left row is paired with every matching right row
# (3 'b' rows x 2 'b' rows = 6 rows, 2 'a' rows x 2 'a' rows = 4 rows).
pd.merge(df1,df2,on='key',how='inner') # 'inner' is the default for how

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


## Key name이 다를 경우

In [225]:
# Same data as before, but the key column is named differently in each frame.
df1 = pd.DataFrame({
    "lkey": ["b", "b", "a", "c", "a", "b"],
    "data1": range(6),
})
df2 = pd.DataFrame({
    'rkey': ['a', 'b', 'a', 'b', 'd'],
    'data2': range(5),
})

In [228]:
# Name the key column of each side explicitly; both key columns are kept in the result.
pd.merge(df1,df2, left_on='lkey', right_on='rkey',how='inner')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,0,b,3
2,b,1,b,1
3,b,1,b,3
4,b,5,b,1
5,b,5,b,3
6,a,2,a,0
7,a,2,a,2
8,a,4,a,0
9,a,4,a,2


### 하나의 key값으로 병합하는 경우

In [230]:
# df1 carries the key as a regular column; df2 carries it as its index labels.
df1 = pd.DataFrame({
    "key": ["b", "b", "a", "c", "a", "b"],
    "data1": range(6),
})
df2 = pd.DataFrame([3.5, 7], index=['a', 'b'], columns=['group_val'])
df2

Unnamed: 0,group_val
a,3.5
b,7.0


In [231]:
# Match df1's 'key' column against df2's index labels; the result keeps df1's row labels.
pd.merge(df1,df2 , left_on='key',right_index=True , how= 'inner')

Unnamed: 0,key,data1,group_val
0,b,0,7.0
1,b,1,7.0
5,b,5,7.0
2,a,2,3.5
4,a,4,3.5


---
## Data Concatenation(연결)

In [234]:
# First Series for the concatenation examples, labelled 'a' and 'b'.
s1 = pd.Series({'a': 0, 'b': 1})
s1

a    0
b    1
dtype: int64

In [236]:
# Second Series, labelled 'c' to 'e' (no labels shared with s1).
s2 = pd.Series({'c': 2, 'd': 3, 'e': 4})
s2

c    2
d    3
e    4
dtype: int64

In [237]:
# Third Series, labelled 'f' and 'g'.
s3 = pd.Series({'f': 5, 'g': 6})
s3

f    5
g    6
dtype: int64

In [238]:
# Join s1, s2 and s3 end to end into a single Series.
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [240]:
# Build a DataFrame by placing the Series side by side: each Series becomes a column,
# and labels a Series does not have show up as NaN.
pd.concat([s1,s2,s3],axis=1, sort=True)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [241]:
# The column names can be supplied during the concat itself through keys.
pd.concat([s1,s2,s3],axis=1, sort=True, keys = ['s1','s2','s3'])

Unnamed: 0,s1,s2,s3
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


---
## DataFrame의 Concatenation


In [252]:
# 3x2 frame holding 0..5, with rows a-c and columns 'one' and 'two'.
df1 = pd.DataFrame(
    np.arange(6).reshape(-1, 2),
    columns=['one', 'two'],
    index=list('abc'),
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [258]:
# 2x2 frame holding 5..8, with rows a-b and columns 'three' and 'four'.
df2 = pd.DataFrame(
    np.arange(4).reshape(2, 2) + 5,
    columns=['three', 'four'],
    index=list('ab'),
)
df2

Unnamed: 0,three,four
a,5,6
b,7,8


In [261]:
# Put df1 and df2 side by side; row 'c' exists only in df1, so df2's columns are NaN there.
# Open question from the author: does sort order by the index?
# NOTE(review): sort applies to the axis that is NOT being concatenated - the index here;
# in the axis=0 call further down it is the column labels that come out sorted.
pd.concat([df1, df2],axis=1,sort=True)

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,7.0,8.0
c,4,5,,


In [266]:
# ignore_index discards the labels along the concatenation axis; with axis=1 those are
# the column names, which are replaced by 0..3.
pd.concat([df1, df2],axis=1,sort=True,ignore_index=True)


Unnamed: 0,0,1,2,3
a,0,1,5.0,6.0
b,2,3,7.0,8.0
c,4,5,,


In [267]:
# Stack the frames vertically: the row labels are renumbered 0..4 (ignore_index) and the
# combined column labels are sorted alphabetically (sort=True).
pd.concat([df1, df2],axis=0,sort=True,ignore_index=True)

Unnamed: 0,four,one,three,two
0,,0.0,,1.0
1,,2.0,,3.0
2,,4.0,,5.0
3,6.0,,5.0,
4,8.0,,7.0,


---
## 데이터프레임 중복값 제거

In [272]:
# Frame with repeated (k1, k2) pairs for the duplicate-handling examples.
df = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [278]:
# Flag duplicated rows.
df.duplicated() # a row is True only when all of its values match an earlier row; the first occurrence stays False

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [279]:
# Remove the duplicated rows, keeping the first occurrence of each.
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [282]:
# Add a new column v1 holding 0..6, which makes every row distinct.
df['v1'] = np.arange(7)
df

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [283]:
# No complete row repeats any more (v1 differs on every row), so nothing is removed.
df.drop_duplicates()

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [284]:
# Distinct values of the k1 column, each kept at the label where it first appears.
df['k1'].drop_duplicates()

0    one
3    two
Name: k1, dtype: object

In [288]:
# Judge duplicates by k1 alone and keep the first row for each k1 value.
df.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [286]:
# Judge duplicates by k1 alone, but keep the last row for each k1 value.
# NOTE(review): the original note said only first/last can be chosen; pandas also
# documents keep=False (drop every duplicated row) - confirm for the installed version.
df.drop_duplicates(['k1'],keep='last') # keep chooses which occurrence survives

Unnamed: 0,k1,k2,v1
2,one,2,2
6,two,4,6


## Category 사용하기

In [290]:
# Six records with a one-letter raw grade each, used for the category examples.
df3 = pd.DataFrame({
    'id': range(1, 7),
    'raw_grade': ['a', 'b', 'b', 'a', 'a', 'e'],
})
df3

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [291]:
# Convert the raw grades to the category dtype and store them in a new 'grade' column.
df3['grade'] = df3['raw_grade'].astype('category')
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [294]:
# Inspect the categorical column: dtype is category with the three categories 'a', 'b', 'e'.
df3['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [296]:
# Relabel the categories: the new names map positionally onto the existing
# categories ['a', 'b', 'e'].
# Assigning to `.cat.categories` was deprecated in pandas 1.5 and removed in 2.0,
# and it relied on mutating a temporary Series; rename_categories returns a new
# Series, which is stored back into the frame explicitly.
df3['grade'] = df3['grade'].cat.rename_categories(['very good', 'good', 'very bad'])
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [297]:
# Sort by the categorical column: rows follow the category order
# (very good, good, very bad) rather than alphabetical order.
df3.sort_values(by='grade')
# Alternative: restore the original row order by index.
#df3.sort_index()

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bad


### 데이터의 범위 정하기

In [299]:
# Sample ages and the bin edges used to group them.
ages=[20,22,25,27,21,23,37,31,61,45,41,32]
bins=[18,25,35,60,100]

In [301]:
# Assign each age to its bin; the intervals are open on the left and closed on the right,
# e.g. 25 falls in (18, 25].
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [303]:
# Integer code of the bin each age fell into (0 is the first bin).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [305]:
# Number of ages that fell into each bin.
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [306]:
# Give the bins readable names instead of interval labels (one label per bin, in bin order).
# NOTE(review): 'ModdleAged' looks like a typo for 'MiddleAged'; it is left unchanged here
# because the label is a runtime string that also appears in the recorded output.
group_names =['Youth','YoungAdult','ModdleAged','Senior']
cat2 = pd.cut(ages,bins,labels=group_names)
cat2.value_counts()

Youth         5
YoungAdult    3
ModdleAged    3
Senior        1
dtype: int64

In [311]:
# Place s1 and s2 side by side as columns 0 and 1; labels missing from one Series become NaN.
pd.concat([s1,s2],axis=1)

Unnamed: 0,0,1
a,0.0,
b,1.0,
c,,2.0
d,,3.0
e,,4.0


---
## 낙서장

In [None]:
# Scratch: which axis does sort affect? Compare the vertical concat with and without sort=True.
# (Only the last expression would be displayed when the cell is run.)
pd.concat([df1, df2],axis=0)
pd.concat([df1, df2],axis=0,sort=True)

In [None]:

# Scratch: concatenate the three Series end to end, tagging each one with a key,
# then print the type and contents of the result.
temp=pd.concat([s1,s2,s3],keys=['s1','s2','s3'])

print(type(temp),temp)

In [178]:
# Scratch: raises KeyError because the labels 'B' and 'F' are not among the columns
# of the frame that df referred to when this cell was run.
df.drop(['B','F'],axis='columns')

KeyError: "['B' 'F'] not found in axis"

In [45]:
# Summarize the missing-value mask per column: count, number of distinct values,
# the most frequent value and how often it occurs.
df.isnull().describe(include='all')

Unnamed: 0,A,B,C,D,F
count,6,6,6,6,6
unique,1,1,1,1,2
top,False,False,False,False,False
freq,6,6,6,6,4


In [67]:
# Structural summary of the missing-value mask: index range, column dtypes (all bool) and memory use.
df.isnull().info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2022-07-01 to 2022-07-06
Freq: D
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       6 non-null      bool 
 1   B       6 non-null      bool 
 2   C       6 non-null      bool 
 3   D       6 non-null      bool 
 4   F       6 non-null      bool 
dtypes: bool(5)
memory usage: 250.0 bytes


In [49]:
# Scratch: drop two rows at once by passing a list of Timestamps.
# Both calls remove 2022-07-01 and 2022-07-02 ("20220701" and "2022-07-01" name the same date);
# only the result of the second call is displayed.
df.drop([pd.to_datetime("20220701"),pd.to_datetime("20220702")])
df.drop([pd.to_datetime("2022-07-01"),pd.to_datetime("20220702")])

Unnamed: 0,A,B,C,D,F
2022-07-03,0.345881,0.286498,0.914134,0.929642,3.5
2022-07-04,0.975457,0.998185,0.944509,0.604923,6.1
2022-07-05,0.851535,0.864722,0.33716,0.893976,
2022-07-06,0.666183,0.865552,0.852561,0.480167,7.0
