## 효율적인 메모리 관리와 프로그램 작성

### 빅데이터 분석에서의 메모리 관리
- 문자열보다는 범주형
- 범위가 제한적인 정수형
- 최소한의 실수형
- 이진(binary)인 경우 Boolean(T/F)

In [1]:
import pandas as pd
import numpy as np

In [2]:
def 데이터프레임생성(size, seed=None):
    """Build a random sample DataFrame with ``size`` rows.

    Columns produced:
      * '나이'      - integers drawn from 0..99
      * '수행평가1'  - one of 'A', 'B', 'C', 'D', 'F'
      * '수행평가2'  - one of '상', '중', '하'
      * '학점'      - one of four GPA interval labels
      * '합격확률'   - floats drawn uniformly between 0 and 1
      * '결과'      - '합격' or '불합격'

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, values are drawn from a private ``RandomState`` seeded
        with this value, so the same seed always yields the same frame.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        A frame with ``size`` rows and the six columns listed above.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Dict entries are evaluated top to bottom, so the random draws happen
    # in the same order as the column order of the result.
    return pd.DataFrame({
        '나이': rng.choice(100, size),
        '수행평가1': rng.choice(['A', 'B', 'C', 'D', 'F'], size),
        '수행평가2': rng.choice(['상', '중', '하'], size),
        '학점': rng.choice(['[0,3)', '[3,3.5)', '[3.5,4)', '[4,4.3]'], size),
        '합격확률': rng.uniform(0, 1, size),
        '결과': rng.choice(['합격', '불합격'], size),
    })

In [3]:
# Generate the one-million-row sample frame.
df = 데이터프레임생성(1_000_000)

In [5]:
# Print the per-column dtypes and the memory usage of the generated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   나이      1000000 non-null  int32  
 1   수행평가1   1000000 non-null  object 
 2   수행평가2   1000000 non-null  object 
 3   학점      1000000 non-null  object 
 4   합격확률    1000000 non-null  float64
 5   결과      1000000 non-null  object 
dtypes: float64(1), int32(1), object(4)
memory usage: 42.0+ MB


In [6]:
# Two independent copies of the sample frame: df1 and df2.
df1, df2 = df.copy(), df.copy()
# Show the dtypes and memory usage of the df1 copy.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   나이      1000000 non-null  int32  
 1   수행평가1   1000000 non-null  object 
 2   수행평가2   1000000 non-null  object 
 3   학점      1000000 non-null  object 
 4   합격확률    1000000 non-null  float64
 5   결과      1000000 non-null  object 
dtypes: float64(1), int32(1), object(4)
memory usage: 42.0+ MB


### 수행작업
- 수행평가1, 학점에 따라 데이터 나누고 그 안에서 나이의 순위
- 수행평가1, 학점에 따라 데이터 나누고 그 안에서 합격확률의 순위
- 수행평가1, 학점, 결과에 따라 데이터 나누고 그 안에서 합격확률의 순위
- 수행시간 계산
    - %timeit : 반복작업을 하며 해당 프로그램을 수행하는데 걸린 시간의 평균과 표준편차 제공

In [7]:
# Time three grouped rankings on df1, whose string columns are still object dtype.
# Each statement stores its ranks in a new df1 column, so df1 carries the
# three rank columns when it is written to CSV later.
%timeit df1['순위1'] = df1.groupby(['수행평가1', '학점'])['나이'].rank()
%timeit df1['순위2'] = df1.groupby(['수행평가1', '학점'])['합격확률'].rank()
%timeit df1['순위3'] = df1.groupby(['수행평가1', '학점', '결과'])['합격확률'].rank()

299 ms ± 7.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
381 ms ± 5.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
447 ms ± 4.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## string -> 범주형

In [8]:
# Convert the three string label columns to the category dtype, one by one.
for 열 in ['수행평가1', '수행평가2', '학점']:
    df2[열] = df2[열].astype('category')
# Report the dtypes and memory usage after the conversion.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   나이      1000000 non-null  int32   
 1   수행평가1   1000000 non-null  category
 2   수행평가2   1000000 non-null  category
 3   학점      1000000 non-null  category
 4   합격확률    1000000 non-null  float64 
 5   결과      1000000 non-null  object  
dtypes: category(3), float64(1), int32(1), object(1)
memory usage: 21.9+ MB


## Downcasting

- int8 : -128 ~ 127
    - uint8 : 0 ~ 255 (unsigned : 음수 없음)
- int16 : -32768 ~ 32767
    - uint16 : 0 ~ 65535
- int32 : -2147483648 ~ 2147483647
    - uint32 : 0 ~ 4294967295
- int64 : -9223372036854775808 ~ 9223372036854775807

In [9]:
# Shrink '나이' to the smallest signed integer dtype that can hold every
# value (int8 for ages 0-99) instead of forcing int8 with an unchecked cast
# that would silently wrap values outside -128..127.
df2['나이'] = pd.to_numeric(df2['나이'], downcast='integer')
# Report the dtypes and memory usage after the downcast.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   나이      1000000 non-null  int8    
 1   수행평가1   1000000 non-null  category
 2   수행평가2   1000000 non-null  category
 3   학점      1000000 non-null  category
 4   합격확률    1000000 non-null  float64 
 5   결과      1000000 non-null  object  
dtypes: category(3), float64(1), int8(1), object(1)
memory usage: 19.1+ MB


In [10]:
# Store the probabilities as 32-bit floats instead of 64-bit floats.
df2['합격확률'] = df2['합격확률'].astype(np.float32)
# Report the dtypes and memory usage after the cast.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   나이      1000000 non-null  int8    
 1   수행평가1   1000000 non-null  category
 2   수행평가2   1000000 non-null  category
 3   학점      1000000 non-null  category
 4   합격확률    1000000 non-null  float32 
 5   결과      1000000 non-null  object  
dtypes: category(3), float32(1), int8(1), object(1)
memory usage: 15.3+ MB


In [11]:
# Replace the two result labels with booleans: '합격' -> True, '불합격' -> False.
df2['결과'] = df2['결과'].map(dict(합격=True, 불합격=False))
# Report the dtypes and memory usage after the mapping.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   나이      1000000 non-null  int8    
 1   수행평가1   1000000 non-null  category
 2   수행평가2   1000000 non-null  category
 3   학점      1000000 non-null  category
 4   합격확률    1000000 non-null  float32 
 5   결과      1000000 non-null  bool    
dtypes: bool(1), category(3), float32(1), int8(1)
memory usage: 8.6 MB


In [20]:
# Repeat the same three grouped rankings on df2, after its columns were
# converted to category / int8 / float32 / bool.
# Each statement stores its ranks in a new df2 column.
%timeit df2['순위1'] = df2.groupby(['수행평가1', '학점'])['나이'].rank()
%timeit df2['순위2'] = df2.groupby(['수행평가1', '학점'])['합격확률'].rank()
%timeit df2['순위3'] = df2.groupby(['수행평가1', '학점', '결과'])['합격확률'].rank()

161 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
290 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
310 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# Show df2 again now that the three rank columns have been added.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   나이      1000000 non-null  int8    
 1   수행평가1   1000000 non-null  category
 2   수행평가2   1000000 non-null  category
 3   학점      1000000 non-null  category
 4   합격확률    1000000 non-null  float32 
 5   결과      1000000 non-null  bool    
 6   순위1     1000000 non-null  float64 
 7   순위2     1000000 non-null  float64 
 8   순위3     1000000 non-null  float64 
dtypes: bool(1), category(3), float32(1), float64(3), int8(1)
memory usage: 31.5 MB


In [22]:
# 파일 저장
변수 = ['나이', '수행평가1', '수행평가2', '학점', '합격확률', '결과']
df2 = df2[변수]
df1.to_csv('BSA03_df1.csv', index = False)
df2.to_csv('BSA03_df2.csv', index = False)
df1csv = pd.read_csv('BSA03_df1.csv')
df2csv = pd.read_csv('BSA03_df2.csv')

In [23]:
# Inspect the dtypes and memory usage of df1 after the CSV round trip.
df1csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   나이      1000000 non-null  int64  
 1   수행평가1   1000000 non-null  object 
 2   수행평가2   1000000 non-null  object 
 3   학점      1000000 non-null  object 
 4   합격확률    1000000 non-null  float64
 5   결과      1000000 non-null  object 
 6   순위1     1000000 non-null  float64
 7   순위2     1000000 non-null  float64
 8   순위3     1000000 non-null  float64
dtypes: float64(4), int64(1), object(4)
memory usage: 68.7+ MB


In [24]:
# Inspect df2 after the CSV round trip: the category / int8 / float32 dtypes
# set earlier come back as object / int64 / float64; only bool is kept.
df2csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   나이      1000000 non-null  int64  
 1   수행평가1   1000000 non-null  object 
 2   수행평가2   1000000 non-null  object 
 3   학점      1000000 non-null  object 
 4   합격확률    1000000 non-null  float64
 5   결과      1000000 non-null  bool   
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 39.1+ MB


In [25]:
# Preview the first rows of the frame read back from CSV.
df2csv.head()

Unnamed: 0,나이,수행평가1,수행평가2,학점,합격확률,결과
0,32,B,상,"[3,3.5)",0.843094,False
1,13,C,상,"[4,4.3]",0.048789,False
2,30,A,중,"[3,3.5)",0.116601,True
3,64,A,상,"[3,3.5)",0.944771,True
4,60,D,상,"[3.5,4)",0.336185,True


In [26]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-11.0.0-cp39-cp39-win_amd64.whl (20.6 MB)
     ---------------------------------------- 0.0/20.6 MB ? eta -:--:--
      --------------------------------------- 0.3/20.6 MB 9.6 MB/s eta 0:00:03
     - -------------------------------------- 0.7/20.6 MB 7.5 MB/s eta 0:00:03
     -- ------------------------------------- 1.2/20.6 MB 9.2 MB/s eta 0:00:03
     --- ------------------------------------ 1.6/20.6 MB 8.5 MB/s eta 0:00:03
     --- ------------------------------------ 2.0/20.6 MB 9.1 MB/s eta 0:00:03
     ---- ----------------------------------- 2.5/20.6 MB 9.4 MB/s eta 0:00:02
     ----- ---------------------------------- 2.9/20.6 MB 9.2 MB/s eta 0:00:02
     ------ --------------------------------- 3.3/20.6 MB 9.3 MB/s eta 0:00:02
     ------- -------------------------------- 3.8/20.6 MB 9.3 MB/s eta 0:00:02
     -------- ------------------------------- 4.2/20.6 MB 9.4 MB/s eta 0:00:02
     --------- ------------------------------ 4.7/20.6 M

In [27]:
# Write df2 to a Parquet file and load it straight back.
df2.to_parquet(path='BSA03_df2.parquet')
df2pqt = pd.read_parquet(path='BSA03_df2.parquet')
# Report the dtypes and memory usage of the frame read back from Parquet.
df2pqt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype   
---  ------  --------------    -----   
 0   나이      1000000 non-null  int8    
 1   수행평가1   1000000 non-null  category
 2   수행평가2   1000000 non-null  category
 3   학점      1000000 non-null  category
 4   합격확률    1000000 non-null  float32 
 5   결과      1000000 non-null  bool    
dtypes: bool(1), category(3), float32(1), int8(1)
memory usage: 8.6 MB


## 효율적인 프로그램

### 수행작업
"평가"라는 새로운 변수에   
- "나이"가 65세 미만이거나, 또는 ("합격확률"이 0.6 이상이면서 "학점"이 [4,4.3])이면 "수행평가1"을 대입
- 위 조건이 아니면 "수행평가2"를 대입

In [28]:
def 변수추가(행자료):
    """Choose the value of the new '평가' column for a single row.

    Parameters
    ----------
    행자료 : mapping-like row
        One row, indexable by the column names '나이', '합격확률', '학점',
        '수행평가1' and '수행평가2' (for example a row Series).

    Returns
    -------
    The row's '수행평가1' when '나이' is below 65, or when '합격확률' is at
    least 0.6 and '학점' equals '[4,4.3]'; otherwise the row's '수행평가2'.
    """
    # Logical and/or on scalars: the age test short-circuits, so the other
    # two fields are only inspected for rows aged 65 or more.
    if 행자료['나이'] < 65 or (
        행자료['합격확률'] >= 0.6 and 행자료['학점'] == '[4,4.3]'
    ):
        return 행자료['수행평가1']
    return 행자료['수행평가2']

## Loop를 이용한 프로그램

In [29]:
# Smaller 100,000-row frame, plus three independent copies of it
# (df1, df2, df3).
df = 데이터프레임생성(100_000)
df1, df2, df3 = (df.copy() for _ in range(3))

In [30]:
%%timeit
# Row-by-row version: visit every row with iterrows() and write the value
# returned by 변수추가 into the '평가' column one cell at a time via .loc.
for index, row in df1.iterrows():
    df1.loc[index, '평가'] = 변수추가(row)

42.8 s ± 440 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Apply를 이용한 프로그램

In [31]:
%%timeit
# apply version: call 변수추가 once per row (axis=1) and assign all results
# to the '평가' column in a single step.
df2['평가'] = df2.apply(변수추가, axis = 1)

816 ms ± 6.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Vectorized를 이용한 프로그램

In [37]:
# Boolean mask built from whole columns: True for rows that should take '수행평가1'.
(df3['나이'] < 65) | ((df3['합격확률'] >= 0.6) & (df3['학점'] == '[4,4.3]'))

0         True
1         True
2         True
3         True
4         True
         ...  
99995     True
99996     True
99997    False
99998     True
99999     True
Length: 100000, dtype: bool

In [38]:
%%timeit
df3['평가'] = df3['수행평가2']
조건 = (df3['나이'] < 65) | ((df3['합격확률'] >= 0.6) & (df3['학점'] == '[4,4.3]'))
df3.loc[조건, '평가'] = df['수행평가1']

12.2 ms ± 95.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
