# 회귀분석 - 범주형 데이터 전처리

* 선형회귀 분석의 독립변수로 범주형 데이터를 직접 사용할 수 없다.
* 범주형 데이터를 숫자로 변환하여 회귀모형을 생성해야 한다.
    * Label encoding
    * One-hot encoding

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression

In [2]:
# Load the tips dataset; the path is relative to the current working directory.
tips = pd.read_csv('../../data/tips.csv')

In [3]:
# Inspect column dtypes and non-null counts; object columns are the categorical ones.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [4]:
# Independent variables: every column of `tips` except the target `tip`.
X = tips.loc[:, [
    'total_bill',
    'sex',
    'smoker',
    'day',
    'time',
    'size',
]]

In [5]:
# Dependent variable. Double brackets keep it as a one-column DataFrame (2-D)
# rather than a Series.
y = tips[['tip']]

In [6]:
# Preview the first rows of the feature table (still contains string columns).
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [7]:
# Preview the first rows of the target column.
y.head()

Unnamed: 0,tip
0,1.01
1,1.66
2,3.5
3,3.31
4,3.61


In [8]:
# Ordinary least squares estimator with scikit-learn's default settings.
model = LinearRegression()

In [9]:
model.fit(X, y)  # Fails: categorical (string) columns cannot be used as independent variables.

ValueError: could not convert string to float: 'Female'

### 범주형 데이터 변환

* One-hot encoding을 사용하여 범주형 데이터를 변환해야 한다.

In [10]:
# One-hot encode every categorical (object) column of X.
# drop_first=True omits the first level of each variable (e.g. only sex_Male is
# kept) so the dummy columns are not perfectly collinear with the intercept.
# dtype=int pins the indicators to 0/1 integers: the default dtype depends on
# the pandas version (bool from pandas 2.0 onward), which would otherwise show
# True/False instead of 0/1.
X_new = pd.get_dummies(X, drop_first=True, dtype=int)

In [11]:
X_new.head()

Unnamed: 0,total_bill,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,2,0,0,0,1,0,0
1,10.34,3,1,0,0,1,0,0
2,21.01,3,1,0,0,1,0,0
3,23.68,2,1,0,0,1,0,0
4,24.59,4,0,0,0,1,0,0


In [12]:
# Refit the same estimator on the numeric, one-hot encoded features.
model.fit(X_new, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
# Fitted slopes, one per column of X_new in column order; the array is 2-D
# because the target was passed as a one-column DataFrame.
model.coef_

array([[ 0.09448701,  0.175992  , -0.03244094, -0.08640832, -0.12145838,
        -0.02548066, -0.1622592 ,  0.0681286 ]])

In [14]:
# Coefficient of determination (R^2), evaluated on the same data used for fitting.
model.score(X_new, y)

0.4700781232206078