In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [3]:
plt.rcParams['axes.unicode_minus'] = False

# 데이터 준비

In [4]:
df = pd.read_csv("./data/auto-mpg.csv", header = None)

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [6]:
# auto-mpg 열 이름 지정
df.columns = ['mpg',
              'cylinders',
              'displacement',
              'horsepower',
              'weight',
              'acceleration',
              'model year',
              'origin',
              'name']

In [7]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


# 데이터 탐색

In [52]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin,weight2,weight0.5,weight0.7,weight1.3
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864,9538762.0,53.964181,267.521792,33215.788658
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055,5453778.0,7.644513,53.20896,12337.144998
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0,2601769.0,40.162171,175.931683,14788.518794
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0,4945066.0,47.156652,220.272437,22449.763918
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,7859624.0,52.948077,259.054011,30339.670548
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0,13017670.0,60.066628,309.091142,42115.940848
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0,26419600.0,71.693793,395.98085,66719.38811


In [53]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model year        int64
origin            int64
name             object
weight2         float64
weight0.5       float64
weight0.7       float64
weight1.3       float64
dtype: object

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
 9   weight2       398 non-null    float64
 10  weight0.5     398 non-null    float64
 11  weight0.7     398 non-null    float64
 12  weight1.3     398 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 40.6+ KB


# 데이터 전처리

- horsepower 컬럼 수치형데이터로 변환

In [55]:
df["horsepower"].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [56]:
df["horsepower"].replace("?", np.nan, inplace = True) # ?를 np.nan으로 변경

In [57]:
df.dropna(subset = ["horsepower"], axis = 0, inplace = True) # 누락데이터 행을 삭제

In [58]:
df["horsepower"] = df["horsepower"].astype("float") # 문자열을 실수형으로 변환

In [9]:
df["weight2"] = pow(df["weight"], 2)

In [10]:
df["weight0.5"] = pow(df["weight"], 1/2)

In [11]:
df["weight0.7"] = pow(df["weight"], 7/10)

In [12]:
df["weight1.3"] = pow(df["weight"], 13/10)

In [13]:
df["weight0.5"] = pow(df["weight"], 1/2)

# 변수 선택

In [59]:
df.corr()

  df.corr()


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,weight2,weight0.5,weight0.7,weight1.3
mpg,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541,0.565209,-0.806682,-0.840095,-0.837391,-0.825859
cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647,-0.568932,0.890839,0.893465,0.895704,0.897534
displacement,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855,-0.614535,0.928779,0.928484,0.930836,0.93353
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171,0.86972,0.85677,0.860292,0.86753
weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912,-0.585005,0.992019,0.997883,0.999241,0.999256
acceleration,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316,0.212746,-0.426547,-0.409829,-0.412793,-0.420378
model year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0,0.181528,-0.325214,-0.298771,-0.303082,-0.314616
origin,0.565209,-0.568932,-0.614535,-0.455171,-0.585005,0.212746,0.181528,1.0,-0.55202,-0.598278,-0.593272,-0.57591
weight2,-0.806682,0.890839,0.928779,0.86972,0.992019,-0.426547,-0.325214,-0.55202,1.0,0.981797,0.98639,0.996135
weight0.5,-0.840095,0.893465,0.928484,0.85677,0.997883,-0.409829,-0.298771,-0.598278,0.981797,1.0,0.999658,0.994638


In [61]:
# 분석에 활용할 열 선택
ndf = df[["mpg", "cylinders", "horsepower", "weight", "displacement"]]
ndf.head()

Unnamed: 0,mpg,cylinders,horsepower,weight,displacement
0,18.0,8,130.0,3504.0,307.0
1,15.0,8,165.0,3693.0,350.0
2,18.0,8,150.0,3436.0,318.0
3,16.0,8,150.0,3433.0,304.0
4,17.0,8,140.0,3449.0,302.0


# 데이터셋 분할

In [62]:
x = ndf.drop("mpg", axis = 1) # 독립변수 x1, x2, x3, x4
y = ndf[["mpg"]] # 종속변수 y

In [64]:
# train data와 test data 분할(7:3 비율)
x_train, x_test, y_train, y_test = train_test_split(x, # 독립변수 
                                                    y, # 종속변수
                                                    test_size = 0.3, # 검증 30%
                                                    random_state = 2)

In [65]:
print(len(x_train), len(x_test))

274 118


# 선형회귀모형

In [66]:
# 선형 회귀 모형 객체 생성
lr = LinearRegression()

In [67]:
# train data로 모형 학습
lr.fit(x_train, y_train)

In [68]:
# 학습을 마친 모형에 test data를 적용하여 결정계수(R제곱) 계산
r_square = lr.score(x_test, y_test)
r_square

0.7328457213008805

In [69]:
# 회귀식의 기울기
lr.coef_

array([[-0.48198192, -0.04854718, -0.00528897,  0.00345342]])

In [70]:
# 회귀식의 y절편
lr.intercept_

array([46.48805855])

In [71]:
# 모형에 x_test를 입력하여 예측한 값을 실제 값과 비교
y_pred = lr.predict(x_test)

In [72]:
y_pred

array([[31.13614647],
       [28.2514678 ],
       [24.82956385],
       [32.80768422],
       [29.86942136],
       [30.19092101],
       [23.3050195 ],
       [28.84938745],
       [19.84102393],
       [18.27540095],
       [30.5403967 ],
       [21.06750888],
       [30.14919146],
       [23.69532864],
       [25.26822109],
       [25.61148634],
       [31.22789458],
       [16.89567529],
       [29.44065477],
       [21.16102586],
       [11.40176637],
       [27.63138602],
       [26.58426545],
       [ 9.02917835],
       [25.88647409],
       [21.77005068],
       [25.90894967],
       [19.88691281],
       [12.02862149],
       [14.17215907],
       [26.38545009],
       [31.83328032],
       [10.90801078],
       [14.79469951],
       [30.84782418],
       [29.28688502],
       [ 9.28501491],
       [32.92790763],
       [28.15628002],
       [27.9819958 ],
       [29.66194838],
       [22.49799501],
       [24.37193485],
       [13.66035255],
       [26.77541343],
       [17

In [73]:
mse = mean_squared_error(y_test, y_pred)
mse

14.198414336239596

In [74]:
rmse = mean_squared_error(y_test, y_pred, squared = False)
rmse

3.7680783346740014

In [15]:
x_train, x_test, y_train, y_test = train_test_split(df["weight"], df["mpg"], test_size = 0.25, random_state = 4)
x_train = np.reshape(x_train.values, (-1, 1))
x_test = np.reshape(x_test.values, (-1, 1))

In [16]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [17]:
# 계수(coefficient), y절편
print(lr.coef_, lr.intercept_)

[-0.00767065] 46.342664736827174


In [None]:
# 모델 평가
lr.score(x_test, y_test)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df["weight0.5"], df["mpg"], test_size = 0.25, random_state = 4)
x_train = np.reshape(x_train.values, (-1, 1))
x_test = np.reshape(x_test.values, (-1, 1))

In [None]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [None]:
# 계수(coefficient), y절편
print(lr.coef_, lr.intercept_)

In [22]:
# 모델 평가
lr.score(x_test, y_test)

0.7412613823718934

In [23]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight", "weight0.5"]], df["mpg"], test_size = 0.25, random_state = 4)

In [24]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [25]:
# 모델 평가
lr.score(x_test, y_test)

0.7517232326118474

In [26]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight", "displacement"]], df["mpg"], test_size = 0.25, random_state = 4)

In [27]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [28]:
# 계수(coefficient), y절편
print(lr.coef_, lr.intercept_)

[-0.00691973 -0.00647186] 45.35880744363486


In [29]:
# 모델 평가
lr.score(x_test, y_test)

0.7387555323649818

In [30]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight", "displacement", "cylinders"]], df["mpg"], test_size = 0.25, random_state = 4)

In [31]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [32]:
# 계수(coefficient), y절편
print(lr.coef_, lr.intercept_)

[-0.00689718 -0.0042269  -0.15551127] 45.706653363194036


In [33]:
# 모델 평가
lr.score(x_test, y_test)

0.7393455284754016

In [34]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight", "displacement", "cylinders", "model year"]], df["mpg"], test_size = 0.25, random_state = 4)

In [35]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [36]:
# 계수(coefficient), y절편
print(lr.coef_, lr.intercept_)

[-0.00764606  0.01106444 -0.17929904  0.75833395] -12.56770237426257


In [37]:
# 모델 평가
lr.score(x_test, y_test)

0.834758781116283

In [38]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight", "displacement", "cylinders", "model year", "origin"]], df["mpg"], test_size = 0.25, random_state = 4)

In [39]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [40]:
# 계수(coefficient), y절편
print(lr.coef_, lr.intercept_)

[-0.00762245  0.01762451 -0.24053372  0.7757403   1.21337393] -16.772224460672508


In [41]:
# 모델 평가
lr.score(x_test, y_test)

0.8469991928845614

In [42]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight", "displacement", "cylinders", "model year", "origin", "acceleration"]], df["mpg"], test_size = 0.25, random_state = 5)

In [43]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [44]:
# 모델 평가
lr.score(x_test, y_test)

0.7769328777326694

In [45]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight", "displacement", "model year", "origin"]], df["mpg"], test_size = 0.25, random_state = 4)

In [46]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [47]:
# 모델 평가
lr.score(x_test, y_test)

0.8457279943179172

In [48]:
x_train, x_test, y_train, y_test = train_test_split(df[["weight0.5", "weight0.7", "displacement", "model year", "origin"]], df["mpg"], test_size = 0.2, random_state = 4)

In [49]:
# 선형회귀 모델 훈련
lr.fit(x_train, y_train)

In [50]:
# 모델 평가
lr.score(x_test, y_test)

0.8915365361856485