In [11]:
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the Auto MPG CSV from the current working directory into a DataFrame
# and display it as the cell output.
df = pd.read_csv(filepath_or_buffer='auto_mpg_dataset.csv')
df

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...
393,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


In [3]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    int64  
 1   Cylinders     398 non-null    float64
 2   Displacement  398 non-null    object 
 3   Horsepower    398 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    int64  
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    object 
dtypes: float64(3), int64(3), object(2)
memory usage: 25.0+ KB


In [15]:
# Inspect the type of the first value stored in the 'Displacement' column.
# NOTE(review): the recorded output (numpy.float64) together with the
# execution counts (In [15] here, In [14] on the conversion cell below)
# suggests this cell was run after the pd.to_numeric conversion; on a fresh
# top-to-bottom run the column is still object dtype, so the result may differ.
type(df['Displacement'].iat[0])

numpy.float64

In [14]:
# Convert 'Displacement' to a numeric dtype. With errors='coerce', entries
# that cannot be parsed as numbers are turned into NaN instead of raising.
df['Displacement'] = df['Displacement'].pipe(pd.to_numeric, errors='coerce')
# Re-print dtypes and non-null counts to see the effect of the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    int64  
 1   Cylinders     398 non-null    float64
 2   Displacement  392 non-null    float64
 3   Horsepower    398 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    int64  
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [13]:
# Frequency of each distinct 'Model Year' value, ordered by the value itself.
# NOTE(review): the recorded output contains only the values 1, 2 and 3,
# which is unusual for a year column - confirm that the CSV headers line up
# with the data they label.
(
    df['Model Year']
    .value_counts()
    .sort_index()
)

Model Year
1    249
2     70
3     79
Name: count, dtype: int64

In [17]:
# Recode 'Model Year' into a binary flag: 1 -> 0, 2 or 3 -> 1.
#
# The recode overwrites the column in place, so running it a second time would
# map the freshly written 1s to 0 and silently collapse the whole column to 0.
# Guard on the raw codes (2 or 3) still being present so that re-running this
# cell after it has already been applied is a no-op.
# NOTE(review): the guard assumes the raw data contains at least one 2 or 3
# (the value counts recorded above show both); a column holding only 1s
# would be left unchanged.
if df['Model Year'].isin([2, 3]).any():
    df['Model Year'] = df['Model Year'].replace({1: 0, 2: 1, 3: 1})
df

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,8,307.0,130.0,3504.0,12.0,70,0,chevrolet chevelle malibu
1,8,350.0,165.0,3693.0,11.5,70,0,buick skylark 320
2,8,318.0,150.0,3436.0,11.0,70,0,plymouth satellite
3,8,304.0,150.0,3433.0,12.0,70,0,amc rebel sst
4,8,302.0,140.0,3449.0,10.5,70,0,ford torino
...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790.0,15.6,82,0,ford mustang gl
394,4,97.0,52.0,2130.0,24.6,82,1,vw pickup
395,4,135.0,84.0,2295.0,11.6,82,0,dodge rampage
396,4,120.0,79.0,2625.0,18.6,82,0,ford ranger


In [20]:
# Collect the rows whose 'Displacement' value is missing (NaN) and display them.
pred_disp = df.loc[df['Displacement'].isnull(), :]
pred_disp

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
32,4,98.0,,2046.0,19.0,71,0,ford pinto
126,6,200.0,,2875.0,17.0,74,0,ford maverick
330,4,85.0,,1835.0,17.3,80,1,renault lecar deluxe
336,4,140.0,,2905.0,14.3,80,0,ford mustang cobra
354,4,100.0,,2320.0,15.8,81,1,renault 18i
374,4,151.0,,3035.0,20.5,82,0,amc concord dl


In [18]:
# Print the number of missing values in each column, one count per line, in
# column order. The counts are computed once for the whole frame instead of
# re-indexing the frame per column through an f-string, which would raise a
# KeyError for any column whose label is not a string.
for null_count in df.isnull().sum():
    print(null_count)

0
0
6
0
0
0
0
0


In [None]:
"""
일단 Origin의 경우 각 수치형 변수에 영향을 줄 수 없으며, 중복 값도 다수 존재하기에 차원을 쓸데없이 늘려 혼란을 야기할 수 있으므로 다중회귀분석 시 제외함
Model Year의 경우 확인해 볼 것 - Origin이 동일한데 모델 연도가 다른 경우가 존재하는지 확인
 - Origin별로 존재하는 Model Year는 오직 한 개이므로 중요한 영향을 줄 수 없을 것임. 다중회귀분석 시 제외함

Displacement 변수의 NaN 값은 다중회귀분석으로 채워넣고 진행
 - 가장 상관관계가 높은 변수를 찾기
 - Displacement 변수를 Target으로 삼아 예측 진행 후 NaN 값 대체
 - MPG를 Target으로 마지막 다중회귀분석 후 해석
"""


In [22]:
# For every distinct 'Origin' value, count how many different 'Model Year'
# values occur among its rows, then display the resulting Series.
model_year_counts = (
    df['Model Year']
    .groupby(df['Origin'])
    .nunique()
)
model_year_counts

Origin
amc ambassador brougham    1
amc ambassador dpl         1
amc ambassador sst         1
amc concord                1
amc concord d/l            1
                          ..
vw dasher (diesel)         1
vw pickup                  1
vw rabbit                  1
vw rabbit c (diesel)       1
vw rabbit custom           1
Name: Model Year, Length: 305, dtype: int64

In [23]:
# Distinct per-Origin counts; a single value of 1 means no 'Origin' value is
# associated with more than one 'Model Year' value.
pd.unique(model_year_counts)

array([1], dtype=int64)