# Motor Trend Car Road Tests

### The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).

![image.png](../Images/Motor.png)

해당 데이터는 1974년 Motor Trend US Magazine에서 추출된 데이터이며, 32가지의 자동차의 연료 소비를 포함한 10가지 정도의 데이터로 이루어져 있다.<br>
'mtcars'라는 데이터로 알려져 있으며 원본 데이터 세트는 seaborn 패키지에서 다운로드할 수 있다.<br><br>
주어진 학습용 데이터(mpg_X_train.csv, mpg_X_test.csv)를 활용하여 해당 자동차 모델이 미국에서 만든 것인지 예측 모형을 만든 후,<br> 이를 평가용 데이터(mpg_X_test.csv)에 적용하여 얻은 국가 예측값(미국일 확률을 1로 한다)을 .csv 파일로 저장한다.

### Library & Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
X_test = pd.read_csv('../Datasets/mpg_X_test.csv')
X_train = pd.read_csv('../Datasets/mpg_X_train.csv')
y_train = pd.read_csv('../Datasets/mpg_y_train.csv')

### 1. 데이터 탐색

In [3]:
X_test

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,maxda glc deluxe,34.1,4,86.0,65.0,1975,15.2,79
1,plymouth sapporo,23.2,4,156.0,105.0,2745,16.7,78
2,dodge coronet brougham,16.0,8,318.0,150.0,4190,13.0,76
3,amc concord dl 6,20.2,6,232.0,90.0,3265,18.2,79
4,fiat strada custom,37.3,4,91.0,69.0,2130,14.7,79
...,...,...,...,...,...,...,...,...
115,datsun b210 gx,39.4,4,85.0,70.0,2070,18.6,78
116,amc hornet,18.0,6,232.0,100.0,2945,16.0,73
117,amc matador,16.0,6,258.0,110.0,3632,18.0,74
118,chevy c10,13.0,8,350.0,145.0,4055,12.0,76


In [4]:
X_train

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,pontiac j2000 se hatchback,31.0,4,112.0,85.0,2575,16.2,82
1,pontiac safari (sw),13.0,8,400.0,175.0,5140,12.0,71
2,mazda glc custom l,37.0,4,91.0,68.0,2025,18.2,82
3,oldsmobile vista cruiser,12.0,8,350.0,180.0,4499,12.5,73
4,peugeot 504,19.0,4,120.0,88.0,3270,21.9,76
...,...,...,...,...,...,...,...,...
273,honda civic cvcc,36.1,4,91.0,60.0,1800,16.4,78
274,subaru dl,30.0,4,97.0,67.0,1985,16.4,77
275,dodge colt m/m,33.5,4,98.0,83.0,2075,15.9,77
276,ford pinto,18.0,6,171.0,97.0,2984,14.5,75


In [5]:
y_train

Unnamed: 0,isUSA
0,1
1,1
2,0
3,1
4,0
...,...
273,0
274,0
275,1
276,1


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          278 non-null    object 
 1   mpg           278 non-null    float64
 2   cylinders     278 non-null    int64  
 3   displacement  278 non-null    float64
 4   horsepower    274 non-null    float64
 5   weight        278 non-null    int64  
 6   acceleration  278 non-null    float64
 7   model_year    278 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 17.5+ KB


### 2. 데이터 전처리

In [7]:
from sklearn.impute import SimpleImputer

# 결측치 처리
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train[['horsepower']] = imputer.fit_transform( X_train[['horsepower']] )
X_test[['horsepower']] = imputer.fit_transform( X_test[['horsepower']] )

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          278 non-null    object 
 1   mpg           278 non-null    float64
 2   cylinders     278 non-null    int64  
 3   displacement  278 non-null    float64
 4   horsepower    278 non-null    float64
 5   weight        278 non-null    int64  
 6   acceleration  278 non-null    float64
 7   model_year    278 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 17.5+ KB


In [9]:
X_train.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,278.0,278.0,278.0,278.0,278.0,278.0,278.0
mean,23.732734,5.374101,189.994604,103.383212,2948.464029,15.580216,76.057554
std,7.647295,1.677084,105.471423,38.695458,862.949746,2.745907,3.605591
min,10.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,18.0,4.0,98.0,75.0,2206.25,14.0,73.0
50%,23.0,4.0,140.5,90.5,2737.5,15.5,76.0
75%,29.0,6.0,258.0,118.75,3560.0,17.0,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [10]:
COL_DEL = ['name']
COL_NUM = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
COL_CAT = []
COL_Y = ['isUSA']

X_train = X_train.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]

In [11]:
X_train

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,31.0,4,112.0,85.0,2575,16.2,82
1,13.0,8,400.0,175.0,5140,12.0,71
2,37.0,4,91.0,68.0,2025,18.2,82
3,12.0,8,350.0,180.0,4499,12.5,73
4,19.0,4,120.0,88.0,3270,21.9,76
...,...,...,...,...,...,...,...
273,36.1,4,91.0,60.0,1800,16.4,78
274,30.0,4,97.0,67.0,1985,16.4,77
275,33.5,4,98.0,83.0,2075,15.9,77
276,18.0,6,171.0,97.0,2984,14.5,75


In [12]:
X_test

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,34.1,4,86.0,65.0,1975,15.2,79
1,23.2,4,156.0,105.0,2745,16.7,78
2,16.0,8,318.0,150.0,4190,13.0,76
3,20.2,6,232.0,90.0,3265,18.2,79
4,37.3,4,91.0,69.0,2130,14.7,79
...,...,...,...,...,...,...,...
115,39.4,4,85.0,70.0,2070,18.6,78
116,18.0,6,232.0,100.0,2945,16.0,73
117,16.0,6,258.0,110.0,3632,18.0,74
118,13.0,8,350.0,145.0,4055,12.0,76


### 4. 데이터 모형 구축

#### (1) 데이터 분할

In [13]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3)

#### (2) 스케일링

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_tr[COL_NUM])

X_tr[COL_NUM]=scaler.transform(X_tr[COL_NUM])
X_val[COL_NUM]=scaler.transform(X_val[COL_NUM])
X_test[COL_NUM]=scaler.transform(X_test[COL_NUM])

### 5. 모델링

In [15]:
from sklearn.neighbors import KNeighborsClassifier

modelKNN = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
modelKNN.fit(X_tr, y_tr.values.ravel())

from sklearn.tree import DecisionTreeClassifier

modelDT = DecisionTreeClassifier(max_depth=10)
modelDT.fit(X_tr, y_tr)

DecisionTreeClassifier(max_depth=10)

### 6. 데이터 모형 평가

In [16]:
y_val_pred = modelKNN.predict(X_val)

y_val_pred_probaKNN = modelKNN.predict_proba(X_val)
y_val_pred_probaDT = modelDT.predict_proba(X_val)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [17]:
from sklearn.metrics import roc_auc_score

scoreKNN = roc_auc_score(y_val, y_val_pred_probaKNN[:, 1])
scoreDT = roc_auc_score(y_val, y_val_pred_probaDT[:, 1])

print( scoreKNN, scoreDT )

0.8709876543209877 0.8925925925925926


In [18]:
best_model = None
best_score = 0

for i in range(2, 10):
    model = KNeighborsClassifier(n_neighbors=i, metric='euclidean')
    model.fit(X_tr, y_tr.values.ravel())
    y_val_pred_proba = model.predict_proba(X_val)
    score = roc_auc_score(y_val, y_val_pred_proba[:, 1])
    print(i,"개의 이웃 확인 : ", score)
    if best_score <= score:
        best_model = model

2 개의 이웃 확인 :  0.887037037037037
3 개의 이웃 확인 :  0.8598765432098765
4 개의 이웃 확인 :  0.8620370370370372
5 개의 이웃 확인 :  0.8709876543209877
6 개의 이웃 확인 :  0.8820987654320989
7 개의 이웃 확인 :  0.8907407407407407
8 개의 이웃 확인 :  0.890432098765432
9 개의 이웃 확인 :  0.8993827160493828


In [19]:
print(best_model.predict_proba(X_test))

[[0.55555556 0.44444444]
 [0.33333333 0.66666667]
 [0.         1.        ]
 [0.         1.        ]
 [0.55555556 0.44444444]
 [0.88888889 0.11111111]
 [0.55555556 0.44444444]
 [0.66666667 0.33333333]
 [0.55555556 0.44444444]
 [0.55555556 0.44444444]
 [0.         1.        ]
 [0.44444444 0.55555556]
 [0.         1.        ]
 [0.55555556 0.44444444]
 [1.         0.        ]
 [0.77777778 0.22222222]
 [0.         1.        ]
 [0.         1.        ]
 [0.77777778 0.22222222]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.44444444 0.55555556]
 [0.66666667 0.33333333]
 [0.         1.        ]
 [0.         1.        ]
 [0.66666667 0.33333333]
 [0.77777778 0.22222222]
 [1.         0.        ]
 [0.44444444 0.55555556]
 [0.33333333 0.66666667]
 [0.         1.        ]
 [0.44444444 0.55555556]
 [0.44444444 0.55555556]
 [0.         1.        ]
 [0.77777778 0.22222222]
 [0.33333333 0.66666667]


In [20]:
pred = best_model.predict_proba(X_test)[:,1]

print(pred)

[0.44444444 0.66666667 1.         1.         0.44444444 0.11111111
 0.44444444 0.33333333 0.44444444 0.44444444 1.         0.55555556
 1.         0.44444444 0.         0.22222222 1.         1.
 0.22222222 1.         1.         1.         1.         1.
 1.         0.55555556 0.33333333 1.         1.         0.33333333
 0.22222222 0.         0.55555556 0.66666667 1.         0.55555556
 0.55555556 1.         0.22222222 0.66666667 1.         0.11111111
 0.11111111 0.55555556 0.33333333 0.44444444 0.44444444 1.
 0.22222222 1.         0.11111111 0.22222222 1.         0.22222222
 1.         1.         1.         0.77777778 1.         0.33333333
 0.88888889 0.66666667 1.         1.         0.66666667 1.
 1.         0.22222222 0.66666667 0.22222222 0.88888889 1.
 0.44444444 0.11111111 1.         0.22222222 1.         0.33333333
 0.22222222 1.         1.         1.         0.22222222 1.
 1.         0.88888889 0.         0.         0.33333333 1.
 1.         1.         1.         1.         1.    

### 7. 결과 저장

In [21]:
pd.DataFrame({'isUSA': pred}).to_csv('./result.csv', index=False)