In [260]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate
from lightgbm import LGBMClassifier

# 데이터 프레임

In [261]:
# Load the emissions dataset from the relative path 'CO2_Emissions.csv'.
co2=pd.read_csv('CO2_Emissions.csv')

In [262]:
# Preview the first five rows of the raw frame.
co2.head(5)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [263]:
# Print the column names, non-null counts and dtypes.
co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Model                             7385 non-null   object 
 2   Vehicle Class                     7385 non-null   object 
 3   Engine Size(L)                    7385 non-null   float64
 4   Cylinders                         7385 non-null   int64  
 5   Transmission                      7385 non-null   object 
 6   Fuel Type                         7385 non-null   object 
 7   Fuel Consumption City (L/100 km)  7385 non-null   float64
 8   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64
 9   Fuel Consumption Comb (L/100 km)  7385 non-null   float64
 10  Fuel Consumption Comb (mpg)       7385 non-null   int64  
 11  CO2 Emissions(g/km)               7385 non-null   int64  
dtypes: flo

## 데이터프레임 전처리 (컬럼명 공백을 '_'로 치환; 특수문자 제거는 이 노트북에서 수행하지 않음)

In [264]:
# Replace every space in the column labels with an underscore
# (other characters such as '(' and '/' are left as they are).
co2.columns = [column_label.replace(' ', '_') for column_label in co2.columns]

In [265]:
# Preview the first five rows again to check the renamed columns.
co2.head(5)

Unnamed: 0,Make,Model,Vehicle_Class,Engine_Size(L),Cylinders,Transmission,Fuel_Type,Fuel_Consumption_City_(L/100_km),Fuel_Consumption_Hwy_(L/100_km),Fuel_Consumption_Comb_(L/100_km),Fuel_Consumption_Comb_(mpg),CO2_Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [266]:
# Print the column summary again after the rename.
co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Model                             7385 non-null   object 
 2   Vehicle_Class                     7385 non-null   object 
 3   Engine_Size(L)                    7385 non-null   float64
 4   Cylinders                         7385 non-null   int64  
 5   Transmission                      7385 non-null   object 
 6   Fuel_Type                         7385 non-null   object 
 7   Fuel_Consumption_City_(L/100_km)  7385 non-null   float64
 8   Fuel_Consumption_Hwy_(L/100_km)   7385 non-null   float64
 9   Fuel_Consumption_Comb_(L/100_km)  7385 non-null   float64
 10  Fuel_Consumption_Comb_(mpg)       7385 non-null   int64  
 11  CO2_Emissions(g/km)               7385 non-null   int64  
dtypes: flo

# 데이터분석

## feature, label 설정

In [267]:
# Name of the target column; list the distinct values it takes.
label_name = 'Fuel_Type'
co2[label_name].unique()

array(['Z', 'D', 'X', 'E', 'N'], dtype=object)

* 분류 값 정의
    * D: 디젤
    * E: 에탄올(E85)
    * N: 천연가스
    * X: 일반 휘발유
    * Z: 고급 휘발유

In [268]:
# Every column except the target is a feature. Index.difference returns the
# remaining labels in sorted order, so the feature columns come out sorted.
features = co2.loc[:, co2.columns.difference([label_name])]
# The target column on its own.
labels = co2.loc[:, label_name]

## 사이킷런으로 train,test

In [269]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle so the split is repeatable.
train_input, test_input, train_target, test_target = train_test_split(features, labels, test_size=0.2, random_state=42)

In [270]:
# Preview the feature frame (target column excluded).
features.head()

Unnamed: 0,CO2_Emissions(g/km),Cylinders,Engine_Size(L),Fuel_Consumption_City_(L/100_km),Fuel_Consumption_Comb_(L/100_km),Fuel_Consumption_Comb_(mpg),Fuel_Consumption_Hwy_(L/100_km),Make,Model,Transmission,Vehicle_Class
0,196,4,2.0,9.9,8.5,33,6.7,ACURA,ILX,AS5,COMPACT
1,221,4,2.4,11.2,9.6,29,7.7,ACURA,ILX,M6,COMPACT
2,136,4,1.5,6.0,5.9,48,5.8,ACURA,ILX HYBRID,AV7,COMPACT
3,255,6,3.5,12.7,11.1,25,9.1,ACURA,MDX 4WD,AS6,SUV - SMALL
4,244,6,3.5,12.1,10.6,27,8.7,ACURA,RDX AWD,AS6,SUV - SMALL


In [271]:
# Preview the target values.
labels.head()

0    Z
1    Z
2    Z
3    Z
4    Z
Name: Fuel_Type, dtype: object

## LightGBM 사용

## 하이퍼파라메터 튜닝

In [272]:
from scipy.stats import uniform, randint

In [273]:
# Search space sampled by the randomized hyper-parameter search.
# scipy's randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
# subsample and colsample_bytree are intentionally not searched here.
params = dict(
    n_estimators=randint(50, 100),
    learning_rate=uniform(0.001, 0.3),
    max_depth=randint(3, 20),
    min_child_samples=randint(10, 100),
    reg_alpha=uniform(0.0, 1.0),
    reg_lambda=uniform(0.0, 1.0),
    num_leaves=randint(5, 50),
)

* 텍스트 데이터 자동 형변환
    * LightGBM은 object 타입 컬럼을 그대로 학습에 사용할 수 없으므로, 텍스트 컬럼을 category 타입으로 변환해 범주형 특성으로 자동 처리되게 한다.

In [274]:
from sklearn.model_selection import RandomizedSearchCV
# Collect the names of every column in co2 whose dtype is object (the text columns).
obj_categorical_features = co2.select_dtypes(include=['object']).columns.tolist()

In [275]:
# Display the collected text-column names.
obj_categorical_features

['Make', 'Model', 'Vehicle_Class', 'Transmission', 'Fuel_Type']

In [276]:
# Cast every text feature (all object columns except the target) to the
# pandas 'category' dtype in both the training and the test frame.
for frame in (train_input, test_input):
    for col in obj_categorical_features:
        if col == label_name:
            continue
        frame[col] = frame[col].astype('category')

In [277]:
# A single estimator object drives the search. Columns with pandas 'category'
# dtype are picked up as categorical features at fit time
# (categorical_feature='auto' is the default of LGBMClassifier.fit), so the
# option is not passed to the constructor.
lgbm = LGBMClassifier(random_state=42)
# n_iter=100: try 100 hyper-parameter combinations sampled from `params`.
# n_jobs=-1: run the candidate fits on all available cores.
gs = RandomizedSearchCV(lgbm, params,
                        n_iter=100, n_jobs=-1, random_state=42)
gs.fit(train_input, train_target)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1735
[LightGBM] [Info] Number of data points in the train set: 5908, number of used features: 11
[LightGBM] [Info] Start training from score -3.801261
[LightGBM] [Info] Start training from score -3.007309
[LightGBM] [Info] Start training from score -0.705066
[LightGBM] [Info] Start training from score -0.834349


In [278]:
# Show the hyper-parameter combination selected by the search.
print(gs.best_params_)

{'learning_rate': 0.20856855930780796, 'max_depth': 18, 'min_child_samples': 70, 'n_estimators': 90, 'num_leaves': 40, 'reg_alpha': 0.7121792213475359, 'reg_lambda': 0.23724908749680007}


In [279]:
# Mean cross-validated score of the selected candidate. best_score_ always
# belongs to best_params_, whereas np.max over mean_test_score would turn
# into NaN if any candidate failed to fit.
print(gs.best_score_)

0.9861209133330755


* 최종 서비스 모델

In [280]:
# Keep the selected hyper-parameters for building the final model.
best_params = gs.best_params_

In [281]:
# Build the final model from the tuned hyper-parameters, coercing the sampled
# values to plain Python int / float before handing them to LightGBM.
service_model = LGBMClassifier(
    random_state=42,
    **{name: int(best_params[name])
       for name in ('n_estimators', 'max_depth', 'min_child_samples', 'num_leaves')},
    **{name: float(best_params[name])
       for name in ('learning_rate', 'reg_alpha', 'reg_lambda')},
)

In [282]:
# Train the final model on the training split.
service_model.fit(train_input, train_target)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1735
[LightGBM] [Info] Number of data points in the train set: 5908, number of used features: 11
[LightGBM] [Info] Start training from score -3.801261
[LightGBM] [Info] Start training from score -3.007309
[LightGBM] [Info] Start training from score -0.705066
[LightGBM] [Info] Start training from score -0.834349


In [283]:
# Score of the final model on the training split.
service_model.score(train_input,train_target)

0.9991536899119837

In [284]:
# Score of the final model on the held-out test split.
service_model.score(test_input,test_target)

0.982396750169262

In [285]:
# The data partitions used inside the random search differ from the split made by train_test_split, so the two scores can differ.

# 서비스 모델 개발

In [287]:
# Read the feature values from the user.
def collect_user_input():
    """Prompt on the console for one value per feature column of ``co2``.

    The features are every column of ``co2`` except ``label_name``, in the
    sorted order returned by ``Index.difference``. Each prompt is tagged as
    numeric or categorical based on the dtype of that column in ``co2``, so
    the tag does not depend on any other notebook variable.

    Returns:
        dict: feature name -> the raw string typed by the user. No type
        conversion is done here; the caller must cast the values.
    """
    user_input = {}
    print("연료 유형을 예측하기 위한 차량의 특성을 입력해주세요.")
    for feature in co2.columns.difference([label_name]):
        # Decide the tag from the column's own dtype instead of a shared list
        # that a later cell rebinds to different column names.
        if pd.api.types.is_numeric_dtype(co2[feature]):
            kind = "수치형"
        else:
            kind = "범주형"
        user_input[feature] = input(f"{feature} ({kind}): ")
    return user_input

# Interactively collect one raw (string) value per feature.
input_features = collect_user_input()

연료 유형을 예측하기 위한 차량의 특성을 입력해주세요.


CO2_Emissions(g/km) (수치형):  196
Cylinders (수치형):  4
Engine_Size(L) (수치형):  2
Fuel_Consumption_City_(L/100_km) (수치형):  9.9
Fuel_Consumption_Comb_(L/100_km) (수치형):  8.5
Fuel_Consumption_Comb_(mpg) (수치형):  33
Fuel_Consumption_Hwy_(L/100_km) (수치형):  6.7
Make (범주형):  ACURA
Model (범주형):  ILX
Transmission (범주형):  AS5
Vehicle_Class (범주형):  COMPACT


* 사용자 입력은 HTML 또는 Python 콘솔 중 어디에서 입력하든 모두 문자열 타입이기 때문에 각 feature별 형변환이 필요하다.

In [288]:
# Display the collected values; all of them are still strings.
input_features

{'CO2_Emissions(g/km)': '196',
 'Cylinders': '4',
 'Engine_Size(L)': '2',
 'Fuel_Consumption_City_(L/100_km)': '9.9',
 'Fuel_Consumption_Comb_(L/100_km)': '8.5',
 'Fuel_Consumption_Comb_(mpg)': '33',
 'Fuel_Consumption_Hwy_(L/100_km)': '6.7',
 'Make': 'ACURA',
 'Model': 'ILX',
 'Transmission': 'AS5',
 'Vehicle_Class': 'COMPACT'}

In [289]:
# Wrap each raw value in a one-element list so that every key maps to a
# single-row column when the dict is turned into a DataFrame.
data = {name: [input_features[name]] for name in input_features}
data

{'CO2_Emissions(g/km)': ['196'],
 'Cylinders': ['4'],
 'Engine_Size(L)': ['2'],
 'Fuel_Consumption_City_(L/100_km)': ['9.9'],
 'Fuel_Consumption_Comb_(L/100_km)': ['8.5'],
 'Fuel_Consumption_Comb_(mpg)': ['33'],
 'Fuel_Consumption_Hwy_(L/100_km)': ['6.7'],
 'Make': ['ACURA'],
 'Model': ['ILX'],
 'Transmission': ['AS5'],
 'Vehicle_Class': ['COMPACT']}

In [290]:
# Build a one-row DataFrame from the wrapped user input and display it.
input_df = pd.DataFrame(data)
input_df

Unnamed: 0,CO2_Emissions(g/km),Cylinders,Engine_Size(L),Fuel_Consumption_City_(L/100_km),Fuel_Consumption_Comb_(L/100_km),Fuel_Consumption_Comb_(mpg),Fuel_Consumption_Hwy_(L/100_km),Make,Model,Transmission,Vehicle_Class
0,196,4,2,9.9,8.5,33,6.7,ACURA,ILX,AS5,COMPACT


In [291]:
# Check the dtypes of the freshly built frame (every column is object at this point).
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   CO2_Emissions(g/km)               1 non-null      object
 1   Cylinders                         1 non-null      object
 2   Engine_Size(L)                    1 non-null      object
 3   Fuel_Consumption_City_(L/100_km)  1 non-null      object
 4   Fuel_Consumption_Comb_(L/100_km)  1 non-null      object
 5   Fuel_Consumption_Comb_(mpg)       1 non-null      object
 6   Fuel_Consumption_Hwy_(L/100_km)   1 non-null      object
 7   Make                              1 non-null      object
 8   Model                             1 non-null      object
 9   Transmission                      1 non-null      object
 10  Vehicle_Class                     1 non-null      object
dtypes: object(11)
memory usage: 220.0+ bytes


* 입력값 형변환

In [292]:
# Cast each text feature to the exact categorical dtype used by the training
# frame, so the user's row shares the training category set; a value that was
# never seen in training becomes NaN. The column list is taken from
# train_input itself, so this cell does not depend on obj_categorical_features
# (which a later cell rebinds) and is safe to re-run.
for col in train_input.select_dtypes(include=['category']).columns:
    input_df[col] = input_df[col].astype(train_input[col].dtype)

In [293]:
# Confirm that the text features are now category-typed.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   CO2_Emissions(g/km)               1 non-null      object  
 1   Cylinders                         1 non-null      object  
 2   Engine_Size(L)                    1 non-null      object  
 3   Fuel_Consumption_City_(L/100_km)  1 non-null      object  
 4   Fuel_Consumption_Comb_(L/100_km)  1 non-null      object  
 5   Fuel_Consumption_Comb_(mpg)       1 non-null      object  
 6   Fuel_Consumption_Hwy_(L/100_km)   1 non-null      object  
 7   Make                              1 non-null      category
 8   Model                             1 non-null      category
 9   Transmission                      1 non-null      category
 10  Vehicle_Class                     1 non-null      category
dtypes: category(4), object(7)
memory usage: 656.0+ bytes


* int64변환

In [294]:
# The integer-valued features arrive as strings; cast each one to int64.
for int_col in ('CO2_Emissions(g/km)', 'Cylinders', 'Fuel_Consumption_Comb_(mpg)'):
    input_df[int_col] = input_df[int_col].astype('int64')

In [295]:
# Confirm the int64 casts.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   CO2_Emissions(g/km)               1 non-null      int64   
 1   Cylinders                         1 non-null      int64   
 2   Engine_Size(L)                    1 non-null      object  
 3   Fuel_Consumption_City_(L/100_km)  1 non-null      object  
 4   Fuel_Consumption_Comb_(L/100_km)  1 non-null      object  
 5   Fuel_Consumption_Comb_(mpg)       1 non-null      int64   
 6   Fuel_Consumption_Hwy_(L/100_km)   1 non-null      object  
 7   Make                              1 non-null      category
 8   Model                             1 non-null      category
 9   Transmission                      1 non-null      category
 10  Vehicle_Class                     1 non-null      category
dtypes: category(4), int64(3), object(4)
memory usage: 656.0+ bytes

* 나머지 타입(float64) 변환

In [296]:
# The columns of input_df that are still object-typed at this point; the next cell casts them to float64.
# NOTE(review): this rebinds obj_categorical_features, which earlier cells use as
# the list of text columns of co2. Re-running those earlier cells after this one
# would pick up these column names instead.
obj_categorical_features = input_df.select_dtypes(include=['object']).columns.tolist()
obj_categorical_features

['Engine_Size(L)',
 'Fuel_Consumption_City_(L/100_km)',
 'Fuel_Consumption_Comb_(L/100_km)',
 'Fuel_Consumption_Hwy_(L/100_km)']

In [297]:
# Cast the remaining string-typed inputs to float64 (one cast per column;
# the original repeated the same statement twice).
for col in obj_categorical_features:
    input_df[col] = input_df[col].astype('float64')

In [298]:
# Final dtype check before the frame is passed to the model.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   CO2_Emissions(g/km)               1 non-null      int64   
 1   Cylinders                         1 non-null      int64   
 2   Engine_Size(L)                    1 non-null      float64 
 3   Fuel_Consumption_City_(L/100_km)  1 non-null      float64 
 4   Fuel_Consumption_Comb_(L/100_km)  1 non-null      float64 
 5   Fuel_Consumption_Comb_(mpg)       1 non-null      int64   
 6   Fuel_Consumption_Hwy_(L/100_km)   1 non-null      float64 
 7   Make                              1 non-null      category
 8   Model                             1 non-null      category
 9   Transmission                      1 non-null      category
 10  Vehicle_Class                     1 non-null      category
dtypes: category(4), float64(4), int64(3)
memory usage: 656.0 bytes

In [299]:
# Run the prediction on the single-row frame and print the predicted label.
prediction = service_model.predict(input_df)
print(f"\n예측된 연료 유형: {prediction[0]}")


예측된 연료 유형: Z
