In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [7]:
# Load the modelling data; the file is decoded with the cp949 code page.
df = pd.read_csv(
    'final_data.csv',
    encoding='cp949',
)

In [19]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [18]:
# Print pandas' summary of the frame before any modelling is done.
df.info()

## RMSE 결과 정리
1. Linear Regression : 0.151
2. Polynomial Features : 에러나서 아직 못 구함
3. Decision Tree : 0.160
4. Random Forest : 0.142
5. Gradient Boost : 0.164
6. XGBoost : 0.163
7. LightGBM : 에러나서 아직 못 구함

※ 1번은 hold-out 테스트셋(30%) 기준 RMSE이고, 3~6번은 5-fold 교차검증 평균 RMSE이므로 수치를 직접 비교할 때는 주의가 필요함.

### 1. Linear Regression 선형회귀

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [103]:
# Predictors are the two columns below; the target is 'val'.
X_data = df[['count', 'nearby_population_200m']]
y_target = df['val']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    random_state=156,
)

In [105]:
from sklearn.metrics import mean_squared_error, r2_score

# Fit a plain linear regression on the training split and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_preds = model.predict(X_test)

# Hold-out metrics: MSE, its square root (RMSE), and R^2.
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_preds)

# Report the fitted parameters followed by the metrics, one label/value pair per print.
for label, value in (
    ('회귀계수\n', model.coef_),
    ('y 절편\n', model.intercept_),
    ('결정계수\n', r2),
    ('rmse\n', rmse),
):
    print(label, value)

회귀계수
 [2.46571572e-06 2.86138191e-05]
y 절편
 -0.008585150826382532
결정계수
 -0.011367262766892372
rmse
 0.151013752205148


### 2. Polynomial Features 다항회귀

In [107]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

In [129]:
# Rebuild the feature matrix and target for the polynomial-regression section.
X_data = df[['count', 'nearby_population_200m']]
y_target = df['val']

# 70/30 split with a fixed seed (156).
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    random_state=156,
)

In [None]:
# Degree-2 polynomial regression on the two predictors.
#
# The model is trained on the polynomial expansion of X_train, so the test
# features must go through the SAME fitted transformer before predict().
# Passing the raw 2-column X_test raised:
#   ValueError: X has 2 features, but LinearRegression is expecting 6 features as input
from sklearn.metrics import mean_squared_error, r2_score

# Keep the fitted transformer so it can be reused on the test split.
poly = PolynomialFeatures(degree = 2)
poly_ftr = poly.fit_transform(X_train)
# transform() only: the expansion is learned from the training data, not refit on test data.
poly_ftr_test = poly.transform(X_test)

model = LinearRegression()
model.fit(poly_ftr, y_train)

y_preds = model.predict(poly_ftr_test)
r2 = r2_score(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

print('회귀 계수\n', model.coef_)
print('y 절편\n', model.intercept_)
print('결정계수\n', r2)
print('rmse\n', rmse)

### 3. Decision Tree. Random Forest. Gradient Boosting. XGBoost. (LightGBM)

In [13]:
def get_model_cv_prediction(model, X_data, y_target, cv = 5):
    """Cross-validate ``model`` and report its average RMSE.

    Runs ``cross_val_score`` with the ``neg_mean_squared_error`` scorer,
    converts each fold's negated MSE into an RMSE, prints the model's class
    name together with the mean RMSE, and returns that mean.

    Parameters
    ----------
    model : estimator passed straight through to ``cross_val_score``.
    X_data : feature data passed straight through to ``cross_val_score``.
    y_target : target data passed straight through to ``cross_val_score``.
    cv : cross-validation setting forwarded to ``cross_val_score``
        (defaults to 5, the value that used to be hard-coded).

    Returns
    -------
    The mean of the per-fold RMSE values, so callers can collect results
    instead of copying them from the printed output.
    """
    neg_mse_scores = cross_val_score(model, X_data, y_target, scoring="neg_mean_squared_error", cv = cv)
    # The scorer yields negated MSE values, so flip the sign before taking the root.
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    # Report the number of folds actually evaluated rather than a hard-coded 5.
    n_folds = len(neg_mse_scores)
    print('##### ',model.__class__.__name__ , ' #####')
    print(' {0} 교차 검증의 평균 RMSE : {1:.3f} '.format(n_folds, avg_rmse))
    return avg_rmse

In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [17]:
# Feature matrix and target for the tree-based models; the cross-validation
# helper is called with X_data / y_target.
X_data = df[['count', 'nearby_population_200m']]
y_target = df['val']

# 70/30 hold-out split with a fixed seed (156).
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    random_state=156,
)

In [15]:
# Candidate regressors to compare under the same cross-validation helper.
dt_reg = DecisionTreeRegressor(max_depth=4, random_state=0)
rf_reg = RandomForestRegressor(n_estimators=1000, random_state=0)
gb_reg = GradientBoostingRegressor(n_estimators=1000, random_state=0)
xgb_reg = XGBRegressor(n_estimators=1000)
# lgb_reg = LGBMRegressor(n_estimators = 1000)  # left out for now because it raised an error

# Score every candidate on the full feature matrix and target.
models = [dt_reg, rf_reg, gb_reg, xgb_reg]
for model in models:
    get_model_cv_prediction(model, X_data, y_target)

#####  DecisionTreeRegressor  #####
 5 교차 검증의 평균 RMSE : 0.160 
#####  RandomForestRegressor  #####
 5 교차 검증의 평균 RMSE : 0.142 
#####  GradientBoostingRegressor  #####
 5 교차 검증의 평균 RMSE : 0.164 
#####  XGBRegressor  #####
 5 교차 검증의 평균 RMSE : 0.163 


--------

### ?. AutoML : OSS 방식
- https://data-minggeul.tistory.com/12

In [1]:
import autosklearn.classification

ModuleNotFoundError: No module named 'autosklearn'

In [None]:
# Every other model in this notebook treats 'val' as a regression target scored
# by RMSE, so use auto-sklearn's regressor instead of its classifier.
# NOTE(review): requires the auto-sklearn package; the import cell above
# currently fails with ModuleNotFoundError until it is installed.
import autosklearn.regression

clf = autosklearn.regression.AutoSklearnRegressor()
clf.fit(X_train, y_train)
results = clf.predict(X_test)

### 번외. Random Forest

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

In [11]:
# Feature matrix and target for the standalone random-forest run.
X_data = df[['count', 'nearby_population_200m']]
y_target = df['val']

# 70/30 hold-out split with a fixed seed (156).
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    random_state=156,
)

In [12]:
# 5-fold cross-validation of a 1000-tree random forest on X_data / y_target.
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
neg_mse_scores = cross_val_score(
    rf,
    X_data,
    y_target,
    cv=5,
    scoring="neg_mean_squared_error",
)

# The scorer returns negated MSE per fold; flip the sign, then take the root.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print(' 5 교차 검증의 개별 Negative MSE scores: ', np.round(neg_mse_scores, 2))
print(' 5 교차 검증의 개별 RMSE scores : ', np.round(rmse_scores, 2))
print(' 5 교차 검증의 평균 RMSE : {0:.3f} '.format(avg_rmse))

 5 교차 검증의 개별 Negative MSE scores:  [-0.04 -0.02 -0.01 -0.02 -0.02]
 5 교차 검증의 개별 RMSE scores :  [0.2  0.15 0.08 0.15 0.13]
 5 교차 검증의 평균 RMSE : 0.142 
