# Topic : Lecture 3 Multi-Linear regression
<img src="https://www.tribloom.com/wp-content/uploads/2019/08/CRISP-DM_Process_Diagram-768x769.png" height=300>

Follow the CRSIP-DM method
1. Step 1: Import library, import data
2. Step 2: Pre-processing (missing data, categorical type, normalization, format transform)
3. Step 3: Build ML Model
4. Step 4: Evaluate Model
5. Step 5: Deploy (Prediction)


## Step 1: Load data (also import library)

In [19]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import data
data=pd.read_csv("50_Startups.csv")
print(data.head()) # show first 5 items
print(type(data))
print(data.info())




   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB
None


# Step 2: Pre-process X, Y
* 資料型別的 transformation) format transform (轉換成numpy format)
* (missing data=> imputation, 
* normalization
* data type 例如 categorical data onehot encoding, Label_Encoding, padas 套件有一個簡單的 get_dummies 的API



In [20]:
#1. (missing data=> imputation,
# no missing data

#2. normalization
# Skip sklearn linear model 會幫我們做

#3. nominal data 的轉換 ==> pandas 下面 pd.getdummies
#==> X 5個特徵 1Y

X=data.iloc[:,:-1]
Y=data.iloc[:,-1]

print('--------------------------------Before ont hot encode--------------------------------')
print(X[:5],'\n')


################ one hot encode ################
X=pd.get_dummies(X)

print('--------------------------------After ont hot encode--------------------------------')
print(X[:5],'\n')

################ Reshape X ################
X=X.values.reshape(-1,6)

print('--------------------------------After reshape--------------------------------')
print(X[:5,3:],'\n')



print('--------------------------------Before transformate Y--------------------------------')
print(type(Y),'\n')


#4. 資料型別的 transformation) format transform (轉換成numpy format)
Y=Y.values.reshape(-1,1)

print('--------------------------------After transformate Y--------------------------------')
print(type(Y),'\n')


#去除X最後一列
X=X[:,:-1] 

print('--------------------------------Before Random Split Date to Test&Train--------------------------------')
print(type(X),X.shape)
print(type(Y),Y.shape)
print('\n')

from sklearn.model_selection import train_test_split
#使用 sklearn.model_selection.train_test_spli 隨機切分訓練資料與測試資料
#test_size 設定所有資料百分比為測試資料
#random_state 隨機數種子：其實就是該組隨機數的編號，在需要重複試驗的時候，保證得到一組一樣的隨機數。比如你每次都填1，其他引數一樣的情況下你得到的隨機陣列是一樣的。但填0或不填，每次都會不一樣。
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
print('--------------------------------After Random Split Date to Test&Train--------------------------------')
print(type(X_train),X_train.shape)
print(type(Y_train),Y_train.shape)

--------------------------------Before ont hot encode--------------------------------
   R&D Spend  Administration  Marketing Spend       State
0  165349.20       136897.80        471784.10    New York
1  162597.70       151377.59        443898.53  California
2  153441.51       101145.55        407934.54     Florida
3  144372.41       118671.85        383199.62    New York
4  142107.34        91391.77        366168.42     Florida 

--------------------------------After ont hot encode--------------------------------
   R&D Spend  Administration  Marketing Spend  State_California  \
0  165349.20       136897.80        471784.10                 0   
1  162597.70       151377.59        443898.53                 1   
2  153441.51       101145.55        407934.54                 0   
3  144372.41       118671.85        383199.62                 0   
4  142107.34        91391.77        366168.42                 0   

   State_Florida  State_New York  
0              0               1  
1     

# 將值為0 替換為平均數


In [21]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=0, strategy='mean')
imputer.fit(X[:, 0:3])
X[:, 0:3] = imputer.transform(X[:, 0:3] )

# Step 3: Build Model for training

In [22]:
from sklearn.linear_model import LinearRegression as LR
model=LR()  # constructor
#all in policy
model.fit(X_train,Y_train) # training==> find a* and b*

print(model) 
print("a*=",model.coef_,"b*=",model.intercept_, )


LinearRegression()
a*= [[ 7.73467193e-01  3.28845975e-02  3.66100259e-02 -6.99369053e+02
  -1.65865321e+03]] b*= [43253.53667068]


# Step 4: Evalute Model

In [23]:
from sklearn.metrics import r2_score as R2
from sklearn.metrics import mean_squared_error as MSE
yPre=model.predict(X_train)
print("MSE=",MSE(Y_train,yPre))
print("R2=",R2(Y_train,yPre))


#backward selection 
#檢查p-values select 重要的特徵
import statsmodels.api as sm
X_train = np.append(arr = np.ones((40, 1)).astype(int), values = X_train, axis = 1)
X_opt = X_train [:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("===================================================")
print('0-5', regressor_OLS.summary())
print("====================================================")


X_opt = X_train [:, [0, 1, 2, 3, 5]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('01235',regressor_OLS.summary())
print("====================================================")

X_opt = X_train [:, [0, 1, 2, 3]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('0123',regressor_OLS.summary())
print("====================================================")


X_opt = X_train [:, [0, 1, 3]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('013',regressor_OLS.summary())
print("====================================================")


X_opt = X_train [:, [0, 1]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('01',regressor_OLS.summary())
print("====================================================")




MSE= 81571001.8007737
R2= 0.9501847627493607
0-5                             OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.943
Method:                 Least Squares   F-statistic:                     129.7
Date:                Wed, 12 Oct 2022   Prob (F-statistic):           3.91e-21
Time:                        09:00:59   Log-Likelihood:                -421.10
No. Observations:                  40   AIC:                             854.2
Df Residuals:                      34   BIC:                             864.3
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
con

# Step 5: Deploy Model to predict new value

In [24]:
data.to_csv("result.csv", index=False, mode='w')

# Cross-Validted

In [25]:
from sklearn.model_selection import cross_val_score
model_cv = cross_val_score(LR(), X_train, Y_train, cv=5)
print(model_cv)
print(model_cv.mean(), model_cv.std())

[0.64120447 0.88536669 0.90980771 0.8961349  0.97415038]
0.8613328292604049 0.11430041180785552


# K-fold

In [26]:
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
    reg = Lasso(alpha=0.1) # 其中可以調整 alpha 值決定正則化的強度
    reg.fit(X_train, Y_train)
    print(reg.coef_)

[ 3.33223179e-01  1.22371766e-01  2.82795551e-02 -1.19597952e+04
  1.96620349e+03]
[ 7.76156199e-01 -8.44929316e-02  4.49765846e-02  6.40492338e+03
  1.72039927e+03]


# Lasso Fit

In [27]:
from sklearn.linear_model import Lasso
reg = Lasso(alpha=0.1) # 其中可以調整 alpha 值決定正則化的強度
reg.fit(X_train, Y_train)
print(reg.coef_)

[ 7.76156199e-01 -8.44929316e-02  4.49765846e-02  6.40492338e+03
  1.72039927e+03]


#Optuna

In [28]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm
import optuna

def objective(trial):

    # Invoke suggest methods of a Trial object to generate hyperparameters.
    regressor_name = trial.suggest_categorical('classifier', ['SVR', 'RandomForest'])
    if regressor_name == 'SVR':
        svr_c = trial.suggest_float('svr_c', 1e-10, 1e10, log=True)
        regressor_obj = sklearn.svm.SVR(C=svr_c)
    else:
        rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
        regressor_obj = sklearn.ensemble.RandomForestRegressor(max_depth=rf_max_depth)

   
    regressor_obj.fit(X_train, Y_train)
    y_pred = regressor_obj.predict(X_test)

    error = sklearn.metrics.mean_squared_error(Y_test, y_pred)

    return error  # An objective value linked with the Trial object.

study = optuna.create_study()  # Create a new study.
study.optimize(objective, n_trials=100)  # Invoke optimization of the objective function.
study.best_params

[32m[I 2022-10-12 09:11:50,118][0m A new study created in memory with name: no-name-142a0570-2978-40be-b490-9ddc8c7e4d32[0m
  y = column_or_1d(y, warn=True)
[32m[I 2022-10-12 09:11:50,125][0m Trial 0 finished with value: 4265278304.6522517 and parameters: {'classifier': 'SVR', 'svr_c': 7.939250917001582e-09}. Best is trial 0 with value: 4265278304.6522517.[0m
[32m[I 2022-10-12 09:11:50,246][0m Trial 1 finished with value: 1962239475.6946976 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 29}. Best is trial 1 with value: 1962239475.6946976.[0m
  y = column_or_1d(y, warn=True)
[32m[I 2022-10-12 09:11:50,251][0m Trial 2 finished with value: 4265278063.645988 and parameters: {'classifier': 'SVR', 'svr_c': 0.0057588030022821665}. Best is trial 1 with value: 1962239475.6946976.[0m
[32m[I 2022-10-12 09:11:50,379][0m Trial 3 finished with value: 2054800018.4369292 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 5}. Best is trial 1 with value: 1962239

{'classifier': 'SVR', 'svr_c': 1319012.9490425165}