# bike_marketing 데이터 변수의 설명은 아래와 같다
|변수|설명|
|:--:|:--:|
|company_num|회사 번호|
|google_adwords|구글 AdWords에 대한 비용|
|facebook|페이스북 광고에 대한 비용|
|twitter|트위터 광고에 대한 비용|
|marketing_total|총 마케팅 예산|
|revenues|매출 정보|
|employees|종업원 수|
|pop_density|타깃 시장의 인구밀도 수준(Low, Medium, High|

## 1) pop_density 변수를 factor형 변수로 변환하고, pop_density별 revenues의 평균 차이가 있는지 통계분석을 시행하여 결과를 해석하시오. 만일 대립가설이 채택된다면 사후분석을 실시하고 결과를 해석하시오

In [29]:
# 모듈 가져오기
import pandas as pd
import numpy as np
from scipy.stats import shapiro, f_oneway
from statsmodels.api import OLS, add_constant
import itertools
import time

In [2]:
# 데이터 가져오기
bike = pd.read_csv('./Data/data_edu/모의고사 4회/bike_marketing.csv')

In [3]:
# 데이터 컬럼 정보 확인하기
bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   company_num      172 non-null    int64  
 1   google_adwords   172 non-null    float64
 2   facebook         172 non-null    float64
 3   twitter          172 non-null    float64
 4   marketing_total  172 non-null    int64  
 5   revenues         172 non-null    float64
 6   employees        172 non-null    int64  
 7   pop_density      172 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 10.9+ KB


In [5]:
# 데이터 확인
bike['pop_density'].unique()

array(['High', 'Medium', 'Low'], dtype=object)

In [8]:
# 통계분석을 위한 데이터 정의
col1= bike['revenues'][bike['pop_density'] == 'High']
col2= bike['revenues'][bike['pop_density'] == 'Medium']
col3= bike['revenues'][bike['pop_density'] == 'Low']

In [12]:
# 둘 이상의 모집단의 평균이 동일한지 테스트 하므로 분산분석 이용
one_way_anova = f_oneway(col1,col2,col3)

In [13]:
# 결과 확인
# 귀무가설 : 모든 모집단의 평균은 같다
# 대립가설 : 적어도 하나의 모집단은 평균이 다르다
one_way_anova

F_onewayResult(statistic=0.609944214128908, pvalue=0.544572945313452)

In [14]:
# p-value > 0.05 이므로 귀무가설을 기각하지 못한다
# 따라서, 통계적으로 유의한 차이를 가지고 있다고 보기 힘들다

## 2) google_adwords, facebook, twitter, marketing_total, employees가 revenues에 영향을 미치는지 알아보는 회귀분석을 전진 선택법을 사용하여 수행하고 결과를 해석하시오

In [42]:
# 필요한 데이터만 추출
cols = ['google_adwords', 'facebook', 'twitter', 'marketing_total', 'employees', 'revenues']
bike_reg = bike[cols]

In [43]:
# 데이터 분할 X,y
X = add_constant(bike_reg.drop('revenues', axis = 1))
y = bike_reg['revenues']

In [44]:
# 데이터 확인
y

0      39.26
1      38.90
2      49.51
3      40.56
4      40.21
       ...  
167    48.95
168    49.37
169    36.96
170    41.11
171    58.38
Name: revenues, Length: 172, dtype: float64

In [45]:
# AIC가 가장 작은값을 가지는 모델을 선택
# 함수 정의 - 회귀 모델과, 모델의 AIC 값 추출
def processSubset(x, y, feature_set):
    model = OLS(y, x[feature_set])
    regression = model.fit()
    AIC = regression.aic    
    return {'model' : regression, "AIC" : AIC}

def getBest(x,y,k):
    tic = time.time()
    results = list()
    for combo in itertools.combinations(x.columns.difference(['const']), k):
        combo = list(combo) + ['const']
        results.append(processSubset(x,y,feature_set = combo))
    models = pd.DataFrame(results)
    bestModel = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print('Processed', models.shape[0], 'models on', k, 'predictors in', (toc-tic), 'seconds')
    return models, bestModel

# 전진 선택법
def forward(x,y,predictors):
    remainingPredictors = [p for p in X_train.columns.difference(['const']) if p not in predictors]
    tic = time.time()
    results = list()
    for p in remainingPredictors:
        results.append(processSubset(X_train, y_train, feature_set=predictors + [p] + ['const']))
        models = pd.DataFrame(results)
        bestModel = models.loc[models['AIC'].argmin(), :]
        toc = time.time()
        print('Processed', models.shape[0], 'models on', len(predictors) + 1, 'predictors in', (toc-tic))
        print('Selected predictors:', bestModel['model'].model.exog_names, 'AIC:' , bestModel['AIC'])
        return bestModel

# 전진 선택법 모델
def forward_model(x,y):
    fModels = pd.DataFrame(columns = ['AIC', 'model'])
    tic = time.time()
    predictors = list()
    for i in range(1, len(x.columns.difference(['const']))+1):
        forwardResult = forward(x,y,predictors)
        if i > 1:
            if forwardResult['AIC'] > fmodelBefore:
                break
        fModels.loc[i] = forwardResult
        predictors = fModels.loc[i]['model'].model.exog_names
        fmodelBefore = fModels.loc[i]['AIC']
        predictors = [k for k in predictors if k != 'const']
    toc = time.time()
    print("Total elapesed time : ", (toc - tic), "seconds.")
    return (fModels['model'][len(fModels['model'])])

In [46]:
# 변수 사전 정의
X_train = X
y_train = y
X_test = X
y_test = y
predictors = X_train.columns
forward_model(X,y).summary()

Processed 1 models on 1 predictors in 0.008960723876953125
Selected predictors: ['employees', 'const'] AIC: 946.1149216366764
Processed 1 models on 2 predictors in 0.005982637405395508
Selected predictors: ['employees', 'facebook', 'const'] AIC: 909.2735722999604
Processed 1 models on 3 predictors in 0.006979703903198242
Selected predictors: ['employees', 'facebook', 'google_adwords', 'const'] AIC: 750.8753609482542
Processed 1 models on 4 predictors in 0.0059833526611328125
Selected predictors: ['employees', 'facebook', 'google_adwords', 'marketing_total', 'const'] AIC: 752.8255686835203
Total elapesed time :  0.05784440040588379 seconds.


0,1,2,3
Dep. Variable:,revenues,R-squared:,0.87
Model:,OLS,Adj. R-squared:,0.868
Method:,Least Squares,F-statistic:,375.1
Date:,"Mon, 02 Nov 2020",Prob (F-statistic):,3.41e-74
Time:,22:35:48,Log-Likelihood:,-371.44
No. Observations:,172,AIC:,750.9
Df Residuals:,168,BIC:,763.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
employees,0.4066,0.104,3.898,0.000,0.201,0.612
facebook,0.1768,0.012,14.548,0.000,0.153,0.201
google_adwords,0.0415,0.003,16.090,0.000,0.036,0.047
const,28.3751,0.584,48.601,0.000,27.222,29.528

0,1,2,3
Omnibus:,3.324,Durbin-Watson:,2.101
Prob(Omnibus):,0.19,Jarque-Bera (JB):,3.34
Skew:,-0.306,Prob(JB):,0.188
Kurtosis:,2.696,Cond. No.,702.0
