In [1]:
import pandas as pd
import numpy as np
import warnings
import scipy.stats as ttest_rel

from numpy import mean
from numpy import std
from sklearn import datasets
from sklearn.model_selection import train_test_split,cross_val_score , KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from scipy import stats
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("customer_churn.csv")
customer_churn = pd.DataFrame(data)


In [3]:
customer_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5640 entries, 0 to 5639
Data columns (total 20 columns):
CustomerID                     5640 non-null int64
Churn                          5640 non-null int64
Tenure                         5376 non-null float64
PreferredLoginDevice           5640 non-null object
CityTier                       5640 non-null int64
WarehouseToHome                5389 non-null float64
PreferredPaymentMode           5640 non-null object
Gender                         5640 non-null object
HourSpendOnApp                 5385 non-null float64
NumberOfDeviceRegistered       5640 non-null int64
PreferedOrderCat               5640 non-null object
SatisfactionScore              5640 non-null int64
MaritalStatus                  5640 non-null object
NumberOfAddress                5640 non-null int64
Complain                       5640 non-null int64
OrderAmountHikeFromlastYear    5375 non-null float64
CouponUsed                     5384 non-null float64
OrderCount 

In [4]:
#1. 針對 churn 欄位使用 Stratified sampling 從原本的資料集中取 60%的資料

customer_churn_stratified = customer_churn.groupby('Churn',group_keys=False).apply(lambda x: x.sample(frac=0.6))

In [5]:
#2. 列出取樣後各類別的資料數量
customer_churn_stratified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3384 entries, 1804 to 3212
Data columns (total 20 columns):
CustomerID                     3384 non-null int64
Churn                          3384 non-null int64
Tenure                         3216 non-null float64
PreferredLoginDevice           3384 non-null object
CityTier                       3384 non-null int64
WarehouseToHome                3249 non-null float64
PreferredPaymentMode           3384 non-null object
Gender                         3384 non-null object
HourSpendOnApp                 3231 non-null float64
NumberOfDeviceRegistered       3384 non-null int64
PreferedOrderCat               3384 non-null object
SatisfactionScore              3384 non-null int64
MaritalStatus                  3384 non-null object
NumberOfAddress                3384 non-null int64
Complain                       3384 non-null int64
OrderAmountHikeFromlastYear    3230 non-null float64
CouponUsed                     3235 non-null float64
OrderCou

In [6]:
#3. 資料前處理，填補空值

customer_churn_stratified['Tenure'].fillna(customer_churn_stratified['Tenure'].mean(),inplace=True)
customer_churn_stratified['WarehouseToHome'].fillna(customer_churn_stratified['WarehouseToHome'].mean(),inplace=True)
customer_churn_stratified['HourSpendOnApp'].fillna(customer_churn_stratified['HourSpendOnApp'].mean(),inplace=True)
customer_churn_stratified['OrderAmountHikeFromlastYear'].fillna(customer_churn_stratified['OrderAmountHikeFromlastYear'].mean(),inplace=True)
customer_churn_stratified['CouponUsed'].fillna(customer_churn_stratified['CouponUsed'].mean(),inplace=True)
customer_churn_stratified['OrderCount'].fillna(customer_churn_stratified['OrderCount'].mean(),inplace=True)
customer_churn_stratified['DaySinceLastOrder'].fillna(customer_churn_stratified['DaySinceLastOrder'].mean(),inplace=True)


In [7]:
#3. 資料前處理，將nominal轉為numeric

labelencoder = LabelEncoder()
customer_churn_stratified['PreferredLoginDevice'] = labelencoder.fit_transform(customer_churn_stratified['PreferredLoginDevice'])
customer_churn_stratified['PreferredPaymentMode'] = labelencoder.fit_transform(customer_churn_stratified['PreferredPaymentMode'])
customer_churn_stratified['Gender'] = labelencoder.fit_transform(customer_churn_stratified['Gender'])
customer_churn_stratified['PreferedOrderCat'] = labelencoder.fit_transform(customer_churn_stratified['PreferedOrderCat'])
customer_churn_stratified['MaritalStatus'] = labelencoder.fit_transform(customer_churn_stratified['MaritalStatus'])

In [8]:
#3. 以 10 folds cross-validation 建立 Logistic Regression 及 SVM 模型

X = customer_churn_stratified.drop(['CustomerID','Churn'],axis=1)
y = customer_churn_stratified['Churn']

logisticmodel = LogisticRegression()
svm = SVC()

LogisticRegression_Score = cross_val_score(logisticmodel, X, y, cv=10, scoring='accuracy')
SVM_Score = cross_val_score(svm,X,y,cv=10,scoring='accuracy')





In [9]:
# 4. 針對測試資料印出兩個模型的平均 Accuracy

print(LogisticRegression_Score)
print('Logistic Regression 平均 Accuracy:')
print(LogisticRegression_Score.mean())
print('-'*25)
print('SVM 平均 Accuracy:')
print(SVM_Score.mean())

[0.85545723 0.86725664 0.89380531 0.85840708 0.88757396 0.87869822
 0.87278107 0.86982249 0.86094675 0.85207101]
Logistic Regression 平均 Accuracy:
0.8696819744811577
-------------------------
SVM 平均 Accuracy:
0.8312649456284582


In [10]:
#5. 重複 1~4 題 30 次，並印出兩種模型最終的平均 Accuracy

k_range = range(1,31)
logistic_scores = []
svm_scores = []
for k_number in k_range:
    customer_churn_stratified = customer_churn.groupby('Churn',group_keys=False).apply(lambda x: x.sample(frac=0.6))
    
    customer_churn_stratified['Tenure'].fillna(customer_churn_stratified['Tenure'].mean(),inplace=True)
    customer_churn_stratified['WarehouseToHome'].fillna(customer_churn_stratified['WarehouseToHome'].mean(),inplace=True)
    customer_churn_stratified['HourSpendOnApp'].fillna(customer_churn_stratified['HourSpendOnApp'].mean(),inplace=True)
    customer_churn_stratified['OrderAmountHikeFromlastYear'].fillna(customer_churn_stratified['OrderAmountHikeFromlastYear'].mean(),inplace=True)
    customer_churn_stratified['CouponUsed'].fillna(customer_churn_stratified['CouponUsed'].mean(),inplace=True)
    customer_churn_stratified['OrderCount'].fillna(customer_churn_stratified['OrderCount'].mean(),inplace=True)
    customer_churn_stratified['DaySinceLastOrder'].fillna(customer_churn_stratified['DaySinceLastOrder'].mean(),inplace=True)
   
    labelencoder = LabelEncoder()
    customer_churn_stratified['PreferredLoginDevice'] = labelencoder.fit_transform(customer_churn_stratified['PreferredLoginDevice'])
    customer_churn_stratified['PreferredPaymentMode'] = labelencoder.fit_transform(customer_churn_stratified['PreferredPaymentMode'])
    customer_churn_stratified['Gender'] = labelencoder.fit_transform(customer_churn_stratified['Gender'])
    customer_churn_stratified['PreferedOrderCat'] = labelencoder.fit_transform(customer_churn_stratified['PreferedOrderCat'])
    customer_churn_stratified['MaritalStatus'] = labelencoder.fit_transform(customer_churn_stratified['MaritalStatus'])
    
    X = customer_churn_stratified.drop(['CustomerID','Churn'],axis=1)
    y = customer_churn_stratified['Churn']
    
    logisticmodel = LogisticRegression(C=k_number,random_state=k_number)
    svm = SVC(C=k_number,random_state=k_number)
   
    LogisticRegression_Scores = cross_val_score(logisticmodel, X, y, cv=10, scoring='accuracy')
    SVM_Scores = cross_val_score(svm,X,y,cv=10,scoring='accuracy')
   
    logistic_scores.append(LogisticRegression_Scores.mean())
    svm_scores.append(SVM_Scores.mean())
    



In [11]:
import numpy as np

logistic_scores= np.array(logistic_scores)
svm_scores= np.array(svm_scores)

print('Logistic Regression 平均 Accuracy:')
print(logistic_scores.mean())
print('-'*25)
print('SVM 平均 Accuracy:')
print(svm_scores.mean())

Logistic Regression 平均 Accuracy:
0.871216305062459
-------------------------
SVM 平均 Accuracy:
0.831639320893916


In [12]:
#6. 根據模型於 30 次 10 folds cross-validation 的 Accuracy，以 paired t-test 比較兩種模型，並說明結論

print(stats.shapiro(logistic_scores))
print(stats.shapiro(svm_scores))
print("兩種模型皆為常態分配，可使用paired t-test進行檢定")
print('-'*50)
print(stats.ttest_rel(logistic_scores,svm_scores))
print('p-value>0.5，無法拒絕虛無假設，兩種模型做出來的平均準確率結果相似')

(0.9809568524360657, 0.8503914475440979)
(0.528870701789856, 1.0968034480640654e-08)
兩種模型皆為常態分配，可使用paired t-test進行檢定
--------------------------------------------------
Ttest_relResult(statistic=57.234575680449254, pvalue=2.2179924106917435e-31)
p-value>0.5，無法拒絕虛無假設，兩種模型做出來的平均準確率結果相似
