# Simple logistic regression dataset
* https://www.kaggle.com/dragonheir/logistic-regression

## 1.取得資料

In [3]:
# Dataset download steps, left commented out. Uncomment to fetch the archive,
# unpack it, and list the working directory.
# !wget https://www.dropbox.com/s/wuviv5chks1n6dn/logistic-regression.zip?dl=0
# !unzip logistic-regression.zip?dl=0
# !ls -l

total 24
drwxr-xr-x 3 root root  4096 Aug 10 16:59 datalab
-rw-r--r-- 1 root root  3309 Aug 14 07:02 logistic-regression.zip?dl=0
-rw-r--r-- 1 root root 10926 Dec 24  2017 Social_Network_Ads.csv


In [0]:
#基本套件
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#模型套件
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [0]:
# Read the ads dataset (CSV in the current working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')

## 2.觀察資料

In [17]:
# Quick look at the raw data: dimensions, per-column summary, first rows.
print(data.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
data.info()
data.head()

(400, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB
None


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [18]:
# Encode the Gender column as a number (Male -> 0, Female -> 1).
data['Gender_num'] = data['Gender'].map({"Male":0,"Female":1})

# .map() silently yields NaN for any label missing from the dict; fail here
# with a clear message instead of later inside the scaler/model.
assert data['Gender_num'].notna().all(), "unexpected value in 'Gender' column"

print(data.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
data.info()
data.head()

(400, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
Gender_num         400 non-null int64
dtypes: int64(5), object(1)
memory usage: 18.8+ KB
None


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased,Gender_num
0,15624510,Male,19,19000,0,0
1,15810944,Male,35,20000,0,0
2,15668575,Female,26,43000,0,1
3,15603246,Female,27,57000,0,1
4,15804002,Male,19,76000,0,0


## 3.分訓練及測試

In [0]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['Gender_num', 'Age', 'EstimatedSalary']],  # feature columns
    data.loc[:, ['Purchased']],  # target, kept as a one-column DataFrame
    test_size=0.2,
    random_state=31,
)

## 4.特徵縮放
* 將各特徵標準化（standardization）：使每個特徵的平均值為0、變異數為1（注意：並不是把數值範圍壓縮到0~1之間）
* 可避免部分特徵值的變異數過大，造成機器在學習時出現權重失衡的狀況

In [0]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

## 5.訓練

In [24]:
# Train a logistic regression with default settings on the standardized
# training features; the fit call is left last so the cell displays the model.
lr = LogisticRegression()
lr.fit(X=X_train_std, y=y_train.Purchased.values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## 6.測試(預測)

In [29]:
# Score the standardized test features two ways:
#   proba  - predicted probability of each class, one row per test sample
#   result - the predicted class label (purchased or not) for each sample
proba = lr.predict_proba(X_test_std)
result = lr.predict(X_test_std)

print(proba)
print(result)

[[0.98309755 0.01690245]
 [0.74212715 0.25787285]
 [0.34245997 0.65754003]
 [0.63290824 0.36709176]
 [0.10915594 0.89084406]
 [0.01120867 0.98879133]
 [0.53942772 0.46057228]
 [0.03903986 0.96096014]
 [0.91072077 0.08927923]
 [0.79670176 0.20329824]
 [0.99088876 0.00911124]
 [0.43731771 0.56268229]
 [0.94219365 0.05780635]
 [0.0061257  0.9938743 ]
 [0.26080851 0.73919149]
 [0.60550701 0.39449299]
 [0.00738719 0.99261281]
 [0.18515826 0.81484174]
 [0.95208429 0.04791571]
 [0.989915   0.010085  ]
 [0.95666442 0.04333558]
 [0.99066347 0.00933653]
 [0.66798016 0.33201984]
 [0.04964367 0.95035633]
 [0.73871447 0.26128553]
 [0.06934638 0.93065362]
 [0.84580377 0.15419623]
 [0.49611845 0.50388155]
 [0.92082601 0.07917399]
 [0.96138397 0.03861603]
 [0.97868555 0.02131445]
 [0.99203209 0.00796791]
 [0.01411831 0.98588169]
 [0.13952752 0.86047248]
 [0.05842513 0.94157487]
 [0.99346377 0.00653623]
 [0.51181787 0.48818213]
 [0.27093344 0.72906656]
 [0.99411301 0.00588699]
 [0.6551365  0.3448635 ]


In [49]:
# Training error count: rows whose predicted label differs from the true label.
# The element-wise comparison replaces the manual index loop.
error = int((lr.predict(X_train_std) != y_train['Purchased'].values).sum())
# Format explicitly: print("...", error) prints a tuple under Python 2.
print("Total errors {}".format(error))

# Training accuracy (not AUC), as a percentage rounded to two decimals.
Accuracy = round((100 - float(error) / float(X_train.shape[0]) * 100),2)
print("Accuracy: {} %".format(Accuracy))

('Total errors', 47)
Accuracy: 85.31 %


In [51]:
# Test error count: rows whose predicted label differs from the true label.
# The element-wise comparison replaces the manual index loop.
error = int((lr.predict(X_test_std) != y_test['Purchased'].values).sum())
# Format explicitly: print("...", error) prints a tuple under Python 2.
print("Total errors {}".format(error))

# Test accuracy (not AUC), as a percentage rounded to two decimals.
Accuracy = round((100 - float(error) / float(X_test.shape[0]) * 100),2)
print("Accuracy: {} %".format(Accuracy))

('Total errors', 14)
Accuracy: 82.5 %
