### 考察
* 説明変数を、main_category, currency, goal, country, terms(日数:launched - deadline)とする
* categoryはmain_categoryとある程度重複(相関?)がある為、今回は除外してみる
* currecyとcountryにも相関がある為、無相関化を施してみる
* 目的変数をstateとする。
* stateのliveとundefinedは除外して考える

### 結果
* ロジスティック回帰
    * 訓練 正答率: 67.164%
    * テスト 正答率: 66.862%
* サポートベクターマシン
    * 訓練 正答率: 67.164%
    * テスト 正答率: 66.862%

In [2]:
import time
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [3]:
df_ks = pd.read_csv("./data/ks-projects-201612.csv")
#Column名の空白を削除
df_ks.columns = df_ks.columns.str.strip()
#有効説明変数をMainCategory, Currency, Deadline, Launched, Goal, Countryとし、目的変数をStateとし、抽出する
df_ks = df_ks[['main_category','currency','deadline', 'launched', 'goal', 'country', 'state']]
display(df_ks.head())
df_ks.describe()

Unnamed: 0,main_category,currency,deadline,launched,goal,country,state
0,Publishing,GBP,2015/10/9 11:36,2015/8/11 12:12,1000,GB,failed
1,Film & Video,USD,2013/2/26 0:20,2013/1/12 0:20,45000,US,failed
2,Music,USD,2012/4/16 4:24,2012/3/17 3:24,5000,US,failed
3,Film & Video,USD,2015/8/29 1:00,2015/7/4 8:35,19500,US,canceled
4,Food,USD,2016/4/1 13:38,2016/2/26 13:38,50000,US,successful


Unnamed: 0,main_category,currency,deadline,launched,goal,country,state
count,323750,323750,323750,323750,323750,323750,323750
unique,120,37,275294,295793,8188,162,410
top,Film & Video,USD,USD,5000,5000,US,failed
freq,57679,260298,508,60,25520,257565,168221


In [4]:
#Currencyのアルファベット3文字のデータのみ抽出
pattern = "[A-Z]{3}"
df_ks = df_ks[df_ks['currency'].str.match(pattern)]

#Stateのliveとundefinedを削除
df_ks = df_ks[~(df_ks["state"]=="live") & ~(df_ks["state"]=="undefined")]

#Goalを数字化
df_ks["goal"] = pd.to_numeric(df_ks["goal"], errors='coerce')

display(df_ks.head())
df_ks.describe()

Unnamed: 0,main_category,currency,deadline,launched,goal,country,state
0,Publishing,GBP,2015/10/9 11:36,2015/8/11 12:12,1000.0,GB,failed
1,Film & Video,USD,2013/2/26 0:20,2013/1/12 0:20,45000.0,US,failed
2,Music,USD,2012/4/16 4:24,2012/3/17 3:24,5000.0,US,failed
3,Film & Video,USD,2015/8/29 1:00,2015/7/4 8:35,19500.0,US,canceled
4,Food,USD,2016/4/1 13:38,2016/2/26 13:38,50000.0,US,successful


Unnamed: 0,goal
count,315135.0
mean,47396.37
std,1139801.0
min,0.01
25%,2000.0
50%,5000.0
75%,15000.0
max,100000000.0


In [5]:
#launchからdeadlineまでの日数を取得
df_ks['terms'] = (pd.to_datetime(df_ks['deadline']) - pd.to_datetime(df_ks['launched'])).apply(lambda x:x.days)
df_ks = df_ks[['main_category','currency', 'goal', 'country', 'terms', 'state']]
display(df_ks.head())

Unnamed: 0,main_category,currency,goal,country,terms,state
0,Publishing,GBP,1000.0,GB,58,failed
1,Film & Video,USD,45000.0,US,45,failed
2,Music,USD,5000.0,US,30,failed
3,Film & Video,USD,19500.0,US,55,canceled
4,Food,USD,50000.0,US,35,successful


In [6]:
#successfulを1,それ以外は失敗として0へ変換
def state_to_num(state):
    if state=="successful":
        state = 1
    else:
        state = 0
    return state

#stateを0,1へ変更
df_ks['state'] = df_ks.state.apply(state_to_num)
df_ks.head()

Unnamed: 0,main_category,currency,goal,country,terms,state
0,Publishing,GBP,1000.0,GB,58,0
1,Film & Video,USD,45000.0,US,45,0
2,Music,USD,5000.0,US,30,0
3,Film & Video,USD,19500.0,US,55,0
4,Food,USD,50000.0,US,35,1


In [7]:
#MainCategoryの全体数
print(df_ks.groupby(by=["main_category"]).size())
#Currencyの全体数
print(df_ks.groupby(by=["currency"]).size())
#Countryの全体数
print(df_ks.groupby(by=["country"]).size())

main_category
Art             23703
Comics           8626
Crafts           7041
Dance            3339
Design          23314
Fashion         18046
Film & Video    56311
Food            20919
Games           27542
Journalism       4009
Music           44021
Photography      9577
Publishing      33311
Technology      25481
Theater          9895
dtype: int64
currency
AUD      6120
CAD     11764
CHF       442
DKK       797
EUR     10864
GBP     27102
HKD        65
MXN        16
NOK       512
NZD      1118
SEK      1230
SGD        81
USD    255024
dtype: int64
country
AT         351
AU        6112
BE         384
CA       11759
CH         442
DE        2567
DK         794
ES        1288
FR        1804
GB       27078
HK          65
IE         556
IT        1643
LU          35
MX          16
N,"0       234
NL        2212
NO         510
NZ        1118
SE        1229
SG          81
US      254857
dtype: int64


In [8]:
#CurrencyのCHF,DKK,HKD,MXN,NOK,SGDはOthersとしてまとめる
df_ks["currency"] = df_ks["currency"].replace(["CHF","DKK","HKD","MXN","NOK","SGD"], "Others")
print(df_ks.groupby(by=["currency"]).size())
#CountryのAT,BE,CH,DK,HK,IE,LU,MX, N,"0, NO,SGはOthersとしてまとめる
df_ks["country"] = df_ks["country"].replace(["AT","BE","CH","DK","HK","IE","LU","MX","N,\"0","NO","SG"], "Others")
print(df_ks.groupby(by=["country"]).size())

currency
AUD         6120
CAD        11764
EUR        10864
GBP        27102
NZD         1118
Others      1913
SEK         1230
USD       255024
dtype: int64
country
AU          6112
CA         11759
DE          2567
ES          1288
FR          1804
GB         27078
IT          1643
NL          2212
NZ          1118
Others      3468
SE          1229
US        254857
dtype: int64


In [9]:
#質的データ(Main Category, Currency, Country)をダミー化(one-hot)
df_dummy = pd.get_dummies(df_ks, columns=["main_category","currency","country"], drop_first=True)
display(df_dummy.head())

Unnamed: 0,goal,terms,state,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,...,country_DE,country_ES,country_FR,country_GB,country_IT,country_NL,country_NZ,country_Others,country_SE,country_US
0,1000.0,58,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,45000.0,45,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,5000.0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,19500.0,55,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,50000.0,35,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [10]:
df = df_dummy.drop('state', axis=1)
display(df.head())

Unnamed: 0,goal,terms,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,main_category_Games,...,country_DE,country_ES,country_FR,country_GB,country_IT,country_NL,country_NZ,country_Others,country_SE,country_US
0,1000.0,58,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,45000.0,45,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,5000.0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,19500.0,55,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,50000.0,35,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
#無相関化
cov = np.cov(df, rowvar=0)
_,S = np.linalg.eig(cov)
decorr_data = (S.T@df.T).T

display(decorr_data.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,-1000.000014,57.99967,-0.670341,-0.015441,0.065257,-1.315583,0.47657,-0.066233,-0.110427,-0.075651,...,0.358992,-0.005636,-0.216366,0.115592,0.005923,0.000379,-0.000205,0.000174,0.00011,-7.907336e-18
1,-45000.000011,44.989128,1.240862,0.924755,-0.250611,-0.26505,-0.01752,-0.023176,-0.066987,-0.061199,...,0.351633,-0.004662,-0.213255,0.113481,0.005882,0.000732,-0.000202,0.000178,0.000115,-2.926002e-17
2,-5000.000007,29.998893,1.271182,-0.357665,-0.83993,-0.260463,0.0565,-0.042342,-0.076547,-0.069244,...,0.350043,-0.004663,-0.215001,0.11438,0.00425,0.00052,-0.00018,0.000169,0.000123,-3.0704760000000004e-17
3,-19500.000013,54.995358,1.239979,0.924615,-0.250227,-0.265289,-0.017846,-0.023338,-0.067036,-0.061058,...,0.35164,-0.004652,-0.213235,0.113456,0.005885,0.000732,-0.000202,0.000178,0.000115,-2.917185e-17
4,-50000.000009,34.987855,1.212689,-0.02324,0.06857,-0.355031,-0.293074,0.146637,0.149519,0.780856,...,0.352253,-0.004117,-0.223957,0.113385,0.006351,0.000792,-0.000223,0.000184,0.00012,3.957353e-17


In [12]:
#標準化(白色化)
stdsc = StandardScaler()
white_data = stdsc.fit_transform(decorr_data)
print(white_data)

[[ 4.07057469e-02  3.25881603e-01 -2.49018532e+00 ... -2.94702443e-03
  -7.85437626e-03  9.99193015e-02]
 [ 2.10245230e-03  1.45305210e-01  5.13331963e-01 ... -4.13197093e-04
  -2.88472376e-03 -4.83241592e-03]
 [ 3.71963565e-02 -6.27478297e-02  5.60981597e-01 ... -6.17660857e-03
   4.77083469e-03 -1.19200317e-02]
 ...
 [ 2.84228805e-02  1.45406936e-01  5.13190185e-01 ... -4.20163915e-04
  -2.88513083e-03 -4.85363239e-03]
 [ 2.84228805e-02 -6.27820707e-02  3.93965755e-01 ...  5.82543867e-03
   3.66788646e-03  1.07158614e-01]
 [ 3.98283993e-02 -9.04966103e-02  4.61202302e-01 ...  2.30233831e-03
   9.47466975e-04 -8.27394800e-02]]


In [13]:
#ホールドアウトで訓練データとテストデータに分割(20%)
X = white_data
y = df_ks["state"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(252108, 34)
(63027, 34)
(252108,)
(63027,)


In [14]:
#ロジスティック回帰
lr = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234)
lr.fit(X_train, y_train)
y_lr_est = lr.predict(X_train)
y_lr_test = lr.predict(X_test)

In [15]:
print("訓練 正答率: {:.3f}%".format(100 * accuracy_score(y_lr_est, y_train)))
print("テスト 正答率: {:.3f}%".format(100 * accuracy_score(y_lr_test, y_test)))

訓練 正答率: 67.164%
テスト 正答率: 66.862%


In [None]:
#SVM
clf = SVC(C=10)
clf.fit(X_train, y_train)
y_svc_est = clf.predict(X_train)
y_svc_test = clf.predict(X_test)

In [None]:
print("SVM 訓練 正答率: {:.3f}%".format(100 * accuracy_score(y_svc_est, y_train)))
print("SVM テスト 正答率: {:.3f}%".format(100 * accuracy_score(y_svc_test, y_test)))