In [163]:
import time
import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [191]:
df_ks = pd.read_csv("./data/ks-projects-201612.csv")
#ランダムに10000万件取得
df_ks = df_ks.sample(n=10000)
#Column名の空白を削除
df_ks.columns = df_ks.columns.str.strip()
#有効説明変数をCategory, Currency, Deadline, Launched, Goal, Countryとし、目的変数をStateとし、抽出する
#df_ks = df_ks[['category','currency','deadline', 'launched', 'goal', 'country', 'state']]
df_ks = df_ks[['category','currency','deadline', 'launched', 'goal', 'state']]
display(df_ks.head())
df_ks.describe()

Unnamed: 0,category,currency,deadline,launched,goal,state
53141,Video Games,GBP,2015/5/8 14:02,2015/3/9 13:02,25000,failed
249598,Documentary,CAD,2013/10/7 5:59,2013/9/9 23:12,20000,failed
266935,Hardware,USD,2013/10/8 18:51,2013/9/8 18:51,750,successful
144412,Music,CAD,2015/4/10 0:57,2015/3/10 23:57,1000,failed
103888,Food Trucks,USD,2015/6/23 23:59,2015/5/24 23:59,50000,failed


Unnamed: 0,category,currency,deadline,launched,goal,state
count,10000,10000,10000,10000,10000,10000
unique,176,23,9896,9970,784,24
top,Product Design,USD,USD,2013/5/18 0:36,5000,failed
freq,530,8014,18,2,759,5285


In [192]:
#Currencyのアルファベット3文字のデータのみ抽出(カラムがズレているデータは除外)
pattern = "[A-Z]{3}"
df_ks = df_ks[df_ks['currency'].str.match(pattern)]

#Stateのliveとundefinedを削除
df_ks = df_ks[~(df_ks["state"]=="live") & ~(df_ks["state"]=="undefined")]

#Goalを数字化
df_ks["goal"] = pd.to_numeric(df_ks["goal"], errors='coerce')

display(df_ks.head())
df_ks.describe()

Unnamed: 0,category,currency,deadline,launched,goal,state
53141,Video Games,GBP,2015/5/8 14:02,2015/3/9 13:02,25000.0,failed
249598,Documentary,CAD,2013/10/7 5:59,2013/9/9 23:12,20000.0,failed
266935,Hardware,USD,2013/10/8 18:51,2013/9/8 18:51,750.0,successful
144412,Music,CAD,2015/4/10 0:57,2015/3/10 23:57,1000.0,failed
103888,Food Trucks,USD,2015/6/23 23:59,2015/5/24 23:59,50000.0,failed


Unnamed: 0,goal
count,9737.0
mean,31426.81
std,366016.7
min,1.0
25%,2000.0
50%,5500.0
75%,15000.0
max,30000000.0


In [193]:
#launchからdeadlineまでの日数を取得
df_ks['terms'] = (pd.to_datetime(df_ks['deadline']) - pd.to_datetime(df_ks['launched'])).apply(lambda x:x.days)
#df_ks = df_ks[['category','currency', 'goal', 'country', 'terms', 'state']]
df_ks = df_ks[['category','currency', 'goal', 'terms', 'state']]
display(df_ks.head())

Unnamed: 0,category,currency,goal,terms,state
53141,Video Games,GBP,25000.0,60,failed
249598,Documentary,CAD,20000.0,27,failed
266935,Hardware,USD,750.0,30,successful
144412,Music,CAD,1000.0,30,failed
103888,Food Trucks,USD,50000.0,30,failed


In [194]:
#successfulを1,それ以外は失敗として0へ変換
def state_to_num(state):
    if state=="successful":
        state = 1
    else:
        state = 0
    return state

#stateを0,1へ変更
df_ks['state'] = df_ks.state.apply(state_to_num)
df_ks.head()

Unnamed: 0,category,currency,goal,terms,state
53141,Video Games,GBP,25000.0,60,0
249598,Documentary,CAD,20000.0,27,0
266935,Hardware,USD,750.0,30,1
144412,Music,CAD,1000.0,30,0
103888,Food Trucks,USD,50000.0,30,0


In [195]:
#CurrencyのCHF,DKK,HKD,MXN,NOK,SGDはOthersとしてまとめる
df_ks["currency"] = df_ks["currency"].replace(["CHF","DKK","HKD","MXN","NOK","SGD"], "Others")
print(df_ks.groupby(by=["currency"]).size())
#CountryのAT,BE,CH,DK,HK,IE,LU,MX, N,"0, NO,SGはOthersとしてまとめる
#df_ks["country"] = df_ks["country"].replace(["AT","BE","CH","DK","HK","IE","LU","MX","N,\"0","NO","SG"], "Others")
#print(df_ks.groupby(by=["country"]).size())

currency
AUD        201
CAD        369
EUR        336
GBP        831
NZD         42
Others      62
SEK         36
USD       7860
dtype: int64


In [196]:
#質的データ(Main Category, Currency, Country)をダミー化(one-hot)
#df_dummy = pd.get_dummies(df_ks, columns=["category","currency","country"], drop_first=True)
df_dummy = pd.get_dummies(df_ks, columns=["category","currency"], drop_first=True)
display(df_dummy.head())

Unnamed: 0,goal,terms,state,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,...,category_World Music,category_Young Adult,category_Zines,currency_CAD,currency_EUR,currency_GBP,currency_NZD,currency_Others,currency_SEK,currency_USD
53141,25000.0,60,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
249598,20000.0,27,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
266935,750.0,30,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
144412,1000.0,30,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
103888,50000.0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [270]:
X = df_dummy.drop("state", axis=1)
y = df_dummy["state"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


estimators = 0
est_train_score = 0
est_test_score = 0

for est in list(range(1, 21)):
    clf = RandomForestClassifier(n_estimators=est, max_depth=None, criterion="gini", min_samples_leaf=5, min_samples_split=5, random_state=1234)
    clf.fit(X_train, y_train)
    #テストスコアの最高点を格納する
    if clf.score(X_test, y_test) > est_test_score:
        est_train_score = clf.score(X_train, y_train)
        est_test_score = clf.score(X_test, y_test)
        estimators = est
        
print("estimators:{}".format(estimators))
print("train score={:.4f}". format(est_train_score))
print("test score={:.4f}". format(est_test_score))

samples = 0
est_train_score = 0
est_test_score = 0

for n in list(range(2, 21)):
    clf = RandomForestClassifier(n_estimators=10, max_depth=None, criterion="gini", min_samples_leaf=n, min_samples_split=n, random_state=1234)
    clf.fit(X_train, y_train)
    #テストスコアの最高点を格納する
    if clf.score(X_test, y_test) > est_test_score:
        est_train_score = clf.score(X_train, y_train)
        est_test_score = clf.score(X_test, y_test)
        samples = n
        
print("samples:{}".format(samples))
print("train score={:.4f}". format(est_train_score))
print("test score={:.4f}". format(est_test_score))

estimators:10
train score=0.7263
test score=0.6966
samples:5
train score=0.7263
test score=0.6966
score= 0.726280652201823
score= 0.696611909650924


In [271]:
clf = RandomForestClassifier(n_estimators=10, max_depth=None, criterion="gini", min_samples_leaf=5, min_samples_split=5, random_state=1234)

clf.fit(X_train, y_train)
print("score=", clf.score(X_train, y_train))
print("score=", clf.score(X_test, y_test))

score= 0.726280652201823
score= 0.696611909650924
