In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pandas as pd
import matplotlib.pyplot as plt
import json
import pickle
from sklearn import metrics
from collections import defaultdict
from collections import Counter
import csv

In [None]:
# 2クラス用AUC
from sklearn import metrics
def auc(train, predict):
    """Return the area under the ROC curve for a two-class problem.

    Args:
        train: Forwarded to ``metrics.roc_curve`` as its first argument
            (the reference labels).
        predict: Forwarded to ``metrics.roc_curve`` as its second argument
            (the scores being evaluated).

    Returns:
        The result of ``metrics.auc`` over the ROC points.
    """
    # The thresholds returned by roc_curve are not needed for the area.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(train, predict)
    roc_area = metrics.auc(false_pos_rate, true_pos_rate)
    return roc_area

In [None]:
# 変数セーブ&ロード
import pickle
def varSave(filename, var):
    """Pickle ``var`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file is
    overwritten. The handle is managed by a ``with`` block so it is closed
    even when ``pickle.dump`` raises (for example on an unpicklable object).

    Args:
        filename: Path of the file to write.
        var: Object to serialise.

    Returns:
        None.
    """
    with open(filename, 'wb') as fileObject:
        pickle.dump(var, fileObject)

def varLoad(filename):
    """Load and return the object pickled in the file at ``filename``.

    The handle is managed by a ``with`` block so it is closed even when
    ``pickle.load`` raises (for example on a truncated or corrupt file).

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(filename, 'rb') as fileObject:
        return pickle.load(fileObject)

In [None]:
# onehot
def onehot(target):
    """One-hot encode an array of class labels.

    Each distinct label gets one column; columns follow the sorted order of
    the distinct labels as returned by ``np.unique``. Labels are mapped to
    column positions first, so they do not have to be the integers
    ``0..K-1``: gaps, negative values and non-integer labels are encoded
    correctly instead of raising ``IndexError`` or selecting the wrong row.
    For labels that already are ``0..K-1`` the result is unchanged.

    Args:
        target: Array-like of labels.

    Returns:
        Float array of shape ``target.shape + (K,)`` where ``K`` is the number
        of distinct labels present in ``target``.
    """
    labels = np.asarray(target)
    classes, codes = np.unique(labels, return_inverse=True)
    # NumPy releases differ on whether the inverse keeps the input's shape,
    # so normalise it before indexing.
    codes = codes.reshape(labels.shape)
    return np.eye(classes.shape[0])[codes]

In [None]:
# PCA
from sklearn.decomposition import PCA
def pca(X, dim):
    """Project ``X`` onto its first ``dim`` principal components.

    Fits a ``PCA`` model on ``X``, prints the summed explained-variance ratio
    of the kept components, and returns the transformed data.

    Args:
        X: Data matrix handed to ``PCA.fit`` / ``PCA.transform``.
        dim: Value passed as ``n_components``.

    Returns:
        The result of ``transform(X)`` on the fitted model.
    """
    # Named differently from the function so the local does not shadow it.
    reducer = PCA(n_components=dim)
    reducer.fit(X)
    # Cumulative contribution (explained variance) of the kept components.
    print (sum(reducer.explained_variance_ratio_))
    # Transform once; the earlier version also ran a transform whose result
    # was thrown away.
    return reducer.transform(X)

In [None]:
# AutoEncoder
from keras.layers import Input, Dense
from keras.models import Model
def autoenc(X, encoding_dim, epochs):
    """Build and fit a dense autoencoder on ``X``; return its two halves.

    The network is input -> 128 -> 64 -> ``encoding_dim`` -> 64 -> 128 ->
    input width. It is fitted to reproduce ``X`` from ``X`` with the Adam
    optimiser and a mean-squared-error loss.

    Args:
        X: 2-D training data; ``X.shape[1]`` sets the input/output width.
        encoding_dim: Width of the bottleneck layer.
        epochs: Number of epochs passed to ``fit``.

    Returns:
        Tuple ``(encoder, decoder)`` of models. Both are built from the same
        layer objects as the fitted autoencoder.
    """
    n_features = X.shape[1]  # input dimensionality
    n_decoder_layers = 3  # how many trailing layers form the decoder

    # Encoder stack
    inputs = Input(shape=(n_features,))
    code = Dense(128, activation='relu')(inputs)
    code = Dense(64, activation='relu')(code)
    code = Dense(encoding_dim, activation='sigmoid')(code)

    # Decoder stack
    reconstruction = Dense(64, activation='relu')(code)
    reconstruction = Dense(128, activation='relu')(reconstruction)
    reconstruction = Dense(n_features, activation='sigmoid')(reconstruction)

    # Full autoencoder and the encoder-only view of it
    autoencoder = Model(inputs, reconstruction)
    encoder = Model(inputs, code)

    # Stand-alone decoder: apply the autoencoder's last layers to a new input
    # of bottleneck width.
    latent_input = Input(shape=(encoding_dim,))
    latent_output = latent_input
    for layer in autoencoder.layers[-n_decoder_layers:]:
        latent_output = layer(latent_output)
    decoder = Model(latent_input, latent_output)

    # Optimisation
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    autoencoder.fit(X, X, epochs=epochs, batch_size=128, shuffle=True)
    return (encoder, decoder)

In [None]:
# Data for the regression examples
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases - confirm the installed version still provides it.
boston = datasets.load_boston()
X, y = boston.data, boston.target
# Hold out 10% of the rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=1
)

In [None]:
# LASSO (linear regression with an L1 penalty)
from sklearn import linear_model
model = linear_model.Lasso(
    fit_intercept=True,
    alpha=1,  # alpha: weight of the L1 term
)
model.fit(X_train, y_train)
# Validation predictions, validation score, then the fitted coefficients.
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
print(model.coef_)

In [None]:
# Ridge regression (linear regression with an L2 penalty)
from sklearn.linear_model import Ridge
model = Ridge(
    alpha=1.0,  # alpha: weight of the L2 term
)
model.fit(X_train, y_train)
# Validation predictions followed by the validation score.
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))

In [None]:
# Random forest (regression)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)
model.fit(X_train, y_train)
# Validation predictions followed by the validation score.
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))

In [None]:
# Data for the classification examples
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Hold out 10% of the rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=1
)

In [None]:
# Logistic regression (L1 penalty)
from sklearn.linear_model import LogisticRegression
# Only some solvers implement the L1 penalty, so the solver is named
# explicitly instead of relying on the library default: once the default is
# 'lbfgs', penalty="l1" is rejected at fit time. 'liblinear' supports L1.
# NOTE(review): very recent scikit-learn releases deprecate liblinear for
# multiclass targets; solver="saga" is the alternative there - confirm
# against the installed version.
model = LogisticRegression(
    C=1,  # C: weight on the loss term (inverse of the regularisation strength)
    penalty="l1",
    solver="liblinear",
)
model.fit(X_train, y_train)
# Validation accuracy, predicted labels, then per-class probabilities.
print (model.score(X_valid, y_valid))
print (model.predict(X_valid))
print (model.predict_proba(X_valid))

In [None]:
# Linear SVM (L1 penalty)
from sklearn.svm import LinearSVC
model = LinearSVC(
    penalty="l1",
    loss="squared_hinge",
    dual=False,
    C=1,  # C: weight on the loss term (inverse of the regularisation strength)
)
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
# NOTE(review): _predict_proba_lr is underscore-prefixed, i.e. not part of the
# public API - confirm it exists in the installed scikit-learn.
print(model._predict_proba_lr(X_valid))
print(model.coef_)

In [None]:
# Kernel SVM (RBF kernel)
from sklearn.svm import SVC
model = SVC(
    kernel='rbf',
    C=1,
)
model.fit(X_train, y_train)
# Predicted labels followed by the validation score.
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))

In [None]:
# Random forest (classification)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    random_state=1,
)
model.fit(X_train, y_train)
# Predicted labels followed by the validation score.
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))

In [None]:
# LightGBM (multiclass)
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
# The validation set references the training set (reference=lgb_train).
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
lgbm_params = {
    'objective': 'multiclass', #'xentropy',
    'num_class': 3,
    'metric': 'multi_logloss', #'auc',
    'num_leaves':15,
    # "min_data_in_leaf": 20,
    # "bagging_fraction":1,
    # "bagging_freq" : 10,
    # "feature_fraction":0.98
    "lambda_l1":0.1,
    "lambda_l2":0.01,
    # "min_gain_to_split":0.1
    # "max_depth":10
}
# Early stopping goes through the callback API: newer LightGBM releases no
# longer accept an `early_stopping_rounds` keyword on lgb.train, while the
# callback form is accepted by older releases as well.
model = lgb.train(
    lgbm_params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)
# Predict with the iteration chosen by early stopping.
print (model.predict(X_valid, num_iteration=model.best_iteration))
print(model.feature_importance())
lgb.plot_importance(model, ignore_zero=False,height=0.5)

In [None]:
# XGBoost (scikit-learn style classifier with default settings)
import xgboost as xgb
model = xgb.XGBClassifier()
model.fit(
    X_train,
    y_train,
    verbose=True,
)
# Labels, per-class probabilities, validation score, feature importances.
print(model.predict(X_valid))
print(model.predict_proba(X_valid))
print(model.score(X_valid, y_valid))
print(model.feature_importances_)

In [None]:
# CatBoost (multiclass)
import catboost
model = catboost.CatBoostClassifier(
    loss_function="MultiClass",  # alternative: "CrossEntropy"
    classes_count=3,
    eval_metric="HingeLoss",  # alternative: 'AUC'
    iterations=1000,
    depth=6,
    l2_leaf_reg=3,
    random_seed=1,
    use_best_model=True,
)
# The validation split is the eval set used for early stopping.
model.fit(
    X_train,
    y_train,
    # cat_features=categorical_features_index,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=20,
)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))

In [None]:
# Neural network (dense layers with a 3-way softmax output)
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
model = Sequential()
model.add(Dense(units=64, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation="relu"))
model.add(Dense(units=3, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# categorical_crossentropy is fed one-hot targets, hence onehot(y_train).
model.fit(X_train, onehot(y_train), epochs=200, batch_size=256)
# predict() already returns the softmax outputs; Sequential.predict_proba is
# not available in newer Keras releases, and where it exists it returns the
# same values.
model.predict(X_valid, batch_size=256)

In [None]:
# k-nearest neighbours (k = 5)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    n_neighbors=5,
)
knn.fit(X_train, y_train)
# Validation score followed by per-class probabilities.
print(knn.score(X_valid, y_valid))
print(knn.predict_proba(X_valid))