# Sprint2 機械学習スクラッチ入門

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
train_data = pd.read_csv('train.csv')
#display(train_data.head(3))
X_df = train_data.loc[:,['GrLivArea','YearBuilt']]
y_df = train_data.loc[:,'SalePrice']
display(X_df)

Unnamed: 0,GrLivArea,YearBuilt
0,1710,2003
1,1262,1976
2,1786,2001
3,1717,1915
4,2198,2000
...,...,...
1455,1647,1999
1456,2073,1978
1457,2340,1941
1458,1078,1950


In [21]:
X_df.columns

Index(['GrLivArea', 'YearBuilt'], dtype='object')

## 【問題1】train_test_splitのスクラッチ

In [39]:
import random

null_sum = 0
for col in X_df.columns:
    #欠損の補間
    null_sum = X_df[col].isnull().sum()
    train_length = X_df[col].count()
    if null_sum > 0:
        if X_df[col].dtype == object:
            X_df[col] = X_df[col].fillna(X_df[col].mode()[1])
        else:
            X_df[col] = X_df[col].fillna(X_df[col].mean())


def scratch_train_test_split(X, y,train_size=0.75):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """ 
    #X -> random sampling, (1-train_size)*X=X_test, rest= X_train
    
    #Xをランダムに並べ替え
    p=np.random.permutation(len(X))
    randX, randy=X[p], y[p]
    X_train, X_test=np.vsplit(randX, [int(randX.shape[0] * train_size)])
    y_train, y_test=np.split(randy,[int(randy.size*train_size)]) 

    return X_train, X_test, y_train, y_test



In [44]:
#シンプルデータセット1

np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

random_index = np.random.permutation(np.arange(n_samples))
X1 = X[random_index]
y1 = y[random_index]

In [45]:
#シンプルデータセット2作成コード

X2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [48]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X2, y2,train_size=0.50)

In [42]:
X_train.shape

(20, 2)

## 【問題2】 分類問題を解くコードの作成
- ロジスティック回帰
- SVM
- 決定木
>上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

In [61]:
#検証用データの分割には問題1で作成した自作の関数を用いてください。
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def train_pred(model,X,y):
    X_train, X_test, y_train, y_test = scratch_train_test_split(X, y,train_size=0.50)
    
    Ir=model()
    Ir.fit(X_train,y_train)
    y_pred=Ir.predict(X_test)
    accuracy=accuracy_score(y_test, y_pred)
    precision=precision_score(y_test, y_pred)
    recall=recall_score(y_test, y_pred)
    f1=f1_score(y_test, y_pred)
    con_mat=confusion_matrix(y_test, y_pred)
    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('f1_score: ', f1)
    print('confusion matrix:\n', con_mat)

In [63]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
iris=load_iris()
X_ir=iris.data[50:,:]
y_ir=iris.target[50:]

models = {SGDClassifier, SVC, DecisionTreeClassifier}

for i in models:
    print('model: ',i)
    print("data set 1")
    train_pred(i,X1,y1)
    print("data set 2")
    train_pred(i,X2,y2)
    print("data set 3")
    train_pred(i,X_ir,y_ir)

model:  <class 'sklearn.svm.classes.SVC'>
data set 1
accuracy:  1.0
precision:  1.0
recall:  1.0
f1_score:  1.0
confusion matrix:
 [[129   0]
 [  0 121]]
data set 2
accuracy:  0.45
precision:  0.6
recall:  0.25
f1_score:  0.35294117647058826
confusion matrix:
 [[6 2]
 [9 3]]
data set 3
accuracy:  0.92
precision:  0.9230769230769231
recall:  0.9230769230769231
f1_score:  0.9230769230769231
confusion matrix:
 [[24  2]
 [ 2 22]]
model:  <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
data set 1
accuracy:  1.0
precision:  1.0
recall:  1.0
f1_score:  1.0
confusion matrix:
 [[126   0]
 [  0 124]]
data set 2
accuracy:  0.4
precision:  0.42857142857142855
recall:  0.2727272727272727
f1_score:  0.33333333333333326
confusion matrix:
 [[5 4]
 [8 3]]
data set 3
accuracy:  0.8
precision:  1.0
recall:  0.6296296296296297
f1_score:  0.7727272727272727
confusion matrix:
 [[17 10]
 [ 0 23]]
model:  <class 'sklearn.tree.tree.DecisionTreeClassifier'>
data set 1
accuracy:  0.996
precision



## 回帰問題

## 【問題3】 回帰問題を解くコードの作成
>線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [64]:
X = np.array(X_df)
y = np.array(y_df)

X_train, X_test, y_train, y_test = scratch_train_test_split(X, y,train_size=0.50)

In [66]:
from sklearn.linear_model import SGDRegressor

Sg=SGDRegressor()
Sg.fit(X_train, y_train)
y_pred=Sg.predict(X_test)

#平均二乗誤差
from sklearn.metrics import mean_squared_error
lin_mse=mean_squared_error(y_test, y_pred)
print('平均２乗誤差: ', lin_mse)

平均２乗誤差:  9.656930088740988e+29
