# 行列演算の基礎

In [58]:
import numpy as np

In [4]:
#ベクトルの定義
x = np.array([[1],[2],[3]])
x

array([[1],
       [2],
       [3]])

In [5]:
#行列の定義
X = np.array([[1,2],[3,4]])
print(X)

[[1 2]
 [3 4]]


In [7]:
#転置
Xt = X.T
Xt

array([[1, 3],
       [2, 4]])

In [9]:
# 逆行列
# linear algebra: 線形代数
X_inv = np.linalg.inv(X)
print(X_inv)

[[-2.   1. ]
 [ 1.5 -0.5]]


In [10]:
#行列積
XX_inv = np.dot(X,X_inv)
print(XX_inv)

[[1.0000000e+00 0.0000000e+00]
 [8.8817842e-16 1.0000000e+00]]


# Numpyでよく使う処理

In [15]:
X = np.array([
    [2,3,4],
    [1,2,3]    
])

In [16]:
print(X)

[[2 3 4]
 [1 2 3]]


In [17]:
X.shape

(2, 3)

In [18]:
row,col = X.shape

In [19]:
row

2

In [20]:
col

3

In [21]:
for x in X:
    print(x)
    print("---")

[2 3 4]
---
[1 2 3]
---


# 演習問題

In [22]:
#データの定義
X = np.array([
    [1,2,3],
    [1,2,5],
    [1,3,4],
    [1,5,9]
])
print(X)

[[1 2 3]
 [1 2 5]
 [1 3 4]
 [1 5 9]]


In [23]:
#yの定義
y = np.array([
    [1],
    [5],
    [6],
    [8],
])
print(y)

[[1]
 [5]
 [6]
 [8]]


In [25]:
#Step1 行列積：dot
XtX = np.dot(X.T,X)
print(XtX)

[[  4  12  21]
 [ 12  42  73]
 [ 21  73 131]]


In [27]:
# Step2 
XtX_inv = np.linalg.inv(XtX)
print(XtX_inv)

[[ 1.76530612 -0.39795918 -0.06122449]
 [-0.39795918  0.84693878 -0.40816327]
 [-0.06122449 -0.40816327  0.24489796]]


In [28]:
#Step3
Xty = np.dot(X.T,y)
print(Xty)

[[ 20]
 [ 70]
 [124]]


In [30]:
#Step4
w = np.dot(XtX_inv,Xty)
print(w)

[[-0.14285714]
 [ 0.71428571]
 [ 0.57142857]]


# Scikit-learnで実装

In [2]:
import sklearn

In [32]:
from sklearn.linear_model import LinearRegression

In [33]:
#モデルの宣言
model = LinearRegression()

In [34]:
#モデルの学習　←　パラメータの調整
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [35]:
#　調整後のパラメータ
model.coef_

array([[0.        , 0.71428571, 0.57142857]])

In [36]:
model.intercept_

array([-0.14285714])

In [37]:
#　予測精度　←　決定係数
model.score(X,y)

0.6923076923076923

In [38]:
#　予測値の計算
x = np.array([
    [1,2,3]
])

In [40]:
y_pred = model.predict(x)
y_pred

array([[3.]])

# 実データでの演習

In [3]:
# データの読み込み
import pandas as pd

In [4]:
# CSVファイルの読み込み
df = pd.read_csv('housing.csv')

In [5]:
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [6]:
# レコード数の確認（サンプル数）
len(df)

506

In [7]:
#統計量の算出
df.describe()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


# 分布の確認

In [2]:
!pip3 install seaborn

[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [1]:
!pip3 uninstall seaborn

[33mSkipping seaborny as it is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
%matplotlib inline
import seaborn 

ModuleNotFoundError: No module named 'seaborn'

In [5]:
import pandas 

# 相関関係の確認

In [5]:
#　相関係数の算出
df.corr()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
x1,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305
x2,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
x3,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725
x4,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526
x5,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
x6,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
x7,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955
x8,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
x9,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
x10,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536


In [9]:
# 相関係数を目視で確認
sns.pairplot(df)

NameError: name 'sns' is not defined

In [10]:
!pip3 list

Package            Version
------------------ -------
appnope            0.1.0  
backcall           0.1.0  
bleach             3.0.2  
cycler             0.10.0 
decorator          4.3.0  
defusedxml         0.5.0  
entrypoints        0.3    
ipykernel          5.1.0  
ipython            7.2.0  
ipython-genutils   0.2.0  
ipywidgets         7.4.2  
jedi               0.13.2 
Jinja2             2.10   
jsonschema         2.6.0  
jupyter            1.0.0  
jupyter-client     5.2.4  
jupyter-console    6.0.0  
jupyter-core       4.4.0  
kiwisolver         1.0.1  
MarkupSafe         1.1.0  
matplotlib         3.0.2  
mistune            0.8.4  
nbconvert          5.4.0  
nbformat           4.4.0  
notebook           5.7.4  
numpy              1.15.4 
pandas             0.23.4 
pandocfilters      1.4.2  
parso              0.3.1  
pexpect            4.6.0  
pickleshare        0.7.5  
pip                10.0.1 
prometheus-client  0.5.0  
prompt-toolkit     2.0.7  
ptyprocess         0.6.0  
P

In [12]:
import seaborn

ModuleNotFoundError: No module named 'seaborn'

In [6]:
df.head(3)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


In [7]:
#df,iloc[行,列]

In [8]:
df.iloc[0,0]

0.00632

In [18]:
X = df.iloc[:, :-1]

In [19]:
X

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
5,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10


In [26]:
y = df.iloc[:, -1]

In [27]:
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
5      28.7
6      22.9
7      27.1
8      16.5
9      18.9
10     15.0
11     18.9
12     21.7
13     20.4
14     18.2
15     19.9
16     23.1
17     17.5
18     20.2
19     18.2
20     13.6
21     19.6
22     15.2
23     14.5
24     15.6
25     13.9
26     16.6
27     14.8
28     18.4
29     21.0
       ... 
476    16.7
477    12.0
478    14.6
479    21.4
480    23.0
481    23.7
482    25.0
483    21.8
484    20.6
485    21.2
486    19.1
487    20.6
488    15.2
489     7.0
490     8.1
491    13.6
492    20.1
493    21.8
494    24.5
495    23.1
496    19.7
497    18.3
498    21.2
499    17.5
500    16.8
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: y, Length: 506, dtype: float64

## モデルの構築と検証

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
#モデルの宣言
model = LinearRegression()

In [30]:
#モデルの学習
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [32]:
#検証（決定係数の計算）
model.score(X,y)

0.7406426641094094

## 訓練データと検証データ

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
#訓練データと検証データの分割
X_train,X_test,y_train, y_test = train_test_split(X,y, test_size=0.4, random_state = 1) #random_stateは乱数のシードを固定

In [39]:
X_train.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
452,5.09017,0.0,18.1,0,0.713,6.297,91.8,2.3682,24,666,20.2,385.09,17.27
346,0.06162,0.0,4.39,0,0.442,5.898,52.3,8.0136,3,352,18.8,364.61,12.67
295,0.12932,0.0,13.92,0,0.437,6.678,31.1,5.9604,4,289,16.0,396.9,6.27
88,0.0566,0.0,3.41,0,0.489,7.007,86.3,3.4217,2,270,17.8,396.9,5.5
322,0.35114,0.0,7.38,0,0.493,6.041,49.9,4.7211,5,287,19.6,396.9,7.7


In [38]:
y_train.head()

452    16.1
346    17.2
295    28.6
88     23.6
322    20.4
Name: y, dtype: float64

In [40]:
# モデルの学習
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [41]:
#検証　←　検証データ
model.score(X_test,y_test)

0.7209056672661757

In [42]:
#検証　⇦　訓練データ
model.score(X_train,y_train)

0.7468316520140627

## 予測値の計算

In [43]:
x = X.iloc[0, :]
x

x1       0.00632
x2      18.00000
x3       2.31000
x4       0.00000
x5       0.53800
x6       6.57500
x7      65.20000
x8       4.09000
x9       1.00000
x10    296.00000
x11     15.30000
x12    396.90000
x13      4.98000
Name: 0, dtype: float64

In [45]:
#予測値の計算
y_pred = model.predict([x])
y_pred

array([29.42368847])

## モデルの保存

In [46]:
from sklearn.externals import joblib

In [47]:
#モデルの保存
joblib.dump(model, 'model.pkl')

['model.pkl']

# モデルの読み込み

In [48]:
model_new = joblib.load('model.pkl')

In [49]:
x

x1       0.00632
x2      18.00000
x3       2.31000
x4       0.00000
x5       0.53800
x6       6.57500
x7      65.20000
x8       4.09000
x9       1.00000
x10    296.00000
x11     15.30000
x12    396.90000
x13      4.98000
Name: 0, dtype: float64

In [50]:
model_new.predict([x])

array([29.42368847])

## パラメータの確認

In [52]:
#パラメータwの値
model.coef_

array([-8.95714048e-02,  6.73132853e-02,  5.04649248e-02,  2.18579583e+00,
       -1.72053975e+01,  3.63606995e+00,  2.05579939e-03, -1.36602886e+00,
        2.89576718e-01, -1.22700072e-02, -8.34881849e-01,  9.40360790e-03,
       -5.04008320e-01])

In [55]:
np.set_printoptions(precision=3,suppress=True) #指数関数の表示が禁止

In [56]:
model.coef_

array([ -0.09 ,   0.067,   0.05 ,   2.186, -17.205,   3.636,   0.002,
        -1.366,   0.29 ,  -0.012,  -0.835,   0.009,  -0.504])

In [57]:
df.head(3)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


重み(パラメータw)を見るだけでは、どの変数が影響を与えているかわからない

## データの読み込み

In [59]:
#CSVデータの読み込み
df.head(3)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


# 分布の確認

# 外れ値除去( 3σ法 )

In [60]:
col = 'x6'

In [61]:
mean = df.mean()
mean

x1       3.613524
x2      11.363636
x3      11.136779
x4       0.069170
x5       0.554695
x6       6.284634
x7      68.574901
x8       3.795043
x9       9.549407
x10    408.237154
x11     18.455534
x12    356.674032
x13     12.653063
y       22.532806
dtype: float64

In [62]:
mean[col]

6.284634387351788

In [63]:
# 標準偏差（standard deviation)
sigma = df.std()
sigma

x1       8.601545
x2      23.322453
x3       6.860353
x4       0.253994
x5       0.115878
x6       0.702617
x7      28.148861
x8       2.105710
x9       8.707259
x10    168.537116
x11      2.164946
x12     91.294864
x13      7.141062
y        9.197104
dtype: float64

In [64]:
sigma[col]

0.7026171434153234

In [65]:
low = mean[col] - 3 * sigma[col]
low

4.176782957105817

In [66]:
high = mean[col] + 3 * sigma[col]
high

8.392485817597759

In [71]:
df2 = df[(df[col] > low) & (df[col] < high)]
df2

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10,18.9


In [73]:
len(df)

506

In [74]:
len(df2)

498

In [75]:
# 分布の確認

In [76]:
cols = df.columns

In [77]:
cols

Index(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
       'x12', 'x13', 'y'],
      dtype='object')

In [78]:
_df = df
for col in cols:
    #3σ法の上限値を設定
    low = mean[col] -3 * sigma[col]
    high = mean[col] + 3 * sigma[col]
    #条件で絞り込み
    _df = _df[(_df[col] > low)&(_df[col]< high)] 

In [79]:
#オリジナル
len(df)

506

In [80]:
#3a法適用後
len(_df)

415

サンプルが減る場合の対処法
-  外れ値は取り除く
- 外れ値を平均もしくは中央値で埋める
- 主成分分析等を使って、潜在変数に変換した後に3σ法を適用　⇦　高度

## 入力変数と出力変数に分割

In [82]:
_df.head(3)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


In [85]:
X = _df.iloc[:,:-1]
y = _df.iloc[:, -1]

## 訓練データと検証データに分割

In [90]:
X_train,X_test,y_train,y_text = train_test_split(X,y, test_size=0.4,random_state =1)

## 重回帰分析

In [91]:
#モデルを宣言
model = LinearRegression()

In [92]:
#モデルの学習
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [93]:
#検証　⇦　訓練データ
model.score(X_train,y_train)

0.7972109224535133

In [95]:
#　検証　⇦　検証データ
model.score(X_test,y_test)

ValueError: Found input variables with inconsistent numbers of samples: [203, 166]

## スケーリング

In [96]:
from sklearn.preprocessing import StandardScaler

In [97]:
#scalerの宣言
scaler = StandardScaler()

In [99]:
# scalerの学習　⇦　平均と標準偏差を計算
scaler.fit(X_train)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [100]:
# scaling
X_train2 = scaler.transform(X_train)
X_test2 = scaler.transform(X_test)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [101]:
X_train2

array([[-0.45 , -0.471, -0.015, ...,  0.372,  0.067, -0.734],
       [-0.454, -0.471, -0.739, ...,  0.372,  0.381, -0.328],
       [-0.2  , -0.471,  1.295, ..., -1.755, -0.182, -0.757],
       ...,
       [ 0.494, -0.471,  1.074, ...,  0.844,  0.28 , -0.121],
       [-0.457,  0.61 , -0.589, ...,  0.088,  0.448, -0.883],
       [-0.473, -0.471, -0.739, ...,  0.372,  0.448, -0.541]])

In [102]:
# モデルの宣言
model = LinearRegression()

In [103]:
model.fit(X_train2,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [104]:
#検証　⇦　訓練データ
model.score(X_train2,y_train)

0.7972109224535133

In [105]:
model.score(X_test2,y_test)

ValueError: Found input variables with inconsistent numbers of samples: [203, 166]

In [106]:
#重みの確認
model.coef_

array([-0.195,  0.212,  0.508,  0.   , -1.213,  3.92 , -0.37 , -1.857,
        1.166, -1.529, -1.873,  0.244, -2.761])