In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#numeric_only=True

class SklModel():
    """Class-level helper bundling common pandas / scikit-learn steps.

    Every method is a ``classmethod`` and all state (loaded DataFrame,
    feature columns, target column, split / model parameters) is stored on
    the class itself, so the state is shared by every user of ``SklModel``.
    """

    # Path passed to the most recent set_filepath() call (None until then).
    filepath = None
    # DataFrame loaded by set_filepath() (callers may also assign it directly).
    dataframe = None
    # Feature column names; filled by set_columns() or, lazily, set_target().
    columns = None
    # Target column name; declared here so get_target() returns None instead
    # of raising AttributeError when set_target() has not been called yet.
    target_column = None
    # Keyword arguments forwarded to train_test_split() by split().
    test_parameters = {}
    # Keyword arguments forwarded to DecisionTreeClassifier by init_model().
    model_parameters = {}

    # step 1
    @classmethod
    def set_filepath(cls, filepath):
        """Read the CSV at ``filepath``, remember both, and return the DataFrame."""
        cls.filepath = filepath
        cls.dataframe = pd.read_csv(filepath)
        return cls.dataframe

    # step 2
    @classmethod
    def set_parameter(cls, parameters):
        """Store the keyword arguments used when constructing a model."""
        cls.model_parameters = parameters

    @classmethod
    def set_test_parameter(cls, parameters):
        """Store the keyword arguments forwarded to ``train_test_split``."""
        cls.test_parameters = parameters

    @classmethod
    def set_target(cls, target_column):
        """Record the target column name.

        If no feature columns have been set yet, every column of
        ``cls.dataframe`` except the target becomes the feature list
        (this requires ``cls.dataframe`` to be loaded already).
        """
        cls.target_column = target_column

        if cls.columns is None:
            features = cls.dataframe.drop(cls.target_column, axis=1).columns.tolist()
            cls.set_columns(features)

    @classmethod
    def get_target(cls):
        """Return the target column name, or ``None`` if it was never set."""
        return cls.target_column

    @classmethod
    def set_columns(cls, columns):
        """Store and return the list of feature column names."""
        cls.columns = columns
        return cls.columns

    @classmethod
    def get_columns(cls):
        """Return the stored feature column names (``None`` if unset)."""
        return cls.columns

    @classmethod
    def split(cls,*parrays):
        """Split the given arrays into train / test parts.

        All positional arguments are passed through to ``train_test_split``
        together with the keyword arguments stored in ``cls.test_parameters``.
        """
        return train_test_split(*parrays, **cls.test_parameters)

    @classmethod
    def fillna_meaning(cls, df) -> pd.DataFrame:
        """Return a copy of ``df`` with missing values replaced by column means.

        ``numeric_only=True`` restricts the mean to numeric columns so that a
        frame containing string / object columns does not raise; such columns
        are returned unchanged.
        """
        return df.fillna(df.mean(numeric_only=True))

    @classmethod
    def get_dummy_columns(cls, df:pd.DataFrame, columns:list, drop=0):
        """Replace each listed column by its integer dummy (one-hot) columns.

        Args:
            df: Source DataFrame (not modified).
            columns: Names of the columns to encode.
            drop: Truthy to drop the first dummy level of every column
                (``drop_first=True``), falsy to keep all levels.

        Returns:
            A new DataFrame: the original columns minus ``columns``, followed
            by the dummy columns in the order of ``columns``.
        """
        drop_first = bool(drop)
        dummies = [
            pd.get_dummies(df[column], drop_first=drop_first, dtype=int)
            for column in columns
        ]
        # Concatenate once instead of growing the frame inside the loop.
        encoded = pd.concat([df] + dummies, axis=1)
        return encoded.drop(columns, axis=1)

    @classmethod
    def ss_transform(cls, df, model=None):
        """Standardize ``df`` and return ``(transformed_data, scaler)``.

        When ``model`` is ``None`` a new ``StandardScaler`` is fitted on
        ``df``; otherwise ``model`` is used as-is (it is not refitted), which
        lets validation / test data reuse the scaler fitted on training data.
        """
        if model is None:
            sc_model_x = StandardScaler()
            sc_model_x.fit(df)
        else:
            sc_model_x = model

        sc_x = sc_model_x.transform(df)
        return sc_x, sc_model_x

    @classmethod
    def init_model(cls,model_name=""):
        """Create an unfitted estimator selected by ``model_name``.

        ``"DecisionTreeClassifier"`` is built with ``cls.model_parameters``;
        ``"LinearRegression"`` is built with its defaults (the stored
        parameters are not applied to it). Any other name returns ``None``.
        """
        if model_name == "DecisionTreeClassifier":
            return DecisionTreeClassifier(**cls.model_parameters)
        if model_name == "LinearRegression":
            return LinearRegression()
        return None



#8-1 Load the CSV
# import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# from DTC2 import DTC
import pandas as pd
%matplotlib inline

# set_filepath reads the CSV, stores it on SklModel.dataframe and returns it.
df=SklModel.set_filepath("../datafiles/Boston.csv")
# print(df.head(2))

#8-3 Inspect the CRIME column (value frequencies)
df['CRIME'].value_counts()

#8-4 Dummy-encode CRIME, dropping the first level (candidate for automation)
df2 = SklModel.get_dummy_columns(df, ['CRIME'], 1)

#8-5 Configure the split, keep a copy of the original frame on the class,
# swap in the dummy-encoded frame, then split it into train+validation / test.
SklModel.set_test_parameter({"test_size": 0.2, "random_state": 0})
SklModel.dataframe_org = SklModel.dataframe.copy()
SklModel.dataframe = df2
train_val,test = SklModel.split(df2)
train_val,test

#8-6 Count missing values per column
train_val.isnull().sum()

#8-7 Fill missing values with the column means (candidate for automation)
train_val2 = SklModel.fillna_meaning(train_val)
train_val2

#8-8 Outlier handling (candidate for automation)
# colname = train_val2.columns
# colname
# for name in colname[:13]:
# for name in colname:
#     train_val2.plot(kind = 'scatter', x = name, y = 'PRICE')

#8-8-2 Scatter plots of every column against PRICE (currently disabled)
import matplotlib.pyplot as plt
# fig = plt.figure(figsize=(8,10))
# colname = train_val2.columns
# # for name in colname:
# #     train_val2.plot(kind = 'scatter', x = name, y = 'PRICE')
# for n, col in enumerate(colname):
#     train_val2.plot(ax=fig.add_subplot(5,3,n+1), kind='scatter', x= col, y='PRICE', s=3)

# plt.tight_layout()
# plt.show()

#8-9 Find the index labels of rows containing outliers
# RM outliers: RM < 6 while PRICE > 40
out_line1 = train_val2[(train_val2['RM'] < 6) &
(train_val2['PRICE'] > 40)].index
# PTRATIO outliers: PTRATIO > 18 while PRICE > 40
out_line2 = train_val2[(train_val2['PTRATIO'] > 18) &
(train_val2['PRICE'] > 40)].index

# print(out_line1, out_line2)

#8-10 Remove the outlier row
# NOTE(review): the row label 76 is hard-coded and out_line1 / out_line2 are
# not used here; presumably 76 is what they report for this dataset - confirm.
train_val3 = train_val2.drop([76], axis = 0)

#8-11 Keep only the candidate feature columns plus the PRICE target
col = ['INDUS', 'NOX', 'RM', 'PTRATIO', 'LSTAT', 'PRICE']

train_val4 = train_val3[col]
train_val4.head(3)

#8-12 Correlation coefficients between every pair of columns
train_val4.corr()

#8-13 Correlation of each column with PRICE
train_cor = train_val4.corr()['PRICE']
train_cor

#8-16 Convert each coefficient to its absolute value
abs_cor = train_cor.map(abs)
abs_cor

#8-17 Sort in descending order
abs_cor.sort_values(ascending = False)

#8-18 Split into features / target, then into training and validation sets
col =SklModel.set_columns(['RM', 'LSTAT', 'PTRATIO'])
x = train_val4[col]
t = train_val4[['PRICE']]
# DTC.set_test_parameter({"test_size": 0.2, "random_state": 0})

# Uses the split parameters currently stored in SklModel.test_parameters.
x_train, x_val, y_train, y_val = SklModel.split(x,t)
x_train, x_val, y_train, y_val

#8-19 Standardize the training features
# sc_model_x = StandardScaler() # scaler for the training features x
# sc_model_x.fit(x_train)

# # standardize every column and assign the result to sc_x
# sc_x = sc_model_x.transform(x_train) # standardized x data
# sc_x # display
# ss_transform fits a new StandardScaler on x_train and returns (data, scaler).
sc_x, sc_model_x = SklModel.ss_transform(x_train)
sc_x, sc_model_x

#8-20 Make the result readable and check that the means are (almost) 0
# An array is hard to read, so convert it to a DataFrame
tmp_df = pd.DataFrame(sc_x, columns = x_train.columns)
# Compute the means
tmp_df.mean()

#8-21 Compute the standard deviations
tmp_df.std() # compute the standard deviation

#8-22 Standardize the target data (a separate scaler is fitted on y_train)
sc_y,sc_model_y = SklModel.ss_transform(y_train)

###---
# Decision tree (disabled)
# from sklearn import tree
# model = tree.DecisionTreeClassifier(max_depth = 3,
#     random_state = 0)
# model.fit(x_train, y_train)

#8-23 Train on the standardized data
model=SklModel.init_model("LinearRegression")
model.fit(sc_x,sc_y)

#8-24 Compute the coefficient of determination
# NOTE: x_val / y_val are NOT standardized here although the model was fitted
# on standardized data; the next step scores the standardized validation data.
model.score(x_val,y_val)

#8-25
# Reuse the scalers fitted on the training data for the validation data.
sc_x_val = sc_model_x.transform(x_val)
sc_y_val = sc_model_y.transform(y_val)
# Coefficient of determination on the standardized validation data
model.score(sc_x_val, sc_y_val)

#8-27 learn関数
def learn(x, t, test_size = 0.2, random_state = 0):
    """Fit a linear regression on standardized data and report R^2 scores.

    The data is split into training and validation parts, both features and
    target are standardized with scalers fitted on the training part only,
    and a ``LinearRegression`` model is trained on the standardized training
    data.

    Args:
        x: Feature data accepted by ``train_test_split`` / ``StandardScaler``
            (2-D, e.g. a DataFrame).
        t: Target data, 2-D (e.g. a single-column DataFrame) so that it can
            be passed to ``StandardScaler``.
        test_size: Forwarded to ``train_test_split``; defaults to the 0.2
            that was previously hard-coded.
        random_state: Forwarded to ``train_test_split``; defaults to the 0
            that was previously hard-coded.

    Returns:
        Tuple ``(train_score, val_score)`` with the coefficient of
        determination on the training and on the validation data.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        x, t, test_size = test_size, random_state = random_state)

    # Fit the scalers on the training data only, so no statistics of the
    # validation data are used during training.
    sc_model_x = StandardScaler()
    sc_model_y = StandardScaler()
    sc_x_train = sc_model_x.fit_transform(x_train)
    sc_y_train = sc_model_y.fit_transform(y_train)

    # Train
    model = LinearRegression()
    model.fit(sc_x_train, sc_y_train)

    # Standardize the validation data with the training scalers
    sc_x_val = sc_model_x.transform(x_val)
    sc_y_val = sc_model_y.transform(y_val)

    # Coefficient of determination for training and validation data
    train_score = model.score(sc_x_train, sc_y_train)
    val_score = model.score(sc_x_val, sc_y_val)
    return train_score, val_score

# 8-28 Run the learn function with the three base features
x = train_val3.loc[ :, ['RM', 'LSTAT', 'PTRATIO']]
t = train_val3[['PRICE']]
s1,s2 = learn(x, t)
print(s1, s2)

#8-29 Add the INDUS column to the features
x = train_val3.loc[ :, ['RM', 'LSTAT', 'PTRATIO','INDUS']]
t = train_val3[['PRICE']]
s1,s2 = learn(x, t)
print(s1, s2)

#8-30 Feature engineering: square RM (number of rooms)
x['RM'] ** 2

#8-31 Add the squared RM column to the features
# Add the RM-squared Series as a new column
x['RM2'] = x['RM'] ** 2
# INDUS was added in code 8-29, so remove it again
x = x.drop('INDUS', axis = 1)
x.head(2)

#8-33 Retrain
s1, s2 = learn(x, t)
print(s1, s2)

#8-34 Also add the squares of the LSTAT and PTRATIO columns to the features
# Add the square of the LSTAT column
x['LSTAT2'] = x['LSTAT'] ** 2
s1, s2 = learn(x, t)
print(s1, s2)

# Add the square of the PTRATIO column
x['PTRATIO2'] = x['PTRATIO'] ** 2
s1, s2 = learn(x, t)
print(s1, s2)

#8-35 Sample of element-wise Series addition
se1 = pd.Series([1, 2, 3])
se2 = pd.Series([10, 20, 30])
se1 + se2 # Series holding the sum of the corresponding elements

#8-36 Add an interaction feature (RM times LSTAT)
x['RM * LSTAT'] = x['RM'] * x['LSTAT']
x.head(2)

#8-37 Retrain once more
s1, s2 = learn(x, t)
print(s1, s2)

#8-38 Re-standardize, then retrain
# The model is retrained on the training and validation data combined,
# so the scalers are fitted again on the full x / t.
sc_model_x2 = StandardScaler()
sc_model_x2.fit(x)
sc_x = sc_model_x2.transform(x)

sc_model_y2 = StandardScaler()
sc_model_y2.fit(t)
sc_y = sc_model_y2.transform(t)
model = LinearRegression()
model.fit(sc_x, sc_y)

#8-38 Apply the same preprocessing to the test data
# Missing values are filled with the means of train_val, not of test itself.
test2 = test.fillna(train_val.mean()) # fill missing values with the mean
x_test = test2.loc[ :, ['RM','LSTAT', 'PTRATIO'] ]
y_test = test2[['PRICE']]

# Derived features are added in the same order as they were added to x above,
# so the column order matches what sc_model_x2 was fitted on.
x_test['RM2'] = x_test['RM'] ** 2
x_test['LSTAT2'] = x_test['LSTAT'] ** 2
x_test['PTRATIO2'] = x_test['PTRATIO'] ** 2

x_test['RM * LSTAT'] = x_test['RM'] * x_test['LSTAT']

# Reuse the scalers fitted on the training data; they are not refitted.
sc_x_test = sc_model_x2.transform(x_test)
sc_y_test = sc_model_y2.transform(y_test)

#8-40 Compute the coefficient of determination on the test data
model.score(sc_x_test, sc_y_test)



0.7175897572515981 0.7359028880290998
0.7190252930186809 0.7295535344941491
0.8456207631185566 0.8372526287986777
0.8565689444345094 0.8425282632102126
0.8643834988984441 0.8678022326740733
0.8668534967796697 0.8739347357775972




0.7649249353669053

In [None]:
# #8-41モデルの保存
# import pickle
# with open('boston.pkl',"wb") as f:
#     pickle.dump(model,f) #モデルの保存
# with open('boston_scx.pkl','wb') as f:
#     pickle.dump(sc_model_x2,f) #xの標準化モデルの保存
# with open('boston_scy.pkl','wb') as f:
#     pickle.dump(sc_model_y2,f) #yの標準化モデルの保存
