In [1]:
import os
import random

import numpy as np
import pandas as pd
import smogn
from sklearn.datasets import load_linnerud

In [2]:
# Root data folder and the CSV path the final dataset is written to.
DATA_PATH = 'data'
OUTPUT_CSV = os.path.join(
    DATA_PATH,
    'output',
    'linnerud_smogn_with_missing.csv',
)

In [3]:
# Load the Linnerud dataset and put its feature and target tables side by side.
df_linnerud1 = load_linnerud()
df_linnerud2 = pd.concat(
    [
        pd.DataFrame(df_linnerud1.data, columns=df_linnerud1.feature_names),
        pd.DataFrame(df_linnerud1.target, columns=df_linnerud1.target_names),
    ],
    axis=1,
)

# Move the Weight column to the very end so it can be used as the response variable.
y_column = "Weight"
X_columns = [name for name in df_linnerud2.columns.to_list() if name != y_column]
X = df_linnerud2[X_columns]
y = df_linnerud2[y_column]
df_linnerud3 = pd.concat([X, y], axis=1)

In [4]:
# Row count of the original data, and the size aimed for after oversampling (3x).
original_size = df_linnerud3.shape[0]
target_size = 3 * original_size

In [5]:
# Oversample with SMOGN.
# rel_thres is the knob to adjust to move the result toward the target size;
# 0.5 is the starting value.
#
# NOTE(review): this call passes no seed to smogn, so the global NumPy and
# stdlib generators are seeded here as a best-effort way to make the synthetic
# rows repeatable on "Restart & Run All" - confirm which generator smogn uses.
SMOGN_SEED = 42
np.random.seed(SMOGN_SEED)
random.seed(SMOGN_SEED)

rel_thres = 0.5  # starting value
data_resampled = smogn.smoter(
    data=df_linnerud3,        ## training set  (pandas dataframe)
    y='Weight',               ## 'Weight' is used as the response variable
    k = 3,                    ## num of neighs for over-sampling  (pos int)
    samp_method='extreme',    ## % over / under sample  ("balance" or "extreme")
    drop_na_col=True,         ## auto drop columns with nan's  (bool)
    drop_na_row=True,         ## auto drop rows with nan's  (bool)
    replace=False,            ## sampling replacement  (bool)
    rel_thres=rel_thres,      ## relevance threshold considered rare  (pos real)
    rel_method='auto',        ## relevance method  ("auto" or "manual")
    rel_xtrm_type = "both",   ## distribution focus  ("high", "low", "both")
    rel_coef = 1.5,           ## coefficient for box plot  (pos real)
    rel_ctrl_pts_rg = None    ## input for "manual" rel method  (2d array)
)

# Stack the original rows on top of the SMOGN output and renumber the index.
df_linnerud4 = pd.concat([df_linnerud3, data_resampled], axis=0).reset_index(drop=True)

dist_matrix: 100%|##################################################################| 3/3 [00:00<00:00, 2408.67it/s]
synth_matrix: 100%|##################################################################| 3/3 [00:00<00:00, 117.21it/s]
r_index: 100%|#######################################################################| 2/2 [00:00<00:00, 807.84it/s]


In [6]:
# Blank out roughly 5% of the values in the Weight column only (each row is
# selected independently with probability MISSING_RATE).
# A dedicated, seeded generator is used so that the same rows are selected on
# every run; re-running this cell therefore does not pile up additional NaNs
# on the already-modified frame.
MISSING_RATE = 0.05
MISSING_SEED = 42
rng = np.random.default_rng(MISSING_SEED)
mask = rng.random(len(df_linnerud4)) < MISSING_RATE
df_linnerud4.loc[mask, 'Weight'] = np.nan

In [7]:
# Peek at seven randomly chosen rows.
df_linnerud4.sample(n=7)

Unnamed: 0,Chins,Situps,Jumps,Waist,Pulse,Weight
49,15.0,225.0,73.0,33.0,54.0,156.0
9,17.0,251.0,250.0,33.0,56.0,
35,13.0,155.0,58.0,35.0,46.0,189.0
46,12.0,210.0,120.0,37.0,62.0,202.0
41,17.0,120.0,38.0,34.0,50.0,169.0
44,1.0,50.0,50.0,46.0,50.0,247.0
37,8.0,101.0,38.0,38.0,56.0,211.0


In [8]:
# Check the results: shape of each stage, then the share of missing values per column.
for label, frame in (
    ("元のデータ形状:", df_linnerud3),
    ("オーバーサンプリング後のデータ形状:", data_resampled),
    ("欠損値を含むデータ形状:", df_linnerud4),
):
    print(label, frame.shape)

print("\n欠損値の割合:")
print(df_linnerud4.isnull().mean())

元のデータ形状: (20, 6)
オーバーサンプリング後のデータ形状: (31, 6)
欠損値を含むデータ形状: (51, 6)

欠損値の割合:
Chins     0.000000
Situps    0.000000
Jumps     0.000000
Waist     0.000000
Pulse     0.000000
Weight    0.058824
dtype: float64


In [9]:
# Number of missing values in each column.
df_linnerud4.isnull().sum()

Chins     0
Situps    0
Jumps     0
Waist     0
Pulse     0
Weight    3
dtype: int64

In [10]:
# Save the data.
# Create the destination folder first: to_csv does not create missing parent
# directories, so the write would fail on a checkout without the output folder.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_linnerud4.to_csv(OUTPUT_CSV)

In [11]:
# Show the last five rows.
df_linnerud4.tail(n=5)

Unnamed: 0,Chins,Situps,Jumps,Waist,Pulse,Weight
46,12.0,210.0,120.0,37.0,62.0,202.0
47,4.0,60.0,25.0,37.0,54.0,
48,11.0,230.0,80.0,32.0,52.0,157.0
49,15.0,225.0,73.0,33.0,54.0,156.0
50,2.0,110.0,43.0,33.0,68.0,138.0


In [12]:
# Keep only the rows that have no missing value in any column, then display them.
df_linnerud5 = df_linnerud4.dropna()
df_linnerud5

Unnamed: 0,Chins,Situps,Jumps,Waist,Pulse,Weight
0,5.0,162.0,60.0,36.0,50.0,191.0
1,2.0,110.0,60.0,37.0,52.0,189.0
2,12.0,101.0,101.0,38.0,58.0,193.0
3,12.0,105.0,37.0,35.0,62.0,162.0
4,13.0,155.0,58.0,35.0,46.0,189.0
5,4.0,101.0,42.0,36.0,56.0,182.0
6,8.0,101.0,38.0,38.0,56.0,211.0
7,6.0,125.0,40.0,34.0,60.0,167.0
8,15.0,200.0,40.0,31.0,74.0,176.0
10,17.0,120.0,38.0,34.0,50.0,169.0


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Fit a random forest on every column but the last and predict the last one.
#
# NOTE: df_linnerud5 comes from the frame that stacks the original rows and the
# SMOGN output, and the train/test split happens after that resampling, so the
# score printed below is affected by leakage (see the remark that closes this
# notebook).
#
# The split and the forest are both seeded so the printed predictions and R^2
# are the same on every run.
MODEL_SEED = 42

x = df_linnerud5.iloc[:, :-1]   # all columns except the last one
y = df_linnerud5.iloc[:, -1]    # last column

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=MODEL_SEED
)

model = RandomForestRegressor(random_state=MODEL_SEED)
model.fit(x_train, y_train)
pred = model.predict(x_test)

print(pred)
print(r2_score(y_test, pred))

[209.78520093 158.15       246.48526259 209.37064105 189.00818231
 246.53022588 191.091768   190.34603293 158.15       190.34603293
 195.7592395  190.67408858 181.99737405 246.54798327 179.27      ]
0.9504078023763922


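- オーバーサンプリングすれば十分な精度は出るけど、これはリーケージしている結果なんだよねぇ。。。(´・ω・｀) 元データとそこから合成したサンプルを混ぜた後で train/test に分割しているため、テスト側に学習データとほぼ同じ情報が含まれてしまう。正しく評価するには、先に train/test に分割し、学習データだけをオーバーサンプリングする必要がある。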