In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import warnings 
warnings.filterwarnings(action="ignore")

# Read the labelled (train) and unlabelled (test) tables from disk.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Stack both tables row-wise (train rows first, original row labels kept) and
# remove the identifier and target columns so only feature columns remain.
# NOTE: the name `all` shadows Python's built-in all(); it is kept because the
# rest of this cell reads the combined frame under that name.
all = pd.concat([train_df, test_df]).drop(columns=["User_ID", "Health_Score"])


# EDA notes left by the original author (run ad hoc when needed):
#   print(all.info())      -> no missing values
#   print(all.describe())  -> check the summary statistics

#-- Preprocessing (scaling) --
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

cols = list(all.columns)

# Learn each column's min/max from the training rows only, then apply that
# mapping to every row. Fitting on the combined frame would let the test rows
# influence the preprocessing (data leakage). Training rows come first in
# `all`, so they are recovered positionally; scaled test values may fall
# slightly outside [0, 1], which is expected.
n_train_rows = len(train_df)
scaler.fit(all.iloc[:n_train_rows][cols])
all[cols] = scaler.transform(all[cols])

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Split the combined frame back into labelled and unlabelled rows; the
# training rows were stacked first, so the split is purely positional.
n_labelled = len(train_df)
X, test = all.iloc[:n_labelled], all.iloc[n_labelled:]
y = train_df['Health_Score']

# Hold out 20% of the labelled rows for a quick sanity check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Throwaway gradient-boosting fit, used only to read feature importances.
sample_model = GradientBoostingRegressor().fit(X_train, y_train)
pred = sample_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

# One-row table: one column per feature, holding that feature's importance.
important = sample_model.feature_importances_
import_df = pd.DataFrame(important[np.newaxis, :], columns=X.columns.tolist())
# Inspect with: print(import_df)

# Columns used later for the row-wise summary features.
# NOTE(review): presumably picked by hand from the importances above -- confirm.
candidate_col = ['Exercise_Score', 'Sleeping_Score']

#-- Preprocessing (add PCA columns) --
# Seeded so that a randomized SVD solver, if selected, gives repeatable output.
pca = PCA(n_components=2, random_state=42)

# Fit the projection on the training rows only (they come first in `all`),
# then project every row. Fitting on train+test would leak test data into the
# features. PCA does not modify its input, so no defensive copy is needed.
n_train_rows = len(train_df)
pca.fit(all.iloc[:n_train_rows])
pca_result = pca.transform(all)

pca_col = ['pca' + str(i) for i in range(pca_result.shape[-1])]
# Reuse the row labels of `all` so the column-wise concat lines rows up 1:1.
pca_df = pd.DataFrame(pca_result, columns=pca_col, index=all.index)
all = pd.concat((all, pca_df), axis=1)

#-- Preprocessing (add row-wise statistics of the important columns) --
all['mean'] = all[candidate_col].mean(axis=1)
all['min'] = all[candidate_col].min(axis=1)
all['max'] = all[candidate_col].max(axis=1)
all['std'] = all[candidate_col].std(axis=1)

#-- Modelling --
SEED = 42      # shared by the fold splitter and the stochastic models
N_SPLITS = 5   # number of cross-validation folds


def _take_rows(data, idx):
    """Select rows by position from a pandas object or a NumPy array."""
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def _fit_oof_folds(make_model, features, target, holdout, splitter, label):
    """Cross-validate one model type and collect its predictions.

    Args:
        make_model: zero-argument callable returning a fresh, unfitted model.
        features: training features (DataFrame or 2-D array).
        target: training target, aligned row-for-row with ``features``.
        holdout: rows to predict with every fold's model.
        splitter: cross-validation splitter exposing ``split`` and
            ``get_n_splits``.
        label: prefix used in the per-fold progress lines.

    Returns:
        ``(oof_pred, holdout_pred, fold_rmse)`` where ``oof_pred`` has shape
        ``(1, len(features))`` and holds each row's out-of-fold prediction,
        ``holdout_pred`` has shape ``(1, len(holdout))`` and is the mean of the
        fold models' predictions, and ``fold_rmse`` lists the per-fold RMSE.
    """
    # Derive the averaging weight from the splitter instead of hard-coding 5.
    n_folds = splitter.get_n_splits()
    oof_pred = np.zeros((1, len(features)))
    holdout_pred = np.zeros((1, len(holdout)))
    fold_rmse = []

    for i, (fit_idx, valid_idx) in enumerate(splitter.split(features, target)):
        model = make_model()
        model.fit(_take_rows(features, fit_idx), _take_rows(target, fit_idx))
        pred = model.predict(_take_rows(features, valid_idx))

        rmse = np.sqrt(mean_squared_error(_take_rows(target, valid_idx), pred))
        fold_rmse.append(rmse)
        # Each row is in exactly one validation fold, so this fills it once.
        oof_pred[0, valid_idx] = pred
        holdout_pred += model.predict(holdout) / n_folds
        print(f"{label}_{i}th rmse : {rmse}")
    print(f"Mean rmse : {np.mean(fold_rmse)}")
    return oof_pred, holdout_pred, fold_rmse


# Training rows were stacked first in `all`; split positionally.
X = all[:len(train_df)]
y = train_df['Health_Score']
test = all[len(train_df):]

# With an integer seed the splitter yields the same folds on every split()
# call, so all three stages below share identical folds.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Random-forest model (seeded so the submission is reproducible)
train_pred, test_pred, score = _fit_oof_folds(
    lambda: RandomForestRegressor(random_state=SEED), X, y, test, kf, "rf"
)

# Gradient-boosting model (seeded for the same reason)
train_pred2, test_pred2, score = _fit_oof_folds(
    lambda: GradientBoostingRegressor(random_state=SEED), X, y, test, kf, "gb"
)

# Ensemble: a linear model is trained on the two base models' out-of-fold
# predictions and applied to their averaged test predictions.
all_train = np.concatenate((train_pred.reshape(-1, 1), train_pred2.reshape(-1, 1)), axis=1)
all_test = np.concatenate((test_pred.reshape(-1, 1), test_pred2.reshape(-1, 1)), axis=1)

final_train, final_test, score = _fit_oof_folds(
    LinearRegression, all_train, y, all_test, kf, "final"
)

# Write the averaged ensemble predictions for the test rows.
answer = pd.DataFrame({'Health_Score': final_test[0]})

answer.to_csv('data/answer.csv', index=False)

In [None]:
# Template: load the train/test tables and stack them into one frame.
# NOTE(review): the right-hand sides of `train` and `test` are blank, so this
# cell is a syntax error until a data source is filled in.
train =
test = 
# Row-wise concatenation, train rows first.
# NOTE(review): `all` shadows Python's built-in all().
all = pd.concat((train, test), axis=0)
# The column list is empty, so nothing is dropped yet (placeholder).
all = all.drop(columns=[])

# DataFrame.info() writes its report itself and returns None, so the outer
# print() additionally emits "None".
print(all.info())

#-- Preprocessing
# Split column names by dtype: object-dtype columns vs. everything else.
categorical = [col for col in all.columns if all[col].dtype == 'O']
numerical = [col for col in all.columns if all[col].dtype != 'O']

# Prints the full object-dtype sub-frame.
print(all[categorical])
# Preprocess string columns
# NOTE(review): placeholder line -- the column name is an empty string, the
# names dkf/djffj are not defined in the visible code, and the literal mixes
# `key: value` with a bare item, which is not valid dict syntax.
all[''] = all[''].replace({dkf : djffj, dkf, })

# Label / one-hot encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
# Columns to encode; empty in this template, so the loop body never runs.
cols = []
onehot = OneHotEncoder()

for  col in cols:
    # NOTE(review): LabelEncoder is imported but never used; this step
    # presumably was meant to label-encode the column -- confirm. As written
    # it feeds a flat list to OneHotEncoder, and `list(...).values` on the
    # next line would fail because a list has no `.values` attribute.
    onehot.fit(list(all[col].values))
    all[col] = onehot.fit_transform(list(all[col]).values)
    
    # NOTE(review): the comparison has no right-hand side (syntax error);
    # placeholder for the name of the column to one-hot encode.
    if col == :
        onehot = OneHotEncoder()
        # NOTE(review): `fit` is not defined and is subscripted rather than
        # called; presumably `onehot.fit_transform(...)` was intended -- confirm.
        onehot = fit[all[col].values.reshape(-1,1)].toarray()
        # NOTE(review): `range` is missing its call parentheses (syntax error);
        # the '' prefix is a placeholder for the new column-name stem.
        onehot_col = ['' + str(i) for i in range onehot.shape[-1]]
        onehot_df = pd.DataFrame(onehot, columns=onehot_col)
        # NOTE(review): the index is reset only after the column-wise concat,
        # so rows are aligned on the pre-reset labels of `all` -- verify that
        # this lines up with onehot_df's default 0..n-1 index.
        all= pd.concat((all, onehot_df), axis=1).reset_index(drop=True)
        
# Preprocess numeric columns
# Prints the summary-statistics table of the combined frame.
print(all.describe())

# Natural-log transform of one column, written back in place.
# NOTE(review): the column name is an empty-string placeholder.
all[''] = np.log(all[''])

# Apply the scaler
scaler = MinMaxScaler()
# NOTE(review): empty placeholder list -- fill in the columns to scale before
# running; as written the scaler is handed a selection with no columns.
cols = []
all[cols] = scaler.fit_transform(all[cols])

#-- Feature engineering
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Split the combined frame back by position: the first len(train) rows are the
# training rows, the remainder the test rows.
X = all[:len(train)]
# NOTE(review): the target column name is an empty-string placeholder.
y = train['']
test = all[len(train):]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): placeholder -- `()` is an empty tuple, so `.fit()` below would
# raise AttributeError; an estimator and its fit arguments still need to be
# filled in.
sample_model = ()
sample_model.fit()

# One-row table: one column per feature, holding that feature's importance.
important = sample_model.feature_importances_
important_df = pd.DataFrame(important.reshape(1, -1), columns=[col for col in X.columns])

print(important_df)

# Placeholder: columns to use for derived features (empty for now).
candidate_col = []

# NOTE(review): `n_components` is passed as a bare name that is not defined in
# the visible code -- supply a value (e.g. as a keyword argument).
pca  = PCA(n_components)