# 제 1 유형

> 데이터
* https://www.kaggle.com/adityakadiwal/water-potability
* 수질 음용성 여부(Potability: 0, 1)

In [1]:
import pandas as pd

# Load the water-potability training split directly from the public GitHub mirror.
df = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/train.csv")

## Q1.
> ph 값은 상당히 많은 결측치를 포함한다. 결측치를 제외한 나머지 데이터들 중 사분위값 기준 하위 25%의 값들의 평균값은?

In [3]:
# Count missing values per column to see which features need handling.
df.isna().sum()

ph                 395
Hardness             0
Solids               0
Chloramines          0
Sulfate            617
Conductivity         0
Organic_carbon       0
Trihalomethanes    132
Turbidity            0
Potability           0
dtype: int64

In [9]:
# Keep only the observed (non-missing) ph measurements.
ph_column = df["ph"]
clean_ph = ph_column[ph_column.notna()]

In [15]:
# Q1 answer: mean of the ph values at or below the first quartile (25th percentile).
ph_q1 = clean_ph.quantile(0.25)
lower_quarter = clean_ph[clean_ph <= ph_q1]
print(lower_quarter.mean())

5.057093462441731


# 제 2 유형

> 데이터
* https://www.kaggle.com/adityakadiwal/water-potability
* 수질 음용성 여부(Potability: 0, 1)
* 평가지표: f1-score

In [16]:
import pandas as pd
# Training split (features + Potability label) and test split for the modelling task,
# both read from the public GitHub mirror.
train_set = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/train.csv")
test_set = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/waters/test.csv")

## 데이터 살펴보기

In [18]:
# Preview the first rows of the training data.
train_set.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.91815,214.186611,23823.492888,7.290878,341.173322,411.424483,19.585002,25.057375,4.028958,0
1,5.422446,205.26608,18542.957451,5.491963,306.702227,382.080129,10.504023,67.49345,2.911751,1
2,7.341547,187.672402,21273.457066,7.784003,,332.084293,16.842334,55.019151,4.025644,0
3,9.056245,197.666301,17403.532167,7.688917,337.460176,414.766631,15.349869,63.696746,3.319354,0
4,5.039374,142.860598,40829.353167,7.271543,,386.803057,16.823773,52.297113,4.95742,0


In [20]:
# Column dtypes and non-null counts for the training data.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2620 entries, 0 to 2619
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2225 non-null   float64
 1   Hardness         2620 non-null   float64
 2   Solids           2620 non-null   float64
 3   Chloramines      2620 non-null   float64
 4   Sulfate          2003 non-null   float64
 5   Conductivity     2620 non-null   float64
 6   Organic_carbon   2620 non-null   float64
 7   Trihalomethanes  2488 non-null   float64
 8   Turbidity        2620 non-null   float64
 9   Potability       2620 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 204.8 KB


In [21]:
# Column dtypes and non-null counts for the test data.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               549 non-null    float64
 1   Hardness         656 non-null    float64
 2   Solids           656 non-null    float64
 3   Chloramines      656 non-null    float64
 4   Sulfate          504 non-null    float64
 5   Conductivity     656 non-null    float64
 6   Organic_carbon   656 non-null    float64
 7   Trihalomethanes  618 non-null    float64
 8   Turbidity        656 non-null    float64
dtypes: float64(9)
memory usage: 46.2 KB


In [23]:
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
# Pairwise plots over all columns of the training data.
sns.pairplot(train_set)

Output hidden; open in https://colab.research.google.com to view.

## 전처리 함수 작성

In [51]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

def my_preprocess(org_df, target="train", imputers=None, scalers=None):
    """Impute missing values, then standardize the features.

    Parameters
    ----------
    org_df : DataFrame-like
        Feature table. It is copied first, so the caller's object is not modified.
    target : {"train", "test"}
        "train" fits a new IterativeImputer and StandardScaler on ``org_df``.
        "test" only applies the already-fitted objects passed in.
    imputers : fitted imputer or None
        Required when ``target == "test"``; ignored for "train".
    scalers : fitted scaler or None
        Required when ``target == "test"``; ignored for "train".

    Returns
    -------
    For "train": ``(transformed, fitted_imputer, fitted_scaler)``.
    For "test": ``transformed`` only.
    ``transformed`` is whatever the scaler's transform returns.

    Raises
    ------
    ValueError
        If ``target`` is neither "train" nor "test", or if ``target == "test"``
        and either ``imputers`` or ``scalers`` is missing.
    """
    if target not in ("train", "test"):
        # Previously an unknown target fell through and silently returned None.
        raise ValueError(f"target must be 'train' or 'test', got {target!r}")

    df = org_df.copy()

    if target == "train":
        imp = IterativeImputer(random_state=8)
        sc = StandardScaler()
        # Impute first so the scaler never sees missing values.
        df = sc.fit_transform(imp.fit_transform(df))
        return df, imp, sc

    # target == "test": reuse the statistics learned on the training data.
    if imputers is None or scalers is None:
        raise ValueError("target='test' requires fitted `imputers` and `scalers`")
    df = imputers.transform(df)
    df = scalers.transform(df)
    return df

## 종속변수 분리 및 검증, 훈련 분할(K fold)

In [29]:
# Separate the label from the features.
y = train_set["Potability"]
X = train_set.drop(columns="Potability")

In [49]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified split; shuffling is seeded so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)

# One list entry per fold, holding that fold's train / validation slices.
X_trains, X_vals = [], []
y_trains, y_vals = [], []
for train_idx, val_idx in skf.split(X, y):
    X_trains.append(X.iloc[train_idx])
    y_trains.append(y.iloc[train_idx])
    X_vals.append(X.iloc[val_idx])
    y_vals.append(y.iloc[val_idx])

## 전처리 시행

In [52]:
# Fit the imputer/scaler on each fold's training part only, then apply the
# fitted objects to that fold's validation part.
X_trains_proc = []
X_vals_proc = []
for fold_train, fold_val in zip(X_trains, X_vals):
    train_temp, imputer, scaler = my_preprocess(fold_train, target="train")
    val_temp = my_preprocess(fold_val, target="test", imputers=imputer, scalers=scaler)
    X_trains_proc.append(train_temp)
    X_vals_proc.append(val_temp)

## 분석

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import tensorflow as tf

from sklearn.metrics import f1_score, confusion_matrix

### 로지스틱 회귀

In [79]:
# Logistic-regression baseline: fit on each fold's training part, score F1 on its validation part.
fold_scores_lr = []
for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
    model_lr = LogisticRegression()
    model_lr.fit(X_tr, y_tr)
    pred = model_lr.predict(X_va)
    fold_scores_lr.append(f1_score(y_va, pred))

# Mean F1 over the folds; kept as the last expression so the notebook displays it.
sum(fold_scores_lr) / len(fold_scores_lr)

0.007692846306453495

### 랜덤포레스트

In [87]:
# Sweep max_depth from 15 through 25 and report the mean cross-validated F1 for each.
for k in range(15, 26):
    fold_scores_rf = []
    for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
        model_rf = RandomForestClassifier(max_depth=k, random_state=8)
        model_rf.fit(X_tr, y_tr)
        pred = model_rf.predict(X_va)
        fold_scores_rf.append(f1_score(y_va, pred))

    print(k, sum(fold_scores_rf) / len(fold_scores_rf))

15 0.3954451928801624
16 0.3965446531955966
17 0.3971904441265722
18 0.4009408717805198
19 0.41169978008741615
20 0.4020184016824396
21 0.4171022495037174
22 0.4187595960006666
23 0.408518330991493
24 0.41783447759751624
25 0.40328056523564176


In [89]:
# Re-run the random forest with max_depth=22, the depth chosen from the sweep above.
fold_scores_rf = []
for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
    model_rf = RandomForestClassifier(max_depth=22, random_state=8)
    model_rf.fit(X_tr, y_tr)
    pred = model_rf.predict(X_va)
    fold_scores_rf.append(f1_score(y_va, pred))

print(sum(fold_scores_rf) / len(fold_scores_rf))

0.4187595960006666


### AdaBoost

In [90]:
# AdaBoost with default hyper-parameters (seeded), evaluated by mean F1 over the folds.
fold_scores_ab = []
for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
    model_ab = AdaBoostClassifier(random_state=8)
    model_ab.fit(X_tr, y_tr)
    pred = model_ab.predict(X_va)
    fold_scores_ab.append(f1_score(y_va, pred))

print(sum(fold_scores_ab) / len(fold_scores_ab))

0.27471107757148133


### XGBoost

In [92]:
# Sweep max_depth from 2 through 25 and report the mean cross-validated F1 for each.
for k in range(2, 26):
    fold_scores_xgb = []
    for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
        model_xgb = XGBClassifier(max_depth=k, random_state=8)
        model_xgb.fit(X_tr, y_tr)
        pred = model_xgb.predict(X_va)
        fold_scores_xgb.append(f1_score(y_va, pred))

    print(k, sum(fold_scores_xgb) / len(fold_scores_xgb))

2 0.3876987188829146
3 0.41873937145254975
4 0.43826475465710785
5 0.45740291351315465
6 0.45703861597326895
7 0.46921973701411535
8 0.47162420702389
9 0.4648410268134027
10 0.4611820448260445
11 0.47869958421796766
12 0.4762152667980962
13 0.4786136047951216
14 0.4836301280257154
15 0.4712391767974154
16 0.46434202656664647
17 0.47302057571585115
18 0.46573236020613235
19 0.4738342297199735
20 0.4758392993889752
21 0.4684863959574609
22 0.4779189186198803
23 0.47076488407524747
24 0.4737895842729021
25 0.4773520345939361


In [94]:
# Re-run XGBoost with max_depth=14, the depth chosen from the sweep above.
fold_scores_xgb = []
for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
    model_xgb = XGBClassifier(max_depth=14, random_state=8)
    model_xgb.fit(X_tr, y_tr)
    pred = model_xgb.predict(X_va)
    fold_scores_xgb.append(f1_score(y_va, pred))

print(sum(fold_scores_xgb) / len(fold_scores_xgb))

0.4836301280257154


### KNN

In [101]:
# Sweep odd neighbour counts 1, 3, ..., 33 and report the mean cross-validated F1 for each.
for k in range(1, 35, 2):
    fold_scores_knn = []
    for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
        model_knn = KNeighborsClassifier(n_neighbors=k)
        model_knn.fit(X_tr, y_tr)
        pred = model_knn.predict(X_va)
        fold_scores_knn.append(f1_score(y_va, pred))

    print(k, sum(fold_scores_knn) / len(fold_scores_knn))

1 0.46655246714783977
3 0.4423416965238178
5 0.43569844570300553
7 0.4438558767286809
9 0.4152574767372605
11 0.4065406851342185
13 0.3857146571794733
15 0.3635718825861377
17 0.35259099231221275
19 0.3483191649738729
21 0.32253995947015957
23 0.3223357326491107
25 0.3152902633231158
27 0.2996685435098131
29 0.29125788375168626
31 0.28308275135610794
33 0.29291646402680377


In [102]:
# Re-run KNN with n_neighbors=1, the setting chosen from the sweep above.
fold_scores_knn = []
for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
    model_knn = KNeighborsClassifier(n_neighbors=1)
    model_knn.fit(X_tr, y_tr)
    pred = model_knn.predict(X_va)
    fold_scores_knn.append(f1_score(y_va, pred))

print(sum(fold_scores_knn) / len(fold_scores_knn))

0.46655246714783977


### SVM

In [111]:
# RBF-kernel SVM with C=1, evaluated by mean F1 over the folds.
fold_scores_svm = []
for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
    model_svm = SVC(C=1, kernel="rbf")
    model_svm.fit(X_tr, y_tr)
    pred = model_svm.predict(X_va)
    fold_scores_svm.append(f1_score(y_va, pred))

print(sum(fold_scores_svm) / len(fold_scores_svm))

0.3928048681535756


### ANN

In [117]:
def build_ann():
    """Build and compile a small fully-connected binary classifier.

    Architecture: three (Dense(64, relu) -> Dropout(0.2)) stages followed by a
    single sigmoid output unit. Compiled with the Adam optimizer, binary
    cross-entropy loss, and accuracy as the tracked metric.

    Returns a freshly built, compiled ``tf.keras.Sequential`` model.
    """
    keras_layers = tf.keras.layers

    stack = []
    for _ in range(3):
        stack.append(keras_layers.Dense(64, activation="relu"))
        stack.append(keras_layers.Dropout(0.2))
    stack.append(keras_layers.Dense(1, activation="sigmoid"))

    network = tf.keras.Sequential(stack)
    network.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["accuracy"],
    )
    return network

In [121]:
# Train a fresh network per fold and score F1 on that fold's validation part.
# NOTE(review): no TensorFlow seed is set in this cell, so the score may vary between runs.
# NOTE(review): early stopping does not restore the best weights, so the model used
# for prediction is the one from the last epoch run — confirm this is intended.
fold_scores_ann = []
for X_tr, y_tr, X_va, y_va in zip(X_trains_proc, y_trains, X_vals_proc, y_vals):
    model_ann = build_ann()
    early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
    model_ann.fit(
        X_tr,
        y_tr,
        batch_size=32,
        epochs=150,
        validation_split=0.2,  # Keras holds out part of the training data to monitor val_loss
        callbacks=[early_stop],
        verbose=0,
    )
    pred = model_ann.predict(X_va)
    # Sigmoid outputs are probabilities; rounding turns them into 0/1 labels.
    fold_scores_ann.append(f1_score(y_va, pred.round()))

print(sum(fold_scores_ann) / len(fold_scores_ann))

0.4748505325983528


## XGBoost의 성능이 가장 좋음

In [123]:
# Refit the preprocessing on the full training features, then apply it to the test set.
train_proc, imp, sc = my_preprocess(X, target="train")
test_proc = my_preprocess(test_set, target="test", imputers=imp, scalers=sc)

# Train the selected model (XGBoost, max_depth=14) on all training data and predict the test labels.
model_xgb = XGBClassifier(max_depth=14, random_state=8)
model_xgb.fit(train_proc, y)
test_pred = model_xgb.predict(test_proc)

In [126]:
# Write the test predictions to disk.
# NOTE(review): this also writes the row index and a column header of "0";
# confirm that is the expected submission format.
submission = pd.DataFrame(test_pred)
submission.to_csv("submission_pred.csv")