<font size=3>**IMPORTID**</font>

In [490]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from time import time
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.svm import SVC
from sklearn.ensemble import HistGradientBoostingClassifier as HGBC
from sklearn.metrics import confusion_matrix
import winsound

<font size=3>**FUNKTSIOONID**</font>

In [491]:
def PlotFeatures(dataframe, sub_rows):
    """Plot each feature as a value-sorted scatter coloured by 2021 compliance.

    One panel is drawn for every column except the last one. Inside a panel
    the rows are sorted by the feature value and plotted against their rank:
    rows whose ``compliance_2021`` equals 1 are green dots, all other rows are
    smaller red dots.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to visualise; must contain a ``compliance_2021`` column.
    sub_rows : int
        Number of subplot rows. The grid is ``sub_rows x 2`` and has to offer
        at least ``len(dataframe.columns) - 1`` panels.
    """
    cols = dataframe.columns
    labels = dataframe["compliance_2021"].to_numpy()

    # squeeze=False keeps ``axs`` two-dimensional even for sub_rows == 1, so
    # the [row, column] indexing below always works.
    fig, axs = plt.subplots(sub_rows, 2, figsize=(20, 200), squeeze=False)
    for i in range(len(cols) - 1):
        df = pd.DataFrame({"value": dataframe[cols[i]].to_numpy(), "result": labels})
        df = df.sort_values(by="value")

        rank = np.arange(df.shape[0])
        value = df["value"].to_numpy()
        positive = (df["result"] == 1).to_numpy()

        # Two vectorised plot() calls per panel instead of one call per row;
        # green markers are drawn last so they stay visible on top.
        ax = axs[i // 2, i % 2]
        ax.plot(rank[~positive], value[~positive], "ro", markersize=3)
        ax.plot(rank[positive], value[positive], "go")
        ax.set_title(cols[i])

    plt.show()

In [492]:
def data_cleaning(file):
    """Load a CSV, blank out over-limit measurements and mean-impute the gaps.

    The 2019/2020 compliance columns and the Enterococci / Escherichia-coli
    columns are discarded, ``station_id`` is split off, and every remaining
    column except the last one is compared with a per-quantity threshold
    (consecutive column pairs share one threshold). Values above the
    threshold become NaN; finally all NaN are replaced by the column mean.

    Parameters
    ----------
    file : str or path-like
        CSV file to read.

    Returns
    -------
    list
        ``[cleaned_dataframe, station_id_series]``.
    """
    unused_columns = [
        "compliance_2019", "compliance_2020",
        "Enterococci_2019", "Enterococci_2020",
        "Escherichia-coli-Colilert_2019", "Escherichia-coli-Colilert_2020",
        "Escherichia-coli_2019", "Escherichia-coli_2020",
    ]
    table = pd.read_csv(file).drop(columns=unused_columns)

    station_id = table["station_id"]
    table = table.drop(columns="station_id")

    # Upper thresholds, one per measured quantity (shared by its two columns).
    limit = [60, 1.5, 1.5, 300, 10, 30, 350, 8, 40, 3000, 1.75, 2000, 400, 20, 0.15, 10, 5, 3, 300, 80, 4, 10, 20, 9]

    # The last column is left untouched by the threshold check.
    for position, name in enumerate(table.columns[:-1]):
        readings = np.array(table[name].values.tolist())
        too_high = readings > limit[position // 2]
        table[name] = np.where(too_high, np.nan, readings).tolist()

    table = table.fillna(table.mean())

    return [table, station_id]

In [493]:
def data_cleaning_2(file):
    """Load a CSV and replace over-limit measurements with NaN (no imputation).

    Unlike ``data_cleaning`` this keeps the ``compliance_*`` columns, drops the
    ``Color-Pt-Co-unit`` columns instead, and leaves the NaN values in place.

    The measurement columns are taken to be all columns that precede the
    trailing ``compliance*`` label columns. The number of label columns is
    counted rather than hard-coded as three, so a file that has no
    ``compliance_2021`` column still gets its last measurement column checked.
    NOTE(review): this assumes the label columns are the last columns of the
    file, as the original ``len(cols) - 3`` did — confirm against the CSVs.

    Parameters
    ----------
    file : str or path-like
        CSV file to read.

    Returns
    -------
    list
        ``[dataframe_with_nan, station_id_series]``.
    """
    df = pd.read_csv(file).drop(columns=["Color-Pt-Co-unit_2019", "Color-Pt-Co-unit_2020",
                                        "Enterococci_2019", "Enterococci_2020", "Escherichia-coli-Colilert_2019",
                                        "Escherichia-coli-Colilert_2020", "Escherichia-coli_2019", "Escherichia-coli_2020"])

    station_id = df["station_id"]
    df = df.drop(columns="station_id")

    cols = df.columns

    # Upper thresholds, one per measured quantity (shared by its two columns).
    limit = [60, 1.5, 1.5, 300, 10, 30, 350, 40, 3000, 1.75, 2000, 400, 20, 0.15, 10, 5, 3, 300, 80, 4, 10, 20, 9]

    n_labels = sum(str(name).startswith("compliance") for name in cols)

    for i in range(len(cols) - n_labels):
        col = df[cols[i]].to_numpy(dtype=float)
        df[cols[i]] = np.where(col > limit[i // 2], np.nan, col)

    return [df, station_id]

In [494]:
def data_balance(file, n_negative=132, random_state=None):
    """Load a CSV, down-sample the negative class, then clean and mean-impute.

    All rows with ``compliance_2021 == 1`` are kept and ``n_negative`` rows
    with ``compliance_2021 == 0`` are drawn at random. The combined rows are
    put back into their original order and then treated like in
    ``data_cleaning``: over-limit values become NaN and NaN are replaced by
    the column mean of the balanced table.

    Parameters
    ----------
    file : str or path-like
        CSV file to read.
    n_negative : int, default 132
        Number of negative rows to sample (previously hard-coded).
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; ``None`` keeps the sampling
        non-deterministic as before.

    Returns
    -------
    list
        ``[balanced_dataframe, station_id_series]``.

    Raises
    ------
    ValueError
        Raised by pandas when fewer than ``n_negative`` negative rows exist.
    """
    df = pd.read_csv(file).drop(columns=["compliance_2019", "compliance_2020",
                                         "Enterococci_2019", "Enterococci_2020", "Escherichia-coli-Colilert_2019",
                                         "Escherichia-coli-Colilert_2020", "Escherichia-coli_2019", "Escherichia-coli_2020"])

    positive_cases = df[df["compliance_2021"] == 1]
    negative_sample = df[df["compliance_2021"] == 0].sample(n_negative, random_state=random_state)
    df_bal = pd.concat([positive_cases, negative_sample]).sort_index()

    station_id = df_bal["station_id"]
    df_bal = df_bal.drop(columns="station_id")

    cols = df_bal.columns

    # Upper thresholds, one per measured quantity (shared by its two columns).
    limit = [60, 1.5, 1.5, 300, 10, 30, 350, 8, 40, 3000, 1.75, 2000, 400, 20, 0.15, 10, 5, 3, 300, 80, 4, 10, 20, 9]

    # The last column is left untouched by the threshold check.
    for i in range(len(cols) - 1):
        col = np.array(df_bal[cols[i]].values.tolist())
        df_bal[cols[i]] = np.where(col > limit[i // 2], np.nan, col).tolist()

    df_bal = df_bal.fillna(df_bal.mean())

    return [df_bal, station_id]

In [568]:
def gen_comp_21(dataframe, comp_19, comp_20):
    """Extrapolate a 2021 feature table from the 2019 and 2020 measurements.

    After ``compliance_2019`` / ``compliance_2020`` are removed, the columns
    of ``dataframe`` are read in consecutive pairs (2019 value, 2020 value).
    For every pair and row:

    * both values missing  -> NaN
    * only 2019 missing    -> the 2020 value
    * only 2020 missing    -> the 2019 value
    * both present         -> ``v20 + (v20 - v19)``, floored at 0

    Rows are handled by position; the result is indexed ``0 .. n_rows - 1``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Year-suffixed table containing ``compliance_2019`` and
        ``compliance_2020``; ``compliance_2021`` is optional.
    comp_19 : pandas.DataFrame
        Only its column names are used: all but the last name the output
        feature columns, in order.
    comp_20 : pandas.DataFrame
        Unused; kept so existing calls stay valid.

    Returns
    -------
    pandas.DataFrame
        Float feature columns plus a ``compliance`` column copied from
        ``compliance_2021`` when that column exists; otherwise any
        ``compliance`` column is removed.
    """
    cols_21 = comp_19.columns
    has_target = "compliance_2021" in dataframe.columns

    features = dataframe.drop(columns=["compliance_2019", "compliance_2020"])
    n_rows = features.shape[0]

    estimates = {}
    for i in range(len(cols_21) - 1):
        data_19 = features.iloc[:, i * 2].to_numpy(dtype=float)
        data_20 = features.iloc[:, i * 2 + 1].to_numpy(dtype=float)
        missing_19 = np.isnan(data_19)
        missing_20 = np.isnan(data_20)

        # Linear continuation of the 2019 -> 2020 change, never below zero.
        trend = np.maximum(data_20 + (data_20 - data_19), 0)

        # If 2019 is missing the 2020 value is used as is (NaN when both are
        # missing); if only 2020 is missing the 2019 value is used.
        estimates[cols_21[i]] = np.where(
            missing_19, data_20, np.where(missing_20, data_19, trend)
        )

    # The last name of cols_21 is not a feature; keep it as an empty column so
    # the column layout matches comp_19 until the label is filled in below.
    if len(cols_21) > 0:
        estimates[cols_21[-1]] = np.full(n_rows, np.nan)

    comp_21 = pd.DataFrame(estimates, index=np.arange(n_rows))

    if has_target:
        comp_21["compliance"] = dataframe["compliance_2021"].to_numpy()
    else:
        comp_21 = comp_21.drop(columns="compliance", errors="ignore")

    return comp_21

In [531]:
def sep_years(dataframe):
    """Split a year-suffixed table into one table per year.

    ``compliance_2021`` is removed if present. The remaining columns are
    treated as consecutive pairs: the first column of each pair goes to the
    first output table, the second to the second. The last five characters
    (e.g. ``_2019``) are stripped from every column name of both outputs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose columns alternate between the two years.

    Returns
    -------
    list
        ``[comp_19, comp_20]`` with identical, suffix-free column names.
    """
    frame = dataframe.drop(columns="compliance_2021", errors="ignore")

    names = list(frame.columns)
    pair_count = len(names) // 2
    second_of_pair = [names[2 * k + 1] for k in range(pair_count)]
    first_of_pair = [names[2 * k] for k in range(pair_count)]

    comp_19 = frame.drop(columns=second_of_pair)
    comp_20 = frame.drop(columns=first_of_pair)

    # Strip the trailing "_YYYY" so both tables share the same column names.
    comp_19 = comp_19.rename(columns={name: name[:-5] for name in comp_19.columns})
    comp_20 = comp_20.rename(columns={name: name[:-5] for name in comp_20.columns})

    return [comp_19, comp_20]

<font size=6>**1. KATSE**</font>

<font size=3>**ANDMED**</font>

<font size=3>**ANDMETE KUJUTUS**</font>

<font size=3>**TRAIN SETS**</font>

<font size=3>**PARIM RFC**</font> (runtime ~ 1 min) <br>
<font size=1>RFC(max_depth = 18, min_samples_split = 2, min_samples_leaf = 4) <br>
    acc = 84.5</font>

<font size=3>**TESTIMINE**</font>

<font size=6>**2. KATSE**</font>

<font size=3>**ANDMED**</font>

<font size=3>**ANDMETE KUJUTUS**</font>

<font size=3>**TRAIN SETS**</font>

<font size=3>**TESTIMINE**</font>

<font size=6>**3. KATSE**</font>

<font size=3>**ANDMED**</font>

<font size=3>**TRAIN SETS**</font>

<font size=3>**TESTIMINE**</font>

<font size=6>**4. KATSE**</font>

<font size=3>**ANDMED**</font>

<font size=3>**TRAIN SETS**</font>

<font size=3>**ERINEVAD MUDELID**</font>

<font size=2>**RFC**</font> (runtime ~ 6.2 h) <br>
<font size=1>RFC(criterion = "gini", n_estimators = 10, max_depth = 13, min_samples_split = 2, min_samples_leaf = 3) <br>
    acc = 85.9</font>

<font size=2>**KNN**</font>  (runtime ~ 8 min)<br>
<font size=1>KNN(n_neighbors = 13, metric = "euclidean", leaf_size = 28) <br>
    acc = 85.2</font>

<font size=2>**DTC**</font> (runtime ~ 46 s)<br>
<font size=1>DTC(criterion = "gini", max_depth = 8, min_samples_split = 9, min_samples_leaf = 2) <br>
    acc = 82.1</font>

<font size=2>**SVC (auto)**</font> (runtime ~ 1 min) <br>
<font size=1>acc = 73</font>

<font size=2>**SVC (scale)**</font> (runtime ~ 1 s)<br>
<font size=1>acc = 84.7</font>

<font size=6>**5. KATSE**</font>

<font size=3>**ANDMED**</font>

<font size=3>**ANDMETE KUJUTUS**</font>

<font size=3>**TRAIN SETS**</font>

<font size=2>**HGBC**</font> (runtime ~ 48 s) <br>
<font size=1>HGBC(learning_rate = 0.01, max_iter = 100) <br>
    acc = 85</font>

<font size=2>**HGBC**</font> (runtime ~ 10 s) <br>
<font size=1>HGBC(learning_rate = 0.01, max_iter = 100, max_depth = 5) <br>
    acc = 85.2</font>

<font size=3>**TESTIMINE**</font>

<font size=6>**6. KATSE**</font>

<font size=3>**ANDMED**</font>

<font size=3>**ANDMETE KUJUTUS**</font>

<font size=3>**TRAIN SETS**</font>

<font size=2>**HGBC**</font> (runtime ~ 7 s) <br>
<font size=1>HGBC(learning_rate = 0.01) <br>
    acc = 85</font>

<font size=2>**HGBC**</font> (runtime ~ 32 s) <br>
<font size=1>HGBC(learning_rate = 0.01, min_samples_leaf = 9) <br>
    acc = 85.2</font>

<font size=3>**TESTIMINE**</font>

<font size=6>**7. KATSE**</font>

<font size=3>**ANDMED**</font>

In [497]:
# Experiment 7 input: the cleaned, mean-imputed training table (element 0 of
# the returned pair; the station ids at element 1 are not needed here).
train_df = data_cleaning("train.csv")[0]

<font size=3>**TRAIN SETS**</font>

In [498]:
# Features are every column except the 2021 compliance label.
X = train_df.drop(columns="compliance_2021")
y = train_df["compliance_2021"]

# Hold out 20 % of the rows for validation. The fixed random_state makes the
# split, and with it the confusion matrices below, repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

<font size=3>**RFC CONFUSION MATRIX**</font>

In [499]:
# Random forest with the hyper-parameters listed for RFC under experiment 4.
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

Confusion matrix:
[[ 0 10]
 [ 1 77]]


<font size=3>**KNN CONFUSION MATRIX**</font>

In [500]:
# k-nearest-neighbours with the hyper-parameters listed for KNN under experiment 4.
model = KNN(n_neighbors=13, metric="euclidean", leaf_size=28)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

Confusion matrix:
[[ 0 10]
 [ 0 78]]


<font size=3>**SVC (auto) CONFUSION MATRIX**</font>

In [501]:
# Polynomial-kernel SVC with gamma="auto".
# NOTE: in the saved run this cell was stopped with a KeyboardInterrupt, so
# no confusion matrix was recorded for it.
model = SVC(kernel="poly", gamma="auto")
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

KeyboardInterrupt: 

<font size=3>**SVC (scale) CONFUSION MATRIX**</font>

In [None]:
# Polynomial-kernel SVC with gamma="scale".
model = SVC(kernel="poly", gamma="scale")
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=3>**HGBC CONFUSION MATRIX**</font>

In [None]:
# Histogram gradient boosting with the depth-limited settings from experiment 5.
model = HGBC(learning_rate=0.01, max_iter=100, max_depth=5)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=6>**8. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
# Experiment 8 input: class-balanced training table (all positive rows plus a
# random sample of negative rows); the station ids at element 1 are not needed.
train_df = data_balance("train.csv")[0]

<font size=3>**TRAIN SETS**</font>

In [None]:
# Features are every column except the 2021 compliance label.
X = train_df.drop(columns="compliance_2021")
y = train_df["compliance_2021"]

# Hold out 20 % of the rows for validation. The fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

<font size=3>**RFC CONFUSION MATRIX**</font>

In [None]:
# Random forest with the hyper-parameters listed for RFC under experiment 4,
# now trained on the balanced table.
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=3>**KNN CONFUSION MATRIX**</font>

In [None]:
# k-nearest-neighbours on the balanced table.
model = KNN(n_neighbors=13, metric="euclidean", leaf_size=28)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=3>**SVC (auto) CONFUSION MATRIX**</font>

In [None]:
# Polynomial-kernel SVC with gamma="auto" on the balanced table.
model = SVC(kernel="poly", gamma="auto")
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=3>**SVC (scale) CONFUSION MATRIX**</font>

In [None]:
# Polynomial-kernel SVC with gamma="scale" on the balanced table.
model = SVC(kernel="poly", gamma="scale")
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=3>**HGBC CONFUSION MATRIX**</font>

In [None]:
# Histogram gradient boosting on the balanced table.
model = HGBC(learning_rate=0.01, max_iter=100, max_depth=5)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=6>**9. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
# Experiment 9 input: a freshly sampled class-balanced training table
# (element 0 of the returned pair).
train_df = data_balance("train.csv")[0]

<font size=3>**TRAIN SETS**</font>

In [None]:
# Features are every column except the 2021 compliance label.
X = train_df.drop(columns="compliance_2021")
y = train_df["compliance_2021"]

# Hold out 20 % of the rows for validation. The fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

<font size=3>**RFC PROBABILITY**</font>

In [None]:
# Same random forest as before, but this time the predicted probability is
# inspected instead of the hard class label.
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)
model.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the score of interest.
# NOTE(review): this is the probability of class 1 only if model.classes_ is
# [0, 1] — confirm.
probabilities = pd.DataFrame(model.predict_proba(X_val))
pred = probabilities[1]

# y_val keeps its original row labels; reset them so both columns line up
# by position.
true_labels = y_val.reset_index(drop=True)
results = pd.DataFrame({"pred" : pred, "true" : true_labels})

results

<font size=6>**10. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
# Cleaned table (NaN kept) without the 2021 label.
train_df = data_cleaning_2("train.csv")[0].drop(columns="compliance_2021")

cols = train_df.columns
n_pairs = len(cols) // 2

# The columns are treated as consecutive pairs: the first column of each pair
# stays in comp_19, the second in comp_20 (assumes the file alternates
# <name>_2019, <name>_2020).
comp_19 = train_df.drop(columns=cols[1:2 * n_pairs:2])
comp_20 = train_df.drop(columns=cols[0:2 * n_pairs:2])

In [None]:
# Print every column pair whose correlation is above 0.55 (entries equal to
# exactly 1.0 are skipped) — first for the 2019 table, then for the 2020
# table. Only the first 23 rows of each correlation column are inspected.
for yearly_frame in (comp_19, comp_20):
    cor = yearly_frame.corr()
    cols = yearly_frame.columns

    for col in cols:
        for i in range(23):
            element = cor[col].iloc[i]
            if element > 0.55 and element != 1.0:
                print("({}) - ({})   cor: {}".format(col, cols[i], element))


<font size=6>**11. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
# Cleaned table (NaN kept) without the 2021 label.
train_df = data_cleaning_2("train.csv")[0].drop(columns="compliance_2021")

cols = train_df.columns
n_pairs = len(cols) // 2

# The columns are treated as consecutive pairs: the first column of each pair
# stays in comp_19, the second in comp_20 (assumes the file alternates
# <name>_2019, <name>_2020).
comp_19 = train_df.drop(columns=cols[1:2 * n_pairs:2])
comp_20 = train_df.drop(columns=cols[0:2 * n_pairs:2])

In [None]:
cols_19 = comp_19.columns
cols_20 = comp_20.columns

# Strip the last five characters (the "_YYYY" suffix) so both yearly tables
# share the same column names and can be stacked.
comp_19 = comp_19.rename(columns={name: name[:-5] for name in cols_19})
comp_20 = comp_20.rename(columns={name: name[:-5] for name in cols_20})

# One row per station and year.
train_df = pd.concat([comp_19, comp_20], ignore_index=True)

<font size=3>**TRAIN SETS**</font>

In [None]:
# Features are every column except the per-year compliance label.
X = train_df.drop(columns="compliance")
y = train_df["compliance"]

# Hold out 20 % of the rows for validation. The fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

<font size=3>**RFC CONFUSION MATRIX**</font>

In [None]:
# Random forest on the stacked per-year table.
# NOTE(review): the preceding cells do not impute the NaN left by
# data_cleaning_2 — confirm the estimator accepts them.
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

<font size=6>**12. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
train_df = data_cleaning_2("train.csv")[0]

# sep_years is only used to obtain the suffix-free column names; the actual
# 2021 estimates are produced by gen_comp_21 from the year-suffixed table.
without_label = train_df.drop(columns="compliance_2021")
comp_19, comp_20 = sep_years(without_label)
comp_21 = gen_comp_21(train_df, comp_19, comp_20)

# Remaining gaps (both years missing) are filled with the column mean.
train_df = comp_21.fillna(comp_21.mean())

<font size=3>**TRAIN SETS**</font>

In [None]:
# Balance the classes 1:1 — keep every positive row and draw the same number
# of negative rows. The fixed random_state makes the draw repeatable.
positive_cases = train_df[train_df["compliance"] == 1]
negative_sample = train_df[train_df["compliance"] == 0].sample(positive_cases.shape[0], random_state=42)
train_df = pd.concat([positive_cases, negative_sample], ignore_index=True)

X = train_df.drop(columns="compliance")
y = train_df["compliance"]

# Hold out 20 % of the balanced rows for validation (seeded for the same reason).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

<font size=3>**RFC CONFUSION MATRIX**</font>

In [None]:
# 1) Confusion matrix on the held-out validation rows.
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)
model.fit(X_train, y_train)

pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])
print("Confusion matrix:\n%s" % matrix)

# 2) Mean 5-fold cross-validation score of a fresh, unfitted forest on the
#    training rows.
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)

fold_scores = cross_val_score(model, X_train, y_train, cv=5)
acc = fold_scores.mean()

print("\nacc:", round(acc*100, 2) , "%")

<font size=3>**TESTING**</font>

In [None]:
# For the final model use all rows of train_df (as left by the TRAIN SETS
# cell above) rather than only the 80 % training part of the split.
X_train = train_df.drop(columns="compliance")
y_train = train_df["compliance"]

In [None]:
test_df, station_id = data_cleaning_2("test.csv")

# Build the extrapolated 2021 feature table for the test stations the same
# way as for the training data.
comp_19, comp_20 = sep_years(test_df)
comp_21 = gen_comp_21(test_df, comp_19, comp_20)

test_df = comp_21.fillna(comp_21.mean())

model = RFC(max_depth=15, min_samples_split=2, min_samples_leaf=4)
model.fit(X_train, y_train)

# Write one predicted label per station.
results = model.predict(test_df)
results_df = pd.DataFrame({"station_id" : station_id, "compliance_2021" : results})
results_df.to_csv("RFC_comp_21.csv", index=False)

<font size=6>**12. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
# Cleaned (NaN kept) year-suffixed tables for both files.
train_df = data_cleaning_2("train.csv")[0]
test_df = data_cleaning_2("test.csv")[0]

# One suffix-free table per year and file; each carries a "compliance"
# column derived from compliance_2019 / compliance_2020.
train_19, train_20 = sep_years(train_df)
test_19, test_20 = sep_years(test_df)

# Stack all four yearly tables into one training table.
# NOTE(review): the rows derived from test.csv become part of the training
# data here — confirm this is intended before validating on test_df.
train_df = pd.concat([train_19, train_20, test_19, test_20], ignore_index=True)

# Fill the remaining gaps with the column mean of the stacked table.
train_df = train_df.fillna(train_df.mean())

<font size=3>**TRAIN SETS**</font>

In [None]:
# Training features/label from the stacked per-year table.
X_train = train_df.drop(columns="compliance")
y_train = train_df["compliance"]

# NOTE(review): test_df is the frame returned by data_cleaning_2 in the cell
# above, which keeps year-suffixed names (compliance_2019, compliance_2020)
# and never creates a "compliance" column. Unless test.csv itself provides
# one, the two lines below raise KeyError — confirm which table was meant.
X_val = test_df.drop(columns="compliance")
y_val = test_df["compliance"]

<font size=3>**TESTING**</font>

In [None]:
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)
model.fit(X_train, y_train)

pred = model.predict(X_val)

matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1, 0])

# With labels=[1, 0] the rows are the true classes and the columns the
# predicted classes, both ordered (1, 0).
(tp, fn), (fp, tn) = matrix

# "guess acc": share of the predictions of a class that were right.
print("Positive guess acc:", round(tp/(fp + tp)*100, 2) , "%")
print("Negative guess acc:", round(tn/(tn + fn)*100, 2) , "%")

# "find acc": share of the true members of a class that were found.
print("\nPositive case find acc:", round(tp/(fn + tp)*100, 2) , "%")
print("Negative case find acc:", round(tn/(tn + fp)*100, 2) , "%")

# Mean 5-fold cross-validation score of a fresh forest on the training rows.
model = RFC(criterion="gini", n_estimators=10, max_depth=13,
            min_samples_split=2, min_samples_leaf=3)

fold_scores = cross_val_score(model, X_train, y_train, cv=5)
acc = fold_scores.mean()

print("\nacc:", round(acc*100, 2) , "%")

<font size=6>**13. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
# The data preparation is deterministic, so it is done once instead of being
# repeated (CSV read + extrapolation) in each of the 100 repetitions.
full_df = data_cleaning_2("train.csv")[0]

comp_19, comp_20 = sep_years(full_df.drop(columns="compliance_2021"))
comp_21 = gen_comp_21(full_df, comp_19, comp_20)

prepared_df = comp_21.fillna(comp_21.mean())

result = []

for i in range(100):
    # Fresh random shuffle per repetition; the first 352 rows train the
    # model, the rest are held out.
    shuffled = prepared_df.sample(frac = 1, ignore_index=True)
    train_df, test_df = [shuffled.iloc[:352], shuffled.iloc[352:]]

    # Down-sample the negatives to 4.5 per positive row.
    positive_cases = train_df[train_df["compliance"] == 1]
    negative_sample = train_df[train_df["compliance"] == 0].sample(int(positive_cases.shape[0]*4.5))
    df = pd.concat([positive_cases, negative_sample], ignore_index=True)

    X_train = df.drop(columns="compliance")
    y_train = df["compliance"]
    X_val = test_df.drop(columns="compliance")
    y_val = test_df["compliance"]

    model = RFC().fit(X_train, y_train)
    pred = model.predict(X_val)
    matrix = confusion_matrix(y_val.values, pred, labels=[1,0])

    # Share of positive predictions that were correct; repetitions without
    # any positive prediction are skipped because the ratio is undefined.
    if (matrix[0][0] + matrix[1][0]) != 0:
        result.append(matrix[0][0]/(matrix[0][0] + matrix[1][0]))

# Guard the average: every repetition may have been skipped.
if result:
    print(round(sum(result) / len(result)*100, 2))
else:
    print("no repetition produced a positive prediction")

<font size=6>**14. KATSE**</font>

<font size=3>**ANDMED**</font>

In [None]:
def data(file):
    """Load a CSV as one stacked per-year table with mean-imputed gaps.

    ``station_id`` is removed, ``compliance_2021`` is removed when the file
    has it, the columns are split per year with ``sep_years`` and the two
    yearly tables are stacked (2019 rows first). NaN are then replaced by the
    column mean of the stacked table.

    Parameters
    ----------
    file : str or path-like
        CSV file to read; must contain a ``station_id`` column.

    Returns
    -------
    pandas.DataFrame
        Stacked table with suffix-free column names.
    """
    # Read the file once; errors="ignore" covers files without the 2021
    # label instead of a bare ``except`` followed by a second read.
    dataframe = (
        pd.read_csv(file)
        .drop(columns="station_id")
        .drop(columns="compliance_2021", errors="ignore")
    )

    comp_19, comp_20 = sep_years(dataframe)
    dataframe = pd.concat([comp_19, comp_20], ignore_index=True)

    dataframe = dataframe.fillna(dataframe.mean())

    return dataframe

In [None]:
# Stacked per-year tables (one row per station and year) for both files.
train_df = data("train.csv")
test_df = data("test.csv")

<font size=3>**TRAIN SETS**</font>

In [None]:
# Train on the rows derived from train.csv ...
X_train = train_df.drop(columns="compliance")
y_train = train_df["compliance"]

# ... and validate on the rows derived from test.csv. The "compliance" label
# here is the per-year one produced by sep_years from compliance_2019/2020.
X_val = test_df.drop(columns="compliance")
y_val = test_df["compliance"]

<font size=3>**TESTING**</font>

In [None]:
def _mean_pct(values):
    """Return the mean of ``values`` as a percentage rounded to 2 decimals (nan if empty)."""
    return round(sum(values) / len(values) * 100, 2) if values else float("nan")


def _rfc_report(train, test, dropped=(), n_runs=10):
    """Print RFC scores averaged over ``n_runs`` fits.

    ``train`` / ``test`` are tables with a ``compliance`` label column;
    ``dropped`` names feature columns to leave out. Reported are
    "Guess acc" (share of positive predictions that were right),
    "Find acc" (share of true positives that were found) and the mean
    5-fold cross-validation score on the training table. Runs in which a
    ratio is undefined (zero denominator) are left out of that average.
    """
    features_train = train.drop(columns=["compliance", *dropped])
    target_train = train["compliance"]
    features_test = test.drop(columns=["compliance", *dropped])
    target_test = test["compliance"]

    ga, fa, accs = [], [], []
    for _ in range(n_runs):
        fitted = RFC().fit(features_train, target_train)
        matrix = confusion_matrix(target_test.values, fitted.predict(features_test), labels=[1, 0])
        # labels=[1, 0]: rows are true classes, columns predicted classes.
        (tp, fn), (fp, _tn) = matrix

        if tp + fp:
            ga.append(tp / (tp + fp))
        if tp + fn:
            fa.append(tp / (tp + fn))
        accs.append(cross_val_score(RFC(), features_train, target_train, cv=5).mean())

    print("Guess acc:", _mean_pct(ga), "%")
    print("Find acc:", _mean_pct(fa), "%")
    print("acc:", _mean_pct(accs), "%\n\n")


# Baseline with every feature.
_rfc_report(train_df, test_df)

# Ablation: repeat the evaluation with one feature removed at a time. The
# helper works on its own copies, so X_train / X_val from the TRAIN SETS cell
# are no longer overwritten by the last ablation.
for col in train_df.columns.drop("compliance"):
    print(col)
    _rfc_report(train_df, test_df, dropped=[col])

<font size=6>**15. KATSE**</font>

<font size=3>**ANDMED**</font>

In [563]:
def data_2(file):
    """Load a CSV and keep only the columns not listed for removal.

    ``station_id`` and the 2019 and 2020 columns of sixteen measured
    quantities are removed; every other column is returned unchanged.

    Parameters
    ----------
    file : str or path-like
        CSV file to read; must contain all of the removed columns.

    Returns
    -------
    pandas.DataFrame
        The reduced table.
    """
    unused_2019 = [
        "Coli-like-bacteria_2019", "Boron_2019", "Oxidability_2019", "Electrical-conductivity_2019",
        "Colony-count-at-22-C_2019", "Nitrite_2019", "Taste-ball-units_2019", "Sulphate_2019",
        "Aluminium_2019", "Nitrate_2019", "Fluoride_2019", "Odour-dilution-level_2019", "Turbidity-NTU_2019",
        "Chloride_2019", "Color-Pt-Co-unit_2019", "Smell-ball-units_2019",
    ]
    unused_2020 = [
        "Coli-like-bacteria_2020", "Boron_2020", "Oxidability_2020", "Electrical-conductivity_2020",
        "Colony-count-at-22-C_2020", "Nitrite_2020", "Taste-ball-units_2020", "Sulphate_2020",
        "Aluminium_2020", "Nitrate_2020", "Fluoride_2020", "Odour-dilution-level_2020", "Turbidity-NTU_2020",
        "Chloride_2020", "Color-Pt-Co-unit_2020", "Smell-ball-units_2020",
    ]

    raw = pd.read_csv(file)
    return raw.drop(columns=["station_id", *unused_2019, *unused_2020])

In [601]:
train_df = data_2("train.csv")

comp_19, comp_20 = sep_years(train_df)
comp_21 = gen_comp_21(train_df, comp_19, comp_20)

# Validation table: the extrapolated 2021 features, gaps filled with the
# column mean.
test_df = comp_21.fillna(comp_21.mean())

# Training table: the 2019 and 2020 rows stacked, gaps filled with the column
# mean of the stacked table.
train_df = pd.concat([comp_19, comp_20], ignore_index=True).pipe(
    lambda stacked: stacked.fillna(stacked.mean())
)

In [607]:


# Down-sampling of the negative class (4.5 negatives per positive row, as in
# experiment 13) is left disabled here; the model is trained on all rows.

X_train = train_df.drop(columns="compliance")
y_train = train_df["compliance"]
X_val = test_df.drop(columns="compliance")
y_val = test_df["compliance"]

model = RFC()
model.fit(X_train, y_train)
pred = model.predict(X_val)

# labels=[1, 0] puts class 1 in the first row/column of the matrix.
matrix = confusion_matrix(y_val.to_numpy(), pred, labels=[1,0])
print(matrix)

# Show the validation features next to the predicted label.
X_val = X_val.assign(res=pred)
X_val

[[ 36  30]
 [100 274]]


Unnamed: 0,Ammonium,Coli-like-bacteria-Colilert,Color-Pt/Co-scale,Enterococci,Escherichia-coli-Colilert,Escherichia-coli,Iron,Manganese,Sodium,Taste-dilution-degree,pH,res
0,0.05,2.303497,4.284949,0.0125,0.028369,0.0,20.0,48.278856,51.317015,1.844985,7.4,0
1,0.05,2.303497,0.0,0.0125,0.028369,0.0,20.0,48.278856,51.317015,1.0,6.9,0
2,0.43,2.303497,0.0,0.0,0.028369,0.0,22.0,10.0,51.317015,1.0,7.6,0
3,0.01,2.303497,4.0,0.0125,0.028369,0.0,0.0,48.278856,51.317015,1.844985,7.72,0
4,0.04,2.303497,4.284949,0.0125,0.028369,0.0,6916.0,48.278856,51.317015,1.844985,8.3,1
5,0.14,2.303497,3.0,0.0,0.028369,0.0,120.0,48.278856,51.317015,1.844985,8.26,1
6,0.139292,0.0,4.0,0.0125,0.0,0.0,57.0,5.0,51.317015,3.0,8.5,0
7,0.23,2.303497,4.284949,0.0,0.028369,0.0,310.0,48.278856,51.317015,1.844985,7.35,1
8,0.26,0.0,2.4,0.0,0.0,0.0,60.0,36.0,15.0,1.844985,7.9,0
9,0.05,2.303497,13.0,0.0125,0.028369,0.0,54.0,5.0,48.2,1.844985,6.99,0
