In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix

In [2]:
def sep_years(dataframe):
    """Split a frame of interleaved per-year columns into one frame per year.

    Columns are consumed in consecutive pairs by position: the first column
    of every pair is kept in the first returned frame and the second column
    in the second returned frame. Every kept column name then has its last
    five characters removed (presumably a suffix such as ``_2019`` -- the
    function does not verify this), so both frames end up with matching
    column names.

    If the number of columns is odd, the unpaired last column is kept in
    both frames (and its name is shortened like all the others).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column labels are strings. It is not modified.

    Returns
    -------
    list of pandas.DataFrame
        ``[comp_19, comp_20]``: the frame built from the first column of each
        pair, followed by the frame built from the second column of each pair.
    """
    cols = dataframe.columns
    # Number of columns that belong to a complete pair.
    paired = 2 * (len(cols) // 2)

    # Drop the counterpart columns in one call per frame instead of copying
    # the whole frame once for every dropped column.
    comp_19 = dataframe.drop(columns=list(cols[1:paired:2]))
    comp_20 = dataframe.drop(columns=list(cols[0:paired:2]))

    # Strip the five-character suffix from every remaining column name.
    comp_19 = comp_19.rename(columns={name: name[:-5] for name in comp_19.columns})
    comp_20 = comp_20.rename(columns={name: name[:-5] for name in comp_20.columns})

    return [comp_19, comp_20]

In [3]:
def data(file):
    """Load a CSV and reshape it into one stacked, imputed frame.

    Steps:
    1. Read ``file`` and drop the ``station_id`` column (required) and the
       ``compliance_2021`` column (dropped only when present).
    2. Split the remaining columns into two per-year frames with
       ``sep_years``.
    3. Stack the two frames vertically with a fresh 0..n-1 index.
    4. Replace missing values with the per-column mean of the stacked frame.

    Parameters
    ----------
    file : str or path-like
        Path handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The stacked, mean-imputed frame.

    Raises
    ------
    KeyError
        If the file has no ``station_id`` column.
    """
    # Read once; station_id must exist, so a missing column still raises.
    dataframe = pd.read_csv(file).drop(columns="station_id")
    # Not every file carries this column, so tolerate its absence explicitly
    # instead of catching every possible exception and re-reading the file.
    dataframe = dataframe.drop(columns="compliance_2021", errors="ignore")

    comp_19, comp_20 = sep_years(dataframe)
    dataframe = pd.concat([comp_19, comp_20], ignore_index=True)

    # NOTE(review): the means come from the file being loaded, so each file
    # is imputed with its own statistics.
    dataframe = dataframe.fillna(dataframe.mean())

    return dataframe

In [4]:
def test(X_train, y_train, X_val, y_val, col, results, reps, random_state=None):
    """Fit a random forest ``reps`` times and record its averaged scores.

    For every repetition a fresh ``RFC`` is fitted on the training data and
    evaluated on the validation data. Two ratios are computed for the
    positive class (label ``1``):

    * ``tp / (tp + fp)`` -- share of positive predictions that are correct,
      stored under ``"Guess acc"``;
    * ``tp / (tp + fn)`` -- share of actual positives that were found,
      stored under ``"Find acc"``.

    Both are averaged over the repetitions, expressed as percentages rounded
    to two decimals, and written to row ``col`` of ``results``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``RFC.fit``.
    X_val : validation features passed to ``predict``.
    y_val : validation labels; must expose ``.values`` (e.g. a Series).
    col : row label in ``results`` under which the scores are stored.
    results : pandas.DataFrame or None
        Frame updated in place. When ``None`` nothing is recorded and
        ``None`` is returned.
    reps : int
        Number of independent fits to average over (must be >= 1).
    random_state : int or None, optional
        When given, repetition ``i`` uses seed ``random_state + i`` so runs
        are reproducible while repetitions still differ. ``None`` (default)
        leaves the forest unseeded.

    Returns
    -------
    pandas.DataFrame or None
        ``results`` with the two scores filled in, or ``None`` when
        ``results`` is ``None``.

    Raises
    ------
    ZeroDivisionError
        If ``reps`` is less than 1 (there is nothing to average).
    """
    pos_guess_acc = []
    pos_cases_found = []

    for i in range(reps):
        seed = None if random_state is None else random_state + i
        model = RFC(random_state=seed).fit(X_train, y_train)
        pred = model.predict(X_val)
        # labels=[1, 0] puts the positive class first: rows are true labels,
        # columns are predicted labels.
        matrix = confusion_matrix(y_val.values, pred, labels=[1, 0])

        tp = matrix[0][0]
        fp = matrix[1][0]
        fn = matrix[0][1]

        # NOTE(review): if a repetition predicts no positives (or there are
        # no actual positives) the ratio is 0/0; with NumPy integers that
        # yields NaN, which then propagates into the average.
        pos_guess_acc.append(tp / (fp + tp))
        pos_cases_found.append(tp / (fn + tp))

    avg_pos_guess_acc = round(100 * sum(pos_guess_acc) / len(pos_guess_acc), 2)
    avg_pos_cases_found = round(100 * sum(pos_cases_found) / len(pos_cases_found), 2)

    # Nothing to record into; stated explicitly rather than hidden behind a
    # blanket exception handler.
    if results is None:
        return None

    # Any failure while storing the scores now surfaces here instead of
    # silently turning the caller's results frame into None.
    results.at[col, "Guess acc"] = avg_pos_guess_acc
    results.at[col, "Find acc"] = avg_pos_cases_found

    return results
    

In [5]:
# Features eliminated so far; the elimination cells below append to it.
del_cols = []

# Load and pre-process both splits (training file first, then test file).
train_df, test_df = data("train.csv"), data("test.csv")

# Separate the "compliance" target from the feature columns of each split.
X_train, y_train = train_df.drop(columns="compliance"), train_df["compliance"]
X_val, y_val = test_df.drop(columns="compliance"), test_df["compliance"]

In [6]:
# First elimination round: score the model with each feature left out in turn.
cols = X_train.columns
results = pd.DataFrame(index=cols, columns=["Guess acc", "Find acc"])

# The targets are the same no matter which feature is left out.
y_train = train_df["compliance"]
y_val = test_df["compliance"]

for col in cols:
    # Remove the target and the candidate feature from both splits.
    X_train = train_df.drop(columns=["compliance", col])
    X_val = test_df.drop(columns=["compliance", col])

    results = test(X_train, y_train, X_val, y_val, col, results, 10)

# Best "Guess acc" first; the top row names the feature to eliminate.
results = results.sort_values("Guess acc", ascending=False)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Coli-like-bacteria,90.41,71.05


In [7]:
def data_sets(del_cols, train_file="train.csv", test_file="test.csv"):
    """Build the train/validation frames with some columns removed.

    Both files are loaded through ``data`` and the columns named in
    ``del_cols`` are dropped from each. The ``"compliance"`` column is then
    separated out as the target.

    Parameters
    ----------
    del_cols : list
        Column labels to remove from both frames. Every label must exist in
        both frames, otherwise ``DataFrame.drop`` raises ``KeyError``.
    train_file : str or path-like, optional
        Training CSV passed to ``data``. Defaults to ``"train.csv"``.
    test_file : str or path-like, optional
        Validation CSV passed to ``data``. Defaults to ``"test.csv"``.

    Returns
    -------
    list
        ``[train_df, test_df, X_train, y_train, X_val, y_val]`` where the
        ``X_*`` frames are the corresponding ``*_df`` without the
        ``"compliance"`` column and the ``y_*`` series are that column.
    """
    train_df = data(train_file).drop(columns=del_cols)
    test_df = data(test_file).drop(columns=del_cols)

    X_train = train_df.drop(columns="compliance")
    y_train = train_df["compliance"]

    X_val = test_df.drop(columns="compliance")
    y_val = test_df["compliance"]

    return [train_df, test_df, X_train, y_train, X_val, y_val]
    

In [8]:
def consec_test(train_df, test_df, X_train, y_train, X_val, y_val, reps=10):
    """Score the model with each remaining feature left out, one at a time.

    For every column of ``X_train`` the model is evaluated via ``test`` on
    ``train_df`` / ``test_df`` with the ``"compliance"`` target and that one
    column removed. The scores are collected in a frame indexed by the
    left-out column.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Full training / validation frames, including ``"compliance"``.
    X_train : pandas.DataFrame
        Only its column labels are used: they define the candidate features.
    y_train, X_val, y_val
        Accepted for signature compatibility; their values are not used
        because features and targets are rebuilt from ``train_df`` and
        ``test_df``.
    reps : int, optional
        Number of fits averaged per candidate by ``test``. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Guess acc"`` and ``"Find acc"``, one row per candidate
        feature, sorted by ``"Guess acc"`` in descending order.
    """
    cols = X_train.columns
    results = pd.DataFrame(columns=["Guess acc", "Find acc"], index=cols)

    # The targets do not depend on which feature is left out, so look them
    # up once instead of on every iteration.
    target_train = train_df["compliance"]
    target_val = test_df["compliance"]

    for col in cols:
        features_train = train_df.drop(columns=["compliance", col])
        features_val = test_df.drop(columns=["compliance", col])

        results = test(
            features_train, target_train, features_val, target_val,
            col, results, reps,
        )

    return results.sort_values(by="Guess acc", ascending=False)

In [9]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Boron,95.23,67.89


In [10]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Oxidability,96.38,66.05


In [11]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Odour-dilution-level,97.81,64.61


In [12]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Turbidity-NTU,98.64,65.26


In [13]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Nitrite,99.04,66.58


In [14]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Taste-ball-units,98.7,66.84


In [15]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Electrical-conductivity,99.25,68.03


In [16]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Sulphate,99.23,68.16


In [17]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Aluminium,99.43,67.37


In [18]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Smell-ball-units,99.63,68.82


In [19]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Color-Pt/Co-scale,99.81,67.24


In [20]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Nitrate,99.62,67.76


In [21]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Fluoride,99.63,70.39


In [22]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Enterococci,99.24,66.84


In [23]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Color-Pt-Co-unit,99.24,67.37


In [24]:
# Rebuild the splits with every feature already listed in del_cols removed.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)

# Rows come back sorted by "Guess acc" (descending), so the first index label
# is the feature whose removal scored best; queue it for elimination.
results = consec_test(train_df, test_df, X_train, y_train, X_val, y_val)
del_cols += [results.index[0]]

results.iloc[:1]

Unnamed: 0,Guess acc,Find acc
Escherichia-coli-Colilert,99.4,65.39


In [25]:
# Rebuild the splits once more with all eliminated features removed, then
# list the feature columns that are left.
(
    train_df, test_df,
    X_train, y_train,
    X_val, y_val,
) = data_sets(del_cols)
print(X_train.columns.tolist())

['Ammonium', 'Chloride', 'Coli-like-bacteria-Colilert', 'Colony-count-at-22-C', 'Escherichia-coli', 'Iron', 'Manganese', 'Sodium', 'Taste-dilution-degree', 'pH ']
