In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDOneClassSVM
from sklearn.neighbors import LocalOutlierFactor
import seaborn as sns

# The Checklist
- [ ] Get the data
- [ ] Do the One Class SVM, Isolation Tree, and Local Outlier Factor
- [ ] Clip the Target
- [ ] Arithmetic Mean


In [2]:
df = pd.read_csv('sample_submission.csv')
df

Unnamed: 0,id,target,O2_1,O2_2,O2_3,O2_4,O2_5,O2_6,O2_7,NH4_1,...,NO3_5,NO3_6,NO3_7,BOD5_1,BOD5_2,BOD5_3,BOD5_4,BOD5_5,BOD5_6,BOD5_7
0,0,8.59,7.500,9.000,9.545,9.265,8.110,8.430,7.150,0.180,...,4.950,1.730,1.800,4.800,3.150,10.665,10.465,16.645,5.750,10.37
1,1,9.10,13.533,40.900,8.770,9.265,6.015,10.070,7.150,1.107,...,20.050,9.530,7.695,4.550,6.950,2.040,5.200,5.725,2.950,2.23
2,2,8.21,3.710,5.420,8.770,9.265,4.550,10.070,7.150,0.020,...,4.580,3.025,3.960,4.935,4.950,4.725,6.075,6.750,3.500,3.17
3,3,8.39,8.700,8.100,9.500,9.200,5.200,8.670,6.670,0.280,...,8.450,2.070,1.730,6.300,4.700,3.500,6.200,8.670,2.900,7.37
4,4,8.07,8.050,8.650,7.960,9.265,3.290,10.070,7.150,0.360,...,2.020,1.730,0.760,4.800,4.970,3.950,2.800,8.400,3.500,3.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3495,8.08,6.250,8.300,7.795,9.265,5.690,8.555,6.335,0.565,...,14.575,0.636,1.640,4.235,4.100,2.800,3.950,7.695,3.540,2.50
3496,3496,8.09,6.630,6.630,8.370,7.600,0.636,8.430,7.150,1.300,...,4.580,1.730,1.800,4.900,3.150,2.040,6.075,8.415,2.155,2.90
3497,3497,9.95,8.367,8.433,8.770,6.170,5.800,10.400,7.200,0.430,...,20.050,1.440,1.800,4.867,4.833,4.725,4.950,8.400,6.625,4.20
3498,3498,9.52,10.000,6.630,9.545,9.265,3.290,8.980,2.310,0.300,...,20.050,9.530,7.695,10.700,5.400,4.725,3.300,6.750,6.625,5.00


# Setting up the Helper Methods

Now I am going to setup a streamlined k folds and random forest regressor

In [3]:
def test_on_data(x_train, x_test, y_train, y_test):
    rf = RandomForestRegressor(
       n_estimators=1000,
       max_depth=7,
       n_jobs=-1,
       random_state=42)

    rf.fit(x_train, y_train)
    preds = rf.predict(x_test)
    return mean_squared_error(y_test, preds)

In [4]:
def test_on_data_cv(x, y, cv=5):
    rf = RandomForestRegressor(
       n_estimators=1000,
       max_depth=7,
       n_jobs=-1,
       random_state=42)

    scores = cross_val_score(rf, x, y, cv=cv, scoring='neg_root_mean_squared_error')
    scores
    return scores

TODO: add custom K-Folds

In [5]:
def test_with_std(x, y, cv=5, n=10):
    score_total = []
    for i in range(n):
        score_total.append(test_on_data_cv(x, y, cv=cv))
    score_total = np.array(score_total)
    print(f'Mean {score_total.mean()} and std {score_total.std()}')

In [None]:
# Let's try these methods.
# test_with_std(df.drop('target', axis=1), df.target)

For normal k-folds without any change in data, we got a pretty bad score and even worst we got a big std compared to mean. For curisoty sake I'm going to do it on the clipped version too.

In [None]:
# Testing cell, not run by default
# test_with_std(df.drop('target', axis=1), df['target'].clip(7, 16))

Ok this seems better with 0.9 MSE and 0.02 std, MAYBE I SHOULD VALUE STD MORE.

In [None]:
def give_zero_sub(df=df):
    dfc = df.copy()
    dfc.iloc[:]=0
    return dfc

# Half hearted EDA & Outlier Detection

In [None]:
data_to_plot = df.drop(['id'], axis=1)

# Determine the number of columns to plot, 36
num_cols = len(data_to_plot.columns)

# Calculate the number of rows and columns for the subplots
num_rows = (num_cols + 2) // 3
num_cols_per_subplot = min(num_cols, 5)

In [None]:
# Create subplots
fig, axes = plt.subplots(num_rows, num_cols_per_subplot, figsize=(18, 2 * num_rows))

# Flatten the axes for iteration
axes = axes.flatten()

# Create box plots for each column
for i, col in enumerate(data_to_plot.columns):
    # The main magic happens here:
    sns.boxplot(x=data_to_plot[col], ax=axes[i])
    axes[i].set_title(col)
    axes[i].set_xticks([])  # Remove x-axis labels for better readability
    
# Remove any empty subplots
for i in range(num_cols, num_rows * num_cols_per_subplot):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

In [None]:
sns.lineplot(data=df, x='id',y='target')

In [None]:
sns.lineplot(data=df, x='id',y='O2_1')

In [None]:
sns.lineplot(data=df, x='id',y='O2_2')

In [None]:
sns.lineplot(data=df, x='id',y='BOD5_5')

In [None]:
df[df.target>20]

In [None]:
df[df['BOD5_5'] >35]

2 outliers on target which are 316 and 451, from this point we are going to pretend that ID column does not exist, because internet says so.

# Outlier Cleaning

## SVM, Isolation Forest, LOF

In [None]:
def dropped_df(idx):
    return df.drop(idx, axis=0)

In [None]:
def get_svm_idx(nu=.3):
    svm = SGDOneClassSVM(nu)
    svm.fit(df.drop('id', axis=1))
    svm_pred = svm.predict(df.drop('id', axis=1))
    svm_remove_idx = np.where(svm_pred==-1)[0]
    return svm_remove_idx

SGDOneClassSVM works best without id for finding outliers.

In [None]:
# # Testing different nu values

# svm_df_3 = dropped_df(get_svm_idx(nu=0.3))
# test_with_std(svm_df_3.drop('target', axis=1), svm_df_3.target, cv=5, n=7)

In [None]:
# svm_df_5 = dropped_df(get_svm_idx(nu=0.5))
# test_with_std(svm_df_5.drop('target', axis=1), svm_df_5.target, cv=5, n=7)

In [None]:
# svm_df_8 = dropped_df(get_svm_idx(nu=0.8))
# test_with_std(svm_df_8.drop('target', axis=1), svm_df_8.target, cv=5, n=7)

In [None]:
def get_iso_idx(cont=0.005):
    isofor = IsolationForest(contamination=cont)
    isofor.fit(df.drop('id',axis=1))
    iso_pred = isofor.predict(df.drop('id',axis=1))
    iso_remove_idx = np.where(iso_pred==-1)[0]
    return iso_remove_idx

In [None]:
# # Testing different contamination values

# iso_df_1 = dropped_df(get_iso_idx(cont='auto'))
# test_with_std(iso_df_1.drop('target', axis=1), iso_df_1.target, cv=5, n=7)

In [None]:
# iso_df_2 = dropped_df(get_iso_idx(cont=0.01))
# test_with_std(iso_df_2.drop('target', axis=1), iso_df_2.target, cv=5, n=7)

In [None]:
# iso_df_3 = dropped_df(get_iso_idx(cont=0.5))
# test_with_std(iso_df_3.drop('target', axis=1), iso_df_3.target, cv=5, n=7)

In [None]:
def get_lof_idx(cont=0.005):
    lof = LocalOutlierFactor(contamination=cont)
    lof_pred = lof.fit_predict(df.drop('id',axis=1))
    lof_remove_idx = np.where(lof_pred==-1)[0]
    return lof_remove_idx

In [None]:
# # Testing different contamination values

# lof_df_1 = dropped_df(get_lof_idx(cont='auto'))
# test_with_std(lof_df_1.drop('target', axis=1), lof_df_1.target, cv=5, n=7)

In [None]:
# lof_df_2 = dropped_df(get_lof_idx(cont=0.1))
# test_with_std(lof_df_2.drop('target', axis=1), lof_df_2.target, cv=5, n=7)

In [None]:
# iso_df_3 = dropped_df(get_lof_idx(cont=0.5))
# test_with_std(iso_df_3.drop('target', axis=1), iso_df_3.target, cv=5, n=7)

Score i got were:
SVM(nu): 0.3:(-1.39;0.61), 0.5:(-1.39;0.61), 0.8:(-1.39;0.61) - So no meaningful differences.

Isolation Tree(cont): auto:(1.41;0.61), 0.01:(1.40;0.61), **0.5:(0.93; 0.08),**

LocalOutlierFactor(cont): auto:(0.98;0.05), 0.1:(0.92;0.03), **0.5:(0.83; 0.02)**

I wonder if is should this radical numbers for cont, since no one else seems to be using them. 

edit: Ok now i know the reason why people in the Kaggle don't use the big number because it get picky pretty fast. My values are nu=0.3 and both contaminations set to 0.01

In [None]:
svm_remove_idx = get_svm_idx(nu=0.3)
iso_remove_idx = get_iso_idx(cont=0.005)
lof_remove_idx = get_lof_idx(cont=0.005)

idx_to_remove = np.union1d(np.union1d(lof_remove_idx, iso_remove_idx), svm_remove_idx)

In [None]:
len(svm_remove_idx)

In [None]:
len(iso_remove_idx)

In [None]:
len(lof_remove_idx)

In [None]:
len(idx_to_remove)

In [None]:
df_clean = dropped_df(idx_to_remove)
len(df_clean)

3500-80 = 3420
Ok we will clip target once between 7 and 20 and second between 7 and 16

## Features Importance

In [None]:
X = df.drop(['id','target'],axis=1)
y = df.loc[:,'target']
rf = RandomForestRegressor(
       n_estimators=1000,
       max_depth=7,
       n_jobs=-1,
       random_state=42)

rf.fit(X,y)
importances = rf.feature_importances_

In [None]:
s = pd.Series(importances)
selected_features = s.sort_values(ascending=False)[:6]
selected_features

In [None]:
# The names of first 4 features
X.columns[selected_features.index]

In [None]:
X = df_clean.drop(['id','target'], axis=1)
y = df_clean.loc[:,'target']
rf = RandomForestRegressor(
       n_estimators=1000,
       max_depth=7,
       n_jobs=-1,
       random_state=42)

rf.fit(X,y)
importances = rf.feature_importances_
s = pd.Series(importances)
selected_features = s.sort_values(ascending=False)[:4]
selected_features

In [None]:
# The names of first 4 features
X.columns[selected_features.index]

Outliers and the hyperparameters of above operations operate imporatance of these features too much but still I am going to choose 4.

In [None]:
columns_idx = selected_features.index.to_list()
columns_idx_x = columns_idx + [15]

# Clipping

In [None]:
def y_clipped(low, high):
    return df_clean.target.clip(low, high)

In [None]:
y_20 = y_clipped(7, 20)
y_16 = y_clipped(7, 16)

In [7]:
# TEST CELL

# test_with_std(X.iloc[:, columns_idx], y_20, n=7)
# test_with_std(X.iloc[:, columns_idx], y_16, n=7)
# test_with_std(X.iloc[:, columns_idx_x], y_20, n=7)
# test_with_std(X.iloc[:, columns_idx_x], y_16, n=7)

In [None]:
df_sub = df_clean.copy()
df_sub[:]=0

In [None]:
columns_idxcol

In [None]:
col_np = np.array(columns_idx)
col_np+2

In [None]:
df_sub.iloc[:, col_np+2] = df_clean.iloc[:,col_np+2]
df_sub

In [None]:
df_sub.target = y_20

In [None]:
df_sub.to_csv('subv8_1.csv', index=False)

In [None]:
df_sub.target = y_16
df_sub.to_csv('subv8_2.csv', index=False)

In [None]:
test_with_std(X.iloc[:, columns_idx_x], y_clipped(7, 35), n=5)

In [None]:
sns.lineplot(data=df, x='id',y='target')

In [None]:
df_sub.target = y_clipped(7, 35)
df_sub.to_csv('subv8_3.csv', index=False)

# Testing

# Experimental Y Dropping

# Experimental Artihmatic Mean