In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import SGDOneClassSVM

In [2]:
# Read the competition CSV from the Kaggle input directory. Every later cell
# in this notebook operates on (and rebinds) this frame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s3e21/sample_submission.csv'
)

In [3]:
# `target` is the name of the label column; `num_var` is the column index
# that remains once the label and the row identifier are removed.
target = 'target'
num_var = df.columns.drop(labels=[target, 'id'])

In [4]:
# Exclude the row whose id is 2365. The index is left as-is (not renumbered).
df = df[df['id'].ne(2365)]
# Limit the target values to the closed range [7, 20].
df['target'] = df['target'].clip(lower=7, upper=20)

In [5]:
# Outlier removal: three detectors run one after another, each on the rows
# that survived the previous one. Every stage keeps the rows predicted as
# inliers (prediction != -1) with a boolean mask and then renumbers the index.
#
# A mask is used instead of df.drop(np.where(...)) because np.where yields row
# *positions* while df.drop expects index *labels*. The two only agree when the
# index is exactly 0..n-1, which is not the case on entry: the earlier filter
# on `id` removes a row without resetting the index.

# SGD
# Fitted on every column except `id` (so the features plus `target`).
# random_state is pinned so the set of removed rows is repeatable across runs.
sgd_input = df.drop(columns=['id'])
clf = SGDOneClassSVM(nu=0.55, random_state=0).fit(sgd_input)
df = df.loc[clf.predict(sgd_input) != -1].reset_index(drop=True)

# IsolationForest
# Same input columns as the SGD stage, computed on the already-reduced frame.
iforest_input = df.drop(columns=['id'])
lf = IsolationForest(random_state=0).fit(iforest_input)
df = df.loc[lf.predict(iforest_input) != -1].reset_index(drop=True)

# LOF
# Unlike the two stages above, this one only sees the `num_var` columns.
lof = LocalOutlierFactor(n_neighbors=3, contamination=0.1)
predictions = lof.fit_predict(df[num_var])
df = df.loc[predictions != -1].reset_index(drop=True)

# Tricks from https://www.kaggle.com/code/warcoder/lb-1-32253-lof-svm-iforest-cleanlab
# The values below are index labels in the freshly renumbered frame; df.drop
# raises KeyError if any of them is absent. The index is intentionally NOT
# reset afterwards, so the remaining labels keep gaps.
# NOTE(review): these labels were copied from the linked notebook; presumably
# they only point at the intended rows if the frame produced by the three
# stages above matches the one in that notebook -- confirm.
lowest_quality_labels = [2294,448, 437,309,1684,773,1722,2007]
df = df.drop(index=lowest_quality_labels)

In [6]:
# Pick 130 distinct index labels at random among the rows whose target equals 7
# (np.random.choice raises ValueError if fewer than 130 such rows exist).
# The draw uses NumPy's global RNG, so the selection varies between runs.
drop_index = np.random.choice(
    a=df.loc[df['target'].eq(7)].index.to_numpy(),
    size=130,
    replace=False,
)
# Display the sampled rows only: `df` itself is not modified here, and
# `drop_index` is not used by the other cells visible in this notebook.
df.loc[drop_index]

Unnamed: 0,id,target,O2_1,O2_2,O2_3,O2_4,O2_5,O2_6,O2_7,NH4_1,...,NO3_5,NO3_6,NO3_7,BOD5_1,BOD5_2,BOD5_3,BOD5_4,BOD5_5,BOD5_6,BOD5_7
294,333,7.0,5.300,8.100,11.220,7.550,3.240,8.460,8.290,0.260,...,7.795,1.730,1.730,2.700,5.30,2.910,3.150,6.025,2.300,4.20
286,324,7.0,5.890,5.420,8.770,4.660,4.420,8.430,6.375,0.240,...,7.795,0.636,2.095,4.800,4.97,7.370,8.500,8.415,2.155,2.90
405,465,7.0,12.567,12.100,9.545,7.750,6.015,10.570,10.090,0.215,...,16.760,0.820,0.567,6.367,6.30,2.150,6.025,6.025,3.025,3.90
1995,2289,7.0,5.000,6.345,8.370,6.765,5.845,9.805,5.030,0.240,...,20.050,0.636,0.760,5.400,5.40,7.600,5.725,6.750,5.750,5.70
2148,2466,7.0,7.400,8.500,8.420,6.650,5.015,8.760,8.110,0.473,...,5.070,1.730,0.760,4.800,3.25,2.800,5.825,30.050,5.025,5.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,30,7.0,5.890,6.490,6.025,9.265,8.010,10.070,6.375,0.240,...,9.450,9.530,5.090,4.800,4.97,7.900,6.075,6.750,6.625,5.00
1944,2234,7.0,5.100,6.500,8.880,7.170,5.020,10.070,5.070,0.490,...,7.450,1.570,1.795,4.300,3.40,4.105,5.725,5.790,3.600,4.14
1289,1482,7.0,7.100,7.600,9.500,7.400,5.200,8.360,6.900,0.310,...,3.790,1.570,1.780,2.400,3.40,2.170,3.870,7.300,2.400,1.60
125,144,7.0,7.800,7.200,8.560,6.170,5.240,10.070,10.230,0.210,...,4.660,1.730,1.640,3.900,4.90,2.900,6.480,7.150,3.480,3.00


In [7]:
# Columns whose values are preserved; every other column is overwritten
# with the constant 0. Note that `id` is not in this list, so it is zeroed too.
col_to_keep = ['O2_1', 'O2_2', 'BOD5_5', 'target']

columns_to_zero = [name for name in df.columns if name not in col_to_keep]
for name in columns_to_zero:
    # Whole-column assignment replaces the column with integer zeros.
    df[name] = 0

In [8]:
# Write the final frame to submission.csv without the index column, then
# leave `df` as the cell's last expression so the notebook renders it.
df.to_csv(path_or_buf='submission.csv', index=False)
df


Unnamed: 0,id,target,O2_1,O2_2,O2_3,O2_4,O2_5,O2_6,O2_7,NH4_1,...,NO3_5,NO3_6,NO3_7,BOD5_1,BOD5_2,BOD5_3,BOD5_4,BOD5_5,BOD5_6,BOD5_7
0,0,8.59,7.500,9.000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16.645,0,0
1,0,9.10,13.533,40.900,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5.725,0,0
2,0,8.21,3.710,5.420,0,0,0,0,0,0,...,0,0,0,0,0,0,0,6.750,0,0
3,0,8.39,8.700,8.100,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8.670,0,0
4,0,8.07,8.050,8.650,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8.400,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,0,8.65,8.367,8.760,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8.415,0,0
3061,0,8.08,6.250,8.300,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7.695,0,0
3062,0,8.09,6.630,6.630,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8.415,0,0
3063,0,9.95,8.367,8.433,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8.400,0,0
