In [183]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd

In [None]:
def outlier_removal(df, iso_forest=None, contamination=0.01, random_state=42,
                    model_path='isoforest_model.pkl'):
    """Return only the rows of ``df`` that an Isolation Forest predicts as inliers.

    When ``iso_forest`` is None, a new ``IsolationForest`` is fitted on
    ``df.values`` (every column of ``df`` is used as a feature) and pickled to
    ``model_path``. When a model is supplied it is used as is and nothing is
    written to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter. It is not modified.
    iso_forest : object with ``predict(X)``, optional
        Already-fitted model whose ``predict`` returns 1 for rows to keep.
    contamination : float, default 0.01
        Passed to ``IsolationForest`` when a new model is fitted.
    random_state : int or None, default 42
        Seed passed to ``IsolationForest`` when a new model is fitted. A fixed
        seed makes the set of removed rows reproducible between runs; pass
        None to get the unseeded behaviour.
    model_path : str, default 'isoforest_model.pkl'
        Where a newly fitted model is pickled.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows predicted as inliers, keeping their original index
        labels.
    """
    X = df.values

    if iso_forest is None:
        # Train Isolation Forest (seeded so repeated runs drop the same rows)
        iso_forest = IsolationForest(contamination=contamination,
                                     random_state=random_state)
        iso_forest.fit(X)

        # Save model
        with open(model_path, 'wb') as f:
            pickle.dump(iso_forest, f)

    # predict() labels each row 1 (keep) or something else (drop)
    inlier_mask = iso_forest.predict(X) == 1

    # .copy() hands the caller an independent frame, so later in-place edits
    # on the result do not raise SettingWithCopyWarning.
    inliers = df[inlier_mask].copy()
    print("Inliers:", inliers.shape)
    return inliers

In [185]:
def standardize(df, scaler=None):
    """Return a copy of ``df`` with its numeric feature columns rescaled.

    The columns ``year``, ``operating_hours``, ``efficiency``,
    ``registration_fees`` and ``engine_capacity`` are replaced by
    ``scaler.transform`` of themselves; all other columns are passed through.

    When ``scaler`` is None, a ``StandardScaler`` is fitted on those columns
    of ``df`` and pickled to ``scaler.pkl`` before being applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five numeric columns listed above. It is not
        modified; re-running the call on the same frame is therefore safe.
    scaler : object with ``transform(X)``, optional
        Already-fitted scaler to apply instead of fitting a new one.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``.
    """
    numeric_cols = ['year', 'operating_hours', 'efficiency', 'registration_fees', 'engine_capacity']

    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(df[numeric_cols])
        with open('scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)

    # Work on a copy so the caller's frame keeps its raw values.
    scaled = df.copy()
    scaled[numeric_cols] = scaler.transform(df[numeric_cols])
    return scaled

In [186]:
def encoder(df, enc=None):
    """Replace the categorical columns of ``df`` with their encoded form.

    The columns ``model``, ``gearbox_type`` and ``fuel_type`` are removed and
    the dense output of ``enc.transform`` on them is appended on the right,
    one column per encoded feature, labelled with integers starting at 0.

    When ``enc`` is None, a ``OneHotEncoder`` is fitted on those columns,
    using the unique values found in ``df`` as the category lists, and
    pickled to ``encoder.pkl`` before being applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three categorical columns listed above. It is not
        modified.
    enc : object with ``transform(X)``, optional
        Already-fitted encoder whose ``transform`` result exposes
        ``toarray()``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as ``df``.
    """
    categorical_cols = ['model', 'gearbox_type', 'fuel_type']

    if enc is None:
        categories = []
        for col in categorical_cols:
            levels = df[col].unique()
            print(f'list_of_{col} shape:', levels.shape)
            categories.append(levels)

        enc = OneHotEncoder(handle_unknown='infrequent_if_exist', categories=categories)
        enc.fit(df[categorical_cols])

        with open('encoder.pkl', 'wb') as f:
            pickle.dump(enc, f)

    encoded_feature = enc.transform(df[categorical_cols])

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would mis-pair rows (and add NaN rows) whenever df has
    # been filtered or otherwise carries a non-default index.
    encoded_df = pd.DataFrame(encoded_feature.toarray(), index=df.index)

    # Non-mutating drop keeps the caller's frame intact.
    remaining = df.drop(columns=categorical_cols)
    return pd.concat([remaining, encoded_df], axis=1)


In [187]:
# Preprocessing pipeline: load the raw splits, scale and encode them with
# transformers fitted on the training split only, remove training outliers,
# and write the resulting tables to data/.

def _load_pickle(path):
    """Load a pickled object and close the file handle afterwards."""
    # Only used on files written earlier in this same run.
    with open(path, 'rb') as f:
        return pickle.load(f)


train_df = pd.read_csv('data/org_train.csv').drop(columns=['manufacturer'])
test_df = pd.read_csv('data/org_test.csv').drop(columns=['manufacturer'])

# Fitting on train writes scaler.pkl / encoder.pkl; the test split is then
# transformed with those same fitted objects.
train_df = standardize(train_df)
test_df = standardize(test_df, _load_pickle('scaler.pkl'))
train_df = encoder(train_df)
test_df = encoder(test_df, _load_pickle('encoder.pkl'))

# Snapshot of the fully transformed training data before any rows are removed.
train_df.to_csv('data/train_with_outliers.csv', index=False)

# NOTE(review): 'id' is still a column here, so the Isolation Forest is fitted
# with it as a feature - confirm that this is intended.
train_df = outlier_removal(train_df)

print(train_df.shape)
print(test_df.shape)

# 'id' is removed from the training table only; the test table keeps it.
train_df = train_df.drop(columns=['id'])
train_df.to_csv('data/train.csv', index=False)
test_df.to_csv('data/test.csv', index=False)

list_of_model shape: (169,)
list_of_gearbox_type shape: (3,)
list_of_fuel_type shape: (5,)
Inliers: (9900, 183)
(9900, 184)
(2000, 183)
