In [228]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd

In [229]:
# Fixed seed for the Isolation Forest so that the set of rows written to
# data/train.csv is the same on every Restart & Run All.
RANDOM_STATE = 42

# Read the raw training data once and keep it untouched; the inlier mask
# computed below is applied to this frame before it is written out.
raw_df = pd.read_csv('data/org_train.csv')

# Working copy for outlier detection, without the 'id' and 'manufacturer' columns.
df = raw_df.drop(['id', 'manufacturer'], axis=1)

# The Isolation Forest is fed `df.values`, so the string categories are
# replaced by integer codes first.
# NOTE(review): one LabelEncoder instance is refit for every column, so after
# this loop `le` only holds the classes of 'fuel_type' (the last column fit).
# A later cell pickles `le`; confirm that a single-column encoder is intended.
le = LabelEncoder()
for column in ('model', 'gearbox_type', 'fuel_type'):
    df[column] = le.fit_transform(df[column])

X = df.values

# Train Isolation Forest
iso_forest = IsolationForest(contamination=0.01, random_state=RANDOM_STATE)
iso_forest.fit(X)

# Predict outliers (predict returns 1 for inliers, -1 for outliers)
outliers = iso_forest.predict(X)
print("Inliers:", X[outliers == 1].shape)

# Keep only the inlier rows of the *raw* data (original columns and string
# categories) and persist them for the following cells.
df = raw_df[outliers == 1]
df.to_csv('data/train.csv', index=False)

Inliers: (9900, 9)


In [230]:
def standardize(df1, df2=None, scaler=None):
    """Scale the numeric feature columns of one or two DataFrames in place.

    The columns 'year', 'operating_hours', 'efficiency', 'registration_fees'
    and 'engine_capacity' are overwritten with ``scaler.transform`` of their
    current values.

    Args:
        df1: DataFrame to scale. When ``scaler`` is None, a new
            ``StandardScaler`` is fit on these columns of ``df1``.
        df2: Optional second DataFrame scaled with the same scaler.
        scaler: Optional already-fitted object exposing ``transform``.

    Returns:
        The tuple ``(df1, df2)``; these are the same objects that were passed
        in (``df2`` is ``None`` when it was not supplied).

    Side effects:
        Modifies ``df1`` / ``df2`` in place and writes the scaler that was
        used to ``scaler.pkl`` in the current working directory on every call.
    """
    numeric_cols = ['year', 'operating_hours', 'efficiency', 'registration_fees', 'engine_capacity']

    # Fit only when the caller did not hand over a scaler.
    if scaler is None:
        scaler = StandardScaler().fit(df1[numeric_cols])

    df1[numeric_cols] = scaler.transform(df1[numeric_cols])
    if df2 is not None:
        df2[numeric_cols] = scaler.transform(df2[numeric_cols])

    # Persist the scaler so the same transformation can be re-applied later.
    with open('scaler.pkl', 'wb') as sink:
        pickle.dump(scaler, sink)

    return df1, df2

In [231]:
# Load the training data from data/train.csv.
df = pd.read_csv('data/train.csv')

# K-means binning of the two continuous columns (8 and 10 bins respectively).
# NOTE(review): `est` is fitted here, but no active code visible in this
# notebook section calls it afterwards.
binned_cols = ['registration_fees', 'engine_capacity']
est = KBinsDiscretizer(n_bins=[8, 10], strategy='kmeans', subsample=None).fit(df[binned_cols])

# Distinct values of each categorical column, in order of first appearance.
list_of_model = df['model'].unique()
print('list_of_model shape:', list_of_model.shape)
list_of_gearbox_type = df['gearbox_type'].unique()
print('list_of_gearbox_type shape:', list_of_gearbox_type.shape)
list_of_fuel_type = df['fuel_type'].unique()
print('list_of_fuel_type shape:', list_of_fuel_type.shape)

# One-hot encoder with the category lists fixed explicitly to the values above.
categorical_cols = ['model', 'gearbox_type', 'fuel_type']
enc = OneHotEncoder(
    handle_unknown='infrequent_if_exist',
    categories=[list_of_model, list_of_gearbox_type, list_of_fuel_type],
).fit(df[categorical_cols])
def encoder(df):
    """Replace the categorical columns of ``df`` with their one-hot encoding.

    The 'manufacturer' column is dropped, and 'model', 'gearbox_type' and
    'fuel_type' are passed through the module-level fitted encoder ``enc``
    and replaced by its dense output. The one-hot columns are appended to the
    right of the remaining columns and are named by integer position
    (0, 1, 2, ...).

    Args:
        df: DataFrame containing at least the columns 'manufacturer', 'model',
            'gearbox_type' and 'fuel_type'. It is not modified.

    Returns:
        A new DataFrame with the same rows and the same index as ``df``.

    Raises:
        KeyError: if any of the four columns listed above is missing.
    """
    categorical_cols = ['model', 'gearbox_type', 'fuel_type']

    # Drop first so that a missing 'manufacturer' column fails fast.
    without_manufacturer = df.drop(['manufacturer'], axis=1)
    encoded_feature = enc.transform(without_manufacturer[categorical_cols])
    remaining = without_manufacturer.drop(categorical_cols, axis=1)

    # Give the one-hot frame the same index as the input: concat(axis=1)
    # aligns on index labels, so a default RangeIndex here would misalign
    # (and NaN-pad) any frame that was filtered or otherwise re-indexed.
    one_hot = pd.DataFrame(encoded_feature.toarray(), index=remaining.index)

    return pd.concat([remaining, one_hot], axis=1)


list_of_model shape: (164,)
list_of_gearbox_type shape: (3,)
list_of_fuel_type shape: (4,)


In [232]:
# Load the training set from data/train.csv and the raw test set.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/org_test.csv')

# Scale both sets with a scaler fit on the training set (standardize also
# writes it to scaler.pkl), then one-hot encode the categorical columns.
train_df = train_df.drop(['id'], axis=1)
train_df, test_df = standardize(train_df, test_df)
train_df = encoder(train_df)
test_df = encoder(test_df)

# Same pipeline for the unfiltered training data, reusing the fitted scaler.
train_df_with_outliers = pd.read_csv('data/org_train.csv')
train_df_with_outliers = train_df_with_outliers.drop(['id'], axis=1)

# NOTE(review): `le` was refit once per column in an earlier cell, so this
# pickle only contains the mapping of the last column it was fit on.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

# Reload the scaler saved by the standardize() call above; the file is closed
# again before standardize() reopens it for writing.
with open('scaler.pkl', 'rb') as f:
    fitted_scaler = pickle.load(f)
train_df_with_outliers, _ = standardize(train_df_with_outliers, scaler=fitted_scaler)
train_df_with_outliers = encoder(train_df_with_outliers)

print(train_df.shape)
print(test_df.shape)


# NOTE(review): this overwrites data/train.csv, the file this cell read its
# input from, with the processed frame, so re-running this cell on its own
# would read the already-processed file.
train_df.to_csv('data/train.csv', index=False)
test_df.to_csv('data/test.csv', index=False)
train_df_with_outliers.to_csv('data/train_with_outliers.csv', index=False)

(9900, 177)
(2000, 177)
