# Pre-Processing

In [8]:
import numpy as np 
import pandas as pd

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer

In [12]:
# Load the raw training and test tables from the local `data/` directory.
train, test = (pd.read_csv(path) for path in ("data/Train.csv", "data/Test.csv"))

# Work on a separate copy so `train` keeps the data exactly as it was loaded.
df = train.copy(deep=True)

## Dropping Columns

In [3]:
def drop_cols(df, *, missing_threshold=0.9):
    """Return a copy of ``df`` without sparse, identifier and place-name columns.

    The following columns are removed when present:

    * every column whose fraction of missing values is strictly greater than
      ``missing_threshold``;
    * the identifier-like columns ``id``, ``site_id`` and ``date``;
    * ``city`` and ``country``, which are treated as redundant with
      ``site_latitude`` / ``site_longitude``.

    The input frame is never modified, so the function is safe to wrap in a
    ``FunctionTransformer`` (a transformer must not alter the data it is given).

    Note that the sparsity rule is evaluated on the frame passed in; calling
    this separately on two frames (e.g. train and test) can therefore remove
    a different set of columns from each.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    missing_threshold : float, default 0.9
        Columns with a missing-value fraction above this value are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order.
    """
    missing_fraction = df.isnull().mean()
    sparse_cols = missing_fraction[missing_fraction > missing_threshold].index

    # A set removes duplicates: a column such as `city` can be both sparse
    # and listed below.
    unwanted = set(sparse_cols) | {"id", "site_id", "date", "city", "country"}

    # errors="ignore" keeps the call valid on frames that lack some of the
    # fixed columns (for instance a frame that was already cleaned).
    return df.drop(columns=list(unwanted), errors="ignore")

In [20]:
# One-step pipeline that exposes `drop_cols` through the scikit-learn
# transformer API. The step name matches the one `make_pipeline` generates.
drop_transformer = Pipeline(
    steps=[("functiontransformer", FunctionTransformer(func=drop_cols))]
)

## Imputation

In [15]:
# KNN imputation: each gap is filled from the 5 nearest rows, weighted by
# inverse distance.
knn_imputer = Pipeline(
    steps=[("knnimputer", KNNImputer(weights="distance", n_neighbors=5))]
)
# Alternative imputer: each gap is filled with the mean of its column.
imputer = Pipeline(steps=[("simpleimputer", SimpleImputer(strategy="mean"))])

In [16]:
# Labels of the columns of `df` that contain at least one missing value.
# NOTE(review): computed on the raw frame, so it can include non-numeric
# columns and columns that `drop_cols` removes - confirm this is intended.
cols_with_missing_values = [
    label for label, has_gap in df.isna().any(axis=0).items() if has_gap
]

## Scaling

In [17]:
# Candidate scalers, each wrapped in its own single-step pipeline.
std = Pipeline(steps=[("standardscaler", StandardScaler())])      # zero mean, unit variance
min_max = Pipeline(steps=[("minmaxscaler", MinMaxScaler())])      # rescale to a fixed range
rob = Pipeline(steps=[("robustscaler", RobustScaler())])          # median / IQR based

In [19]:
# Labels of the `int64` / `float64` columns of `df`, in frame order.
numeric_dtypes = ["int64", "float64"]
numeric_features = list(df.select_dtypes(include=numeric_dtypes).columns)

# Pre-Processor

In [25]:
def _columns_with_missing(X):
    """Return the labels of the columns of ``X`` holding at least one missing value."""
    return X.columns[X.isnull().any()].tolist()


# Columns with gaps are KNN-imputed; every other column is standard-scaled.
#
# The imputed columns are chosen by a callable, which ColumnTransformer
# evaluates on the frame given to `fit`. A list frozen from the module-level
# `df` would go stale (or name columns that no longer exist) whenever the
# fitted frame differs from that snapshot, e.g. after columns were dropped.
#
# Column dropping cannot be expressed as an entry here: a ColumnTransformer
# runs its entries side by side on column subsets and concatenates their
# outputs, it does not chain them. Dropping has to happen in a pipeline step
# that runs before this transformer.
#
# NOTE(review): the imputed columns are not scaled afterwards, and the
# remainder must be all-numeric for StandardScaler - confirm both are intended.
transformer = make_column_transformer(
    (knn_imputer, _columns_with_missing),
    remainder=std,
)

In [24]:
# Full preprocessing chain: drop the unwanted columns first, then impute and
# scale what is left. Without the drop step the identifier / place-name
# columns (`id`, `site_id`, `date`, `city`, `country`) would reach the
# scaler used as the column transformer's remainder.
# NOTE(review): the columns selected inside `transformer` must still exist
# after the drop step - verify against the fitted data.
preprocessor = make_pipeline(drop_transformer, transformer)