# Data Science Homework 4
Try different imbalanced classification datasets using $k$-fold cross validation and various classification methods.
TODO List:
- Make sure we can open all the data as either DataFrame or nparray
- Handle categorical data (tokenize, one-hot encoding, ....)
- Split each dataset into training and testing dataset.
- Perform any necessary sampling, imputation, and encoding techniques depending on the dataset
- Perform 5-fold cross-validation to select datasets.

In [2]:
import os
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import category_encoders as ce # sklearn library
import xgboost
import lightgbm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split, KFold
from sklearn import preprocessing

# Directory that holds every raw dataset file.
DATA_DIR = './data'

# Every entry of the data directory except Jupyter's checkpoint folder.
files = [entry for entry in os.listdir(DATA_DIR) if entry != '.ipynb_checkpoints']

# Files that use a non-default delimiter, and files that have no header row.
special_delims = {'arcene_train.data': ' '}
no_headers = ['covtype.data', 'arcene_train.data']

print(files)

['covtype.data', 'dataset', 'biodeg.csv', 'eeg_eye_state.arff', 'heart_2020_cleaned.csv', 'phpAmSP4g.arff', 'WA_Fn-UseC_-Telco-Customer-Churn.csv', 'WineQT.csv', 'HTRU_2.csv', 'spambase.data', 'Raisin_Dataset.arff', 'UCI_Credit_Card.csv', 'income_evaluation.csv', 'abalone.data', 'arcene_train.data']


## Load datasets
Some datasets are in csv format, others have just the data. First convert to `DataFrame`s to allow for numeric, categorical values.

In [3]:

def load(name, header=True):
    """Read one dataset file from ``DATA_DIR`` into a DataFrame.

    Args:
        name: File name relative to ``DATA_DIR``.
        header: When truthy the first row is used as column names;
            otherwise the file is read with ``header=None``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # Comma is the delimiter unless the file is listed in special_delims.
    read_options = {'sep': special_delims.get(name, ',')}
    if not header:
        read_options['header'] = None
    return pd.read_csv(os.path.join(DATA_DIR, name), **read_options)

## Encode Categorical Values

In [52]:
def encode_per_column(data, encoder):
    """Re-fit ``encoder`` on every column of ``data`` and overwrite it.

    The frame is modified in place and the same object is returned.

    Args:
        data: Frame whose columns are encoded one at a time.
        encoder: Object exposing ``fit_transform`` for a single column.

    Returns:
        ``data`` itself, with every column replaced by its encoding.
    """
    column_names = data.columns
    for name in column_names:
        raw_values = data[name]
        data[name] = encoder.fit_transform(raw_values)
    return data
        
def encode_categorical(data:pd.DataFrame, method='ordinal'):
    """Encode categorical values with the selected scheme.

    Args:
        data: Frame to encode. For ``'label'`` and ``'feature'`` a single
            Series is accepted as well. For ``'target'`` and
            ``'leaveoneout'`` the last column is used as the target and
            only the remaining columns are encoded and returned.
        method: One of ``'ordinal'``, ``'onehot'``, ``'label'``,
            ``'feature'``, ``'target'``, ``'leaveoneout'``, ``'frequency'``.

    Returns:
        The encoded data. ``'label'`` on a DataFrame encodes it in place
        and returns the same object; ``'label'`` on a Series returns a new
        Series; ``'feature'`` returns a 10-column frame of hashed values.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'ordinal':
        encoder = ce.OrdinalEncoder()
    elif method == 'onehot':
        encoder = ce.OneHotEncoder()
    elif method == 'label':
        encoder = preprocessing.LabelEncoder()
        if isinstance(data, pd.Series):
            # LabelEncoder is one-dimensional; a Series has no ``.columns``
            # to loop over, so encode it directly.
            return pd.Series(encoder.fit_transform(data),
                             index=data.index, name=data.name)
        return encode_per_column(data, encoder)
    elif method == 'feature':
        encoder = FeatureHasher(n_features=10, input_type='string')
        frame = data.to_frame() if isinstance(data, pd.Series) else data
        # With input_type='string' the hasher expects one iterable of
        # strings per sample; prefix the column name so that equal values
        # in different columns hash to different tokens.
        tokens = frame.astype(str).apply(lambda col: f'{col.name}=' + col)
        hashed = encoder.transform(tokens.to_numpy().tolist())
        return pd.DataFrame(hashed.toarray(), index=frame.index)
    elif method == 'target':
        encoder = ce.target_encoder.TargetEncoder()
        y = data.iloc[:, -1]
        return encoder.fit_transform(data.iloc[:, :-1], y)
    elif method == 'leaveoneout':
        encoder = ce.LeaveOneOutEncoder()
        y = data.iloc[:, -1]
        return encoder.fit_transform(data.iloc[:, :-1], y)
    elif method == 'frequency':
        encoder = ce.CountEncoder()
    else:
        raise ValueError(f'Unknown encoding method: {method!r}')
    return encoder.fit_transform(data)

def scale_features(data:pd.DataFrame, method='standard') -> np.ndarray:
    """Fit a scaler on ``data`` and return the scaled values.

    Args:
        data: Numeric feature frame.
        method: ``'standard'`` for ``StandardScaler`` or ``'minmax'`` for
            ``MinMaxScaler``.

    Returns:
        The result of the scaler's ``fit_transform``; by default this is a
        NumPy array, so column names and index are not carried over.

    Raises:
        ValueError: If ``method`` is not a supported scaler name.
    """
    if method == 'standard':
        scaler = preprocessing.StandardScaler()
    elif method == 'minmax':
        scaler = preprocessing.MinMaxScaler()
    else:
        raise ValueError(f'Unknown scaling method: {method!r}')
    return scaler.fit_transform(data)

## Train model:
The evaluation is fixed at 5-fold cross-validation; choose from `RandomForest`, `GBDT`, `XGBoost`, `LightGBM`, `CatBoost`, `KNN`, `Logistic Regression`, `MLP`, `SVM`.

Train a new model every iteration of the cross-validation.

In [53]:

def get_model(method):
    """Build a fresh, unfitted classifier for the given model name.

    Args:
        method: ``'xgboost'``, ``'knn'``, ``'forest'`` or ``'lightgbm'``.

    Returns:
        A new classifier instance, or ``None`` (after printing a message)
        when ``method`` is not supported.
    """
    if method == 'xgboost':
        # Keyword form: recent xgboost releases reject positional arguments.
        model = xgboost.XGBClassifier(max_depth=5)
    elif method == 'knn':
        model = KNeighborsClassifier()
    elif method == 'forest':
        model = RandomForestClassifier(n_estimators=10)
    elif method == 'lightgbm':
        model = lightgbm.LGBMClassifier()
    else:
        print(f'{method} not supported.')
        model = None
    return model

def get_metric(method):
    """Return the ``sklearn.metrics`` scoring function for a short name.

    Args:
        method: ``'acc'``, ``'auc'``, ``'roc_auc'`` or ``'f1'``.

    Returns:
        The matching function from ``sklearn.metrics``.

    Raises:
        ValueError: If ``method`` is not a known metric name.
    """
    # NOTE(review): ``metrics.auc`` integrates curve points (x, y) rather
    # than scoring (y_true, y_pred) -- confirm 'roc_auc' is not what callers
    # of 'auc' actually want.
    attribute_by_name = {
        'acc': 'accuracy_score',
        'auc': 'auc',
        'roc_auc': 'roc_auc_score',
        'f1': 'f1_score',
    }
    if method not in attribute_by_name:
        raise ValueError(f'Unknown metric: {method!r}')
    return getattr(metrics, attribute_by_name[method])

def train(x, y, method='xgboost', metric='acc', n_splits=5, random_state=None):
    """Score a model with shuffled K-fold cross-validation.

    A new model is created and fitted for every fold.

    Args:
        x: Feature frame (indexed positionally with ``.iloc``).
        y: Target Series (or frame) aligned with ``x``.
        method: Model name understood by ``get_model``.
        metric: Metric name understood by ``get_metric``.
        n_splits: Number of folds.
        random_state: Seed for the fold shuffle; ``None`` keeps it random.

    Returns:
        The mean of the per-fold scores.

    Raises:
        ValueError: If ``method`` does not name a supported model.
    """
    score_fn = get_metric(metric)

    if method == 'xgboost':
        # XGBoost needs class labels 0..k-1. Encode the whole target once so
        # training labels, test labels and predictions share one mapping
        # (assumes y is a one-dimensional Series).
        codes, _ = pd.factorize(y, sort=True)
        y = pd.Series(codes, index=y.index, name=y.name)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    scores = []
    for train_index, test_index in kf.split(x):
        model = get_model(method)
        if model is None:
            raise ValueError(f'Unsupported model: {method!r}')

        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(x_train, y_train)

        # Evaluate on the held-out fold
        preds = model.predict(x_test)
        scores.append(score_fn(y_test, preds))

    return sum(scores) / len(scores)
        

## P1: How does feature scaling (e.g. performing normalization) affect performance?
With standardization we apply the formula
$$x' = \frac{x - \mu}{\sigma}$$
so that each feature has zero mean and unit variance in the training data.

One-hot encoding should not be used with tree-based models; for these models we can use label encoding or feature hashing instead.
We should first encode the categorical values, then train the models on all our datasets both with feature scaling and without.

In [13]:
# Compare KNN accuracy on standardized features against the raw features.
for file in files:
    df = load(file, file not in no_headers)

    # Encode categorical columns (nothing to do for fully numeric files)
    cats = df.select_dtypes(include=['object'])
    if not cats.empty:
        df[cats.columns] = encode_categorical(cats)

    # Last column is the target; the unscaled features are the control
    features, labels = df.iloc[:, :-1], df.iloc[:, -1]

    # Feature scaling
    scaled = pd.DataFrame(scale_features(features), index=features.index)

    # Train model
    print(f'Training on {file}')
    score = train(scaled, labels, method='knn', metric='acc')
    print(f'scaled: {file}: avg: {score}')

    score = train(features, labels, method='knn', metric='acc')
    print(f'unscaled: {file}: avg: {score}')
    print('\n')
    # The performance is mostly the same across models

Training on covtype.data


KeyboardInterrupt: 

## P2: When using tree-based algorithms, will using one-hot encoding for categorical features generate worse performance than using label encoding? Why?
One-hot encoding transforms a categorical input taking values in $(1,\dots,n)$ into $n$ binary indicator columns. This method generates extra columns and usually results in a sparse matrix.

Label encoding replaces each unique class name with an integer.

In [None]:
# Compare one-hot encoded features against label encoded features with a
# random forest, using the same (label encoded) target for both runs.
for file in files:
    df = load(file, file not in no_headers)

    # Ignore files w/ no categorical data, nothing to compare there
    cats = df.select_dtypes(include=['object']).copy()
    if cats.empty:
        print(f'No categorical in {file}\n')
        continue

    # Label-encode every categorical column (the target too, if categorical)
    label = df.copy(deep=True)
    label[cats.columns] = encode_categorical(cats, method='label')
    x_label, y = label.iloc[:, :-1], label.iloc[:, -1]

    # One-hot encode the features only. get_dummies moves the dummy columns
    # to the end of the frame, so the target cannot be recovered by position
    # afterwards; split it off before encoding instead.
    x_ohe = pd.get_dummies(df.iloc[:, :-1])

    print(f'Training on {file}')
    score = train(x_ohe, y, method='forest', metric='acc')
    print(f'one-hot: {file}: avg: {score}')

    score = train(x_label, y, method='forest', metric='acc')
    print(f'label: {file}: avg: {score}')
    print('\n')
    ## There is a slight difference in performance (label encoder usually better by a bit)

## P3
Will feature binning provide performance improvement? When is binning useful (which models or which kinds of datasets)?

Loop through each column in the dataframe and bin individually.
We use LabelEncoder for categorical features on both DataFrames.

In [None]:
# Compare a random forest on binned numeric features against the raw ones.
for file in files:
    # TODO: fix arcene random space unable to parse
    if file == 'arcene_train.data':
        continue
    df = load(file, file not in no_headers)

    # Decide which columns to bin before any encoding changes the dtypes.
    # The last column is the target and must not be binned.
    numeric_features = (df.iloc[:, :-1]
                        .select_dtypes(include=['int64', 'float64'])
                        .columns)
    cat_columns = df.select_dtypes(include=['object']).columns

    # Use LabelEncoder for categorical columns on both DataFrames
    if len(cat_columns) > 0:
        df[cat_columns] = encode_categorical(df[cat_columns].copy(), method='label')

    # Bin each numeric feature individually into 5 equal-width bins
    binned = df.copy(deep=True)
    for column in numeric_features:
        codes = pd.cut(binned[column], 5, labels=False, duplicates='drop')
        # pd.cut leaves missing inputs as NaN; put those rows in bin 0
        binned[column] = codes.fillna(0)

    # Train
    x_bin, y_bin = binned.iloc[:, :-1], binned.iloc[:, -1]
    x_label, y_label = df.iloc[:, :-1], df.iloc[:, -1]

    print(f'Training on {file}')
    score = train(x_bin, y_bin, method='forest', metric='acc')
    print(f'binned: {file}: avg: {score}')

    score = train(x_label, y_label, method='forest', metric='acc')
    print(f'unbinned: {file}: avg: {score}')
    print('\n')

## P4
Compare the performance of 6 different categorical feature encoding methods based on Random Forest, XGBoost LightGBM, MLP, SVM. Which of the 6 encoding methods is better?

Encoding methods to test:
1. One hot encoding
2. Label Encoding
3. Feature Hashing
4. Frequency Encoding
5. Target Encoding
6. Leave One Out Encoding

In [55]:
# Encoding methods and models to compare; commented entries are not run yet.
ceMethods = [#'onehot',
              #'label',
              #'feature',
              #'target',
              'leaveoneout',
              'frequency'
             ]
modelNames = ['forest',
             'xgboost',
             #'lightgbm',
             #'MLP',
             #'SVM'
            ]

for file in files:
    print(f'Checking {file}')
    df = load(file, header=file not in no_headers)
    print(df.columns)
    for method in ceMethods:
        # Encode features on a copy so every method starts from the raw data
        print(f'Using {method}')
        try:
            encoded = encode_categorical(df.copy(), method=method)
        except Exception as err:
            # Report the failure instead of hiding it; unlike a bare except,
            # this still lets KeyboardInterrupt stop a long run.
            print(f'\t{method} failed on {file}: {err}')
            continue
        for modelName in modelNames:
            # TODO: train each model on `encoded` and record the score
            print(modelName)
            

Checking covtype.data
Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54],
           dtype='int64')
Using leaveoneout
forest
xgboost
Using frequency
forest
xgboost
Checking dataset
Index(['month', 'credit_amount', 'credit_term', 'age', 'sex', 'education',
       'product_type', 'having_children_flg', 'region', 'income',
       'family_status', 'phone_operator', 'is_client', 'bad_client_target'],
      dtype='object')
Using leaveoneout
forest
xgboost
Using frequency
forest
xgboost
Checking biodeg.csv
Index(['SpMax_L', 'J_Dz(e)', 'nHM', 'F01[N-N]', 'F04[C-N]', 'NssssC', 'nCb-',
       'C%', 'nCp', 'n)', 'F03[C-N]', 'SdssC', 'HwWi_B(m)', 'LOC', 'SM6_L',
       'F03[C-O]', 'Me', 'Mi', 'nN-N', 'nArNO2', 'nCRX3', 'SpPosA_B(p)',
       'nCIR', 'B01[C-BR]', 'B03[C-CI]', 'N-073', '

EmptyDataError: No columns to parse from file