# Preprocessing

In [5]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute  import KNNImputer


#This function is used if one or more of your column has categorical data
def convert_categorical_to_numeric(df, label_encoders, mode):
    """
    Parameters: df the input DataFrame with some categorical columns, list of label encoders, mode ('train' or 'test').
    Returns: df_converted with all columns converted to numeric types.
    """
    # Create a copy of df to avoid modifying the original one
    df_converted = df.copy()
    # for each column in df
    for column in range(len(df_converted.columns)):
        # Initialise LabelEncoder that performs integer encoding as described in the lecture
        if mode == 'develop':
          label_encoders.append(LabelEncoder())
        # Check if the column is of type object (string/categorical)
        if df_converted.iloc[:,column].dtype == 'object':
          # if so, conert using integer encoding
          if mode == 'develop':
            df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
          else:
            df_converted.iloc[:,column] = label_encoders[column].transform(df_converted.iloc[:,column])
    return df_converted, label_encoders


#This function is used for converting model output into categorical form.
#Used for classification tasks only
def convert_numeric_to_categorical(y, label_encoders):
    df_converted = pd.DataFrame(y)

    # for each column in df
    for column in range(len(df_converted.columns)):
        # Check if the column is numerical
        if pd.api.types.is_numeric_dtype(df_converted.iloc[:,column]):
          df_converted.iloc[:,column] = label_encoders[column].inverse_transform(df_converted.iloc[:,column].astype(int))

    return df_converted



def replace_missing_values_with_neighours_data(data, n_neighbors=1):
    # in this specific dataset, missing values are indicated by '?'
    # so we will need to first replace all '?' values with NaN
    df_replaced = data.replace('?', np.nan)

    # Initialize KNNImputer with the specified number of neighbors
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_numeric = df_replaced.apply(pd.to_numeric, errors='coerce')
    df_imputed = pd.DataFrame(imputer.fit_transform(df_numeric), columns=df_numeric.columns)
    return df_imputed



# This function can be used for data augmentation
# Below is just one example of data augmentation, but you can use any other suitable data augmentation technique
# If you don't want to use any data augmentation for your task, then in the first line of the function write return X,y
def augment_data(X,y, noise_factor=0.05):
  """
  Augments the data by adding noise to the 'capital.gain' and 'capital.loss' columns.

  Parameters:  X,y: data before augmentation
              noise_factor: The factor to multiply the standard deviation of the columns by to generate noise.

  Returns:The augmented data
  """
  # Make a copy of the data to avoid modifying the original
  x_noisy = X.copy()

  # Add noise to 'capital.gain' which is column # 10
  noise_gain = np.random.normal(0, noise_factor * X[:,10].std(), size=len(X[:,10]))
  x_noisy[:,10] += noise_gain

  # Add noise to 'capital.loss' which is column # 11
  noise_loss = np.random.normal(0, noise_factor * X[:,11].std(), size=len(X[:,11]))
  x_noisy[:,11] += noise_loss

  #use both original & newly created datapoints for training
  x_augmented = np.vstack([x_noisy, X])
  y_augmented = np.concatenate([y, y])
  return x_augmented, y_augmented



# This function can be used for feature engineering
# Below is just one example of feature engineering where we kept the original raw data and added an extra column. You can use any other features as deemed appropriate for your problem
def feature_engineering(x):
  # todo: add new features as needed
  x_copy = x.copy()
  #example, here we add a new feature: capital.net calculated as the capital.gain-capital.loss
  capital_net = x_copy[:,10] - x_copy[:,11]

  x_copy = np.hstack((x_copy, capital_net.reshape(-1,1)))
  return x_copy



# DevelopAndEvaluate

In [23]:

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split


# if mode is 'develop', it means that we will use the available data for training our model
#if mode is 'evaluate', it means that we will load the model that was trained before (as well as any operators for data pre-processing) to make prediction on novel data
mode = 'develop'

#How many outputs (i.e., columns) your model should predict
num_outputs  = 1

if mode == 'develop':
  #read data from file
  all_data = pd.read_csv('./data/adult1.csv')
  print("reading data complete")

  #convert categorical data to numeric (similar to what we did in Lecture 4 - page 10)
  label_encoders = []
  all_data_numeric,label_encoders = convert_categorical_to_numeric(all_data, label_encoders, mode)
  print("categorical to numerical conversion complete")
  print("shape of data before categorical concersion", all_data.shape)
  print("shape of data after categorical concersion", all_data_numeric.shape)


  #replace missing data with data from the most similar raw (similar to what we did in Lecture 4 - page 6)
  all_data_numeric_no_missing = replace_missing_values_with_neighours_data(all_data_numeric)
  print("missing data replacement complete")


  X = all_data_numeric_no_missing.iloc[:, :-num_outputs] # inputs
  y = all_data_numeric_no_missing.iloc[:, -num_outputs:] # outputs
  # split into train & test sets
  x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

  # Standardise the traing data (similar to what we did in Lecture 4 - pages 15-16)
  scaler = StandardScaler()
  x_train_scaled = scaler.fit_transform(x_train)


  #augment data (similar to what we did in Lecture 4 - pages 39)
  x_train_augmented, y_train_augmented = augment_data(x_train_scaled,y_train)

  # Apply feature engineering
  x_train_feature_engineered = feature_engineering(x_train_augmented)

  # Apply PCA to reduce the dimensionality of the data (similar to what we did in Lecture 4 - pages 28-29)
  pca = PCA(n_components=8)
  pca.fit(x_train_feature_engineered)
  x_train_pca = pca.transform(x_train_feature_engineered)

  #Train the model
  # we specify that 10% of the training data will be used for validation. Ealy stopping is applied (similar to what we did in Lecture 4, page 33)
  # we also use alpha =0.1 to activate L2 regularisation
  model = MLPClassifier(hidden_layer_sizes=(24, 48), activation= 'relu', early_stopping = True, validation_fraction =0.1,alpha =0.1 , verbose = False, learning_rate_init=0.01, batch_size=80 )
  model.fit(x_train_pca, y_train_augmented)
  train_score = model.score(x_train_pca, y_train_augmented)
  print("Training score:", train_score)

  # test model
  x_test_scaled = scaler.transform(x_test)
  x_test_feature_engineered = feature_engineering(x_test_scaled)
  x_test_pca = pca.transform(x_test_feature_engineered)
  test_score = model.score(x_test_pca, y_test)
  print("Testing score:", test_score)

  # now save the model, the PCA object, the scaler object & the encoders so that you can load them at a later point & use them for novel data
  with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

  with open('pca.pkl', 'wb') as f:
    pickle.dump(pca, f)

  with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

  with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)



# Evaluate the model
elif mode == 'evaluate':
  # read the pre-trained model, the PCA object & the scaler object & encoders
  model = pickle.load(open('model.pkl', 'rb'))
  pca = pickle.load(open('pca.pkl', 'rb'))
  scaler = pickle.load(open('scaler.pkl', 'rb'))
  label_encoders= pickle.load(open('label_encoders.pkl', 'rb'))

  # Read novel data. This data usually has no output y. You'll need to use your model to predict y
  novel_data= pd.read_csv('./data/adult2.csv')

  # you will need to do the exact same pre-preocessing for this novel data similar to what you did for your training data

  #convert categorical data to numeric
  novel_data_numeric, _ = convert_categorical_to_numeric(novel_data, label_encoders, mode)

  #replace missing data with data from the most similar raw
  novel_data_no_missing = replace_missing_values_with_neighours_data(novel_data_numeric)



  X_novel = novel_data_no_missing    # novel data will not contain any output; so use all columns as X
  X_novel = scaler.transform(X_novel)
  # standardise
  X_novel = feature_engineering(X_novel)
  # get pca components
  X_novel_pca = pca.transform(X_novel)
  y_predict = model.predict(X_novel_pca)
  print("prediction complete")
  #if output was originally categorical, then convert back to its original form
  y_predict = convert_numeric_to_categorical(y_predict, label_encoders[-num_outputs:])
  print(y_predict)
  #save the prediction to a csv file
  pd.DataFrame(y_predict).to_csv('prediction.csv')

reading data complete
categorical to numerical conversion complete
shape of data before categorical concersion (19536, 15)
shape of data after categorical concersion (19536, 15)
missing data replacement complete


  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  df_converted.iloc[:,column] = label_encoders[column].fit_transform(df_converted.iloc[:,column])
  y = column_or_1d(y, warn=True)


Training score: 0.84
Testing score: 0.8392765739634874
