<a href="https://colab.research.google.com/github/ameyanator/data-science/blob/main/template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd

In [10]:
# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, model, scorer):
    """Fit ``model`` on the training split and score it on the validation split.

    Args:
        X_train: Training features, passed to ``model.fit``.
        X_valid: Validation features, passed to ``model.predict``.
        y_train: Training targets, passed to ``model.fit``.
        y_valid: Validation targets, passed to ``scorer`` as the first argument.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            in place. Pass ``None`` to fall back to
            ``RandomForestRegressor(n_estimators=100, random_state=0)``.
        scorer: Callable invoked as ``scorer(y_valid, predictions)``.

    Returns:
        Whatever ``scorer`` returns for the validation predictions.
    """
    if model is None:
        # Fallback keeps the previous hard-coded baseline available; the import
        # is local because this notebook never imports it at the top level.
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return scorer(y_valid, preds)

In [11]:
# split categorical columns into those whose validation values all appear in the training set and those that do not
def seperate_good_bad_columns_categorical(X_train, X_valid):
  """Split the object-dtype columns of ``X_train`` into "good" and "bad" lists.

  A column is "good" when the set of values it takes in ``X_valid`` is a subset
  of the values it takes in ``X_train``; every other object-dtype column is
  "bad". Both lists are printed and follow the column order of ``X_train``.

  Args:
    X_train: Training DataFrame.
    X_valid: Validation DataFrame; must contain every object-dtype column of
      ``X_train``.

  Returns:
    Tuple ``(good_label_cols, bad_label_cols)`` of column-name lists.
  """
  # Categorical columns in the training data
  object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

  # Columns that can be safely ordinal encoded
  good_label_cols = [col for col in object_cols if
                    set(X_valid[col]).issubset(set(X_train[col]))]

  # Problematic columns that will be dropped from the dataset.
  # Built with a comprehension (not a set difference) so the order is stable
  # across runs instead of depending on hash ordering.
  good_lookup = set(good_label_cols)
  bad_label_cols = [col for col in object_cols if col not in good_lookup]

  print('Categorical columns that will be ordinal encoded:', good_label_cols)
  print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)
  return good_label_cols, bad_label_cols

In [12]:
def find_high_low_cardinality_cols(X_train, max_cardinality=10):
  """Partition the object-dtype columns of ``X_train`` by number of unique values.

  Columns with fewer than ``max_cardinality`` unique values are "low
  cardinality"; the remaining object-dtype columns are "high cardinality".
  Both lists are printed and follow the column order of ``X_train``.

  Args:
    X_train: Training DataFrame.
    max_cardinality: Exclusive upper bound on ``nunique()`` for a column to
      count as low cardinality. Defaults to 10.

  Returns:
    Tuple ``(high_cardinality_cols, low_cardinality_cols)`` of column-name
    lists. Note that the high-cardinality list comes first.
  """
  object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
  # Columns that will be one-hot encoded
  low_cardinality_cols = [col for col in object_cols
                          if X_train[col].nunique() < max_cardinality]

  # Columns that will be dropped from the dataset.
  # A comprehension keeps the original column order; a set difference would
  # make the order vary between runs.
  low_lookup = set(low_cardinality_cols)
  high_cardinality_cols = [col for col in object_cols if col not in low_lookup]

  print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
  print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)
  return high_cardinality_cols, low_cardinality_cols

In [13]:
from sklearn.preprocessing import OrdinalEncoder

# ordinal encode dataset
def get_ordinal_encoding(X_train, X_valid):
  """Ordinal-encode the categorical columns shared safely by both splits.

  Columns reported as "bad" by ``seperate_good_bad_columns_categorical`` are
  dropped from both frames; the "good" ones are replaced by the output of an
  ``OrdinalEncoder`` fitted on the training frame only.

  Args:
    X_train: Training DataFrame.
    X_valid: Validation DataFrame.

  Returns:
    Tuple ``(label_X_train, label_X_valid)`` of new, encoded DataFrames.
  """
  encodable, unencodable = seperate_good_bad_columns_categorical(X_train, X_valid)

  # ``drop`` hands back new frames, so the assignments below work on copies.
  encoded_train = X_train.drop(unencodable, axis=1)
  encoded_valid = X_valid.drop(unencodable, axis=1)

  # Fit on the training split, then reuse the same mapping for validation.
  encoder = OrdinalEncoder()
  encoded_train[encodable] = encoder.fit_transform(encoded_train[encodable])
  encoded_valid[encodable] = encoder.transform(encoded_valid[encodable])
  return encoded_train, encoded_valid

In [14]:
from sklearn.preprocessing import OneHotEncoder
#one hot encoding of dataset

def get_one_hot_encoding(X_train, X_valid):
  """One-hot encode the low-cardinality categorical columns of both splits.

  Low-cardinality columns (as reported by ``find_high_low_cardinality_cols``)
  are encoded with a ``OneHotEncoder`` fitted on ``X_train``. All object-dtype
  columns of ``X_train`` are then dropped from both frames and the encoded
  columns are appended, so high-cardinality columns are discarded entirely.

  Args:
    X_train: Training DataFrame.
    X_valid: Validation DataFrame.

  Returns:
    Tuple ``(OH_X_train, OH_X_valid)`` of new DataFrames whose column labels
    are all strings.
  """
  low_cardinality_cols = find_high_low_cardinality_cols(X_train)[1]

  # ``sparse`` was renamed to ``sparse_output`` in scikit-learn 1.2 and removed
  # in 1.4; try the new keyword first and fall back for older installations.
  try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
  except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
  OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
  OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

  # One-hot encoding removed index; put it back
  OH_cols_train.index = X_train.index
  OH_cols_valid.index = X_valid.index

  object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
  # Remove categorical columns (will replace with one-hot encoding)
  num_X_train = X_train.drop(object_cols, axis=1)
  num_X_valid = X_valid.drop(object_cols, axis=1)

  # Add one-hot encoded columns to numerical features
  OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
  OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

  # Ensure all columns have string type (encoded columns are labelled 0..k-1)
  OH_X_train.columns = OH_X_train.columns.astype(str)
  OH_X_valid.columns = OH_X_valid.columns.astype(str)
  return OH_X_train, OH_X_valid


In [15]:
# get the number of unique entries in each categorical column
def get_unique_entries_per_column(X_train):
  """Count the unique values in each object-dtype column of ``X_train``.

  Args:
    X_train: Training DataFrame.

  Returns:
    List of ``(column_name, unique_count)`` tuples sorted by ascending count.
    Columns with equal counts keep their order from ``X_train``.
  """
  object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
  # Number of unique entries in each column with categorical data
  d = {col: X_train[col].nunique() for col in object_cols}

  # Return the sorted pairs: inside a function a bare expression is discarded,
  # so without the ``return`` the caller would always receive ``None``.
  return sorted(d.items(), key=lambda x: x[1])

In [16]:
#drop cols with categorical data
def drop_cols_with_categorical_data(X_train, X_valid):
  """Return both splits with their object-dtype columns removed.

  Args:
    X_train: Training DataFrame.
    X_valid: Validation DataFrame.

  Returns:
    Tuple ``(drop_X_train, drop_X_valid)`` containing only the columns whose
    dtype is not ``object``.
  """
  # Apply the same dtype filter to each split independently.
  non_object_train, non_object_valid = (
      frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
  )
  return non_object_train, non_object_valid

In [17]:
#cols with missing values
def get_cols_with_missing_values(X_train):
  """Print the shape of ``X_train`` and the null count of each column with nulls.

  Args:
    X_train: Training DataFrame.

  Returns:
    None. The report is written to standard output only.
  """
  # Shape of training data (num_rows, num_columns)
  shape_report = "Shape of X_train: {}".format(X_train.shape)
  print(shape_report)

  # Per-column null counts, keeping only the columns that have any
  null_counts = X_train.isnull().sum()
  has_nulls = null_counts > 0
  print(null_counts[has_nulls])

In [18]:
#drop cols with missing data
def drop_cols_with_missing_data(X_train, X_valid):
  """Drop from both splits every column that has a null in ``X_train``.

  Only the training frame decides which columns are removed; nulls that appear
  solely in ``X_valid`` do not cause a column to be dropped.

  Args:
    X_train: Training DataFrame.
    X_valid: Validation DataFrame; must contain the columns being dropped.

  Returns:
    Tuple ``(reduced_X_train, reduced_X_valid)`` of new DataFrames.
  """
  # Boolean flag per training column: does it contain at least one null?
  null_flags = X_train.isnull().any()
  incomplete_cols = null_flags[null_flags].index.tolist()

  # Remove the same set of columns from both splits
  trimmed_train = X_train.drop(columns=incomplete_cols)
  trimmed_valid = X_valid.drop(columns=incomplete_cols)
  return trimmed_train, trimmed_valid

In [19]:
from sklearn.impute import SimpleImputer

#impute cols with missing data
def impute_cols_with_missing_data(X_train, X_valid, strategy):
  imputer = SimpleImputer(strategy=strategy)  # Or use other strategies like 'median', 'most_frequent', etc.
  imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
  imputed_X_valid = pd.DataFrame(imputer.transform(X_valid))

  imputed_X_train.columns = X_train.columns
  imputed_X_valid.columns = X_valid.columns