In [None]:
class Preprocessing:
    """Load a housing CSV, encode its categorical features and split it.

    The pipeline is: read the CSV, label encode every non-numeric feature,
    one-hot encode those same features, re-attach the target column and
    split the result into a train and a test set.
    """

    def fetch_train_test_data(self, path):
        """Read the CSV at *path* and return ``(train_set, test_set)``.

        Categorical features are encoded before the split so that both
        sets share the same one-hot layout.

        Args:
            path: Anything ``pandas.read_csv`` accepts; the file must
                contain the feature columns listed below and
                ``median_house_value``.

        Returns:
            Two DataFrames (80% / 20% of the rows, ``random_state=42``).
            Feature columns carry integer names (one-hot columns first,
            then the remaining features in their original order); the
            last column is ``median_house_value``.
        """
        raw_features_and_labels = pd.read_csv(path, sep=",")

        feature_columns = [
            "longitude",
            "latitude",
            "housing_median_age",
            "total_rooms",
            "total_bedrooms",
            "population",
            "households",
            "median_income",
            "ocean_proximity",
        ]
        target_columns = ["median_house_value"]

        features = raw_features_and_labels[feature_columns]
        labels = raw_features_and_labels[target_columns]
        # NOTE(review): missing values in the feature columns are passed
        # through untouched here - confirm they are imputed downstream.

        # Boolean mask (indexed by column name) of the categorical features.
        # "Not numeric" is used instead of ``dtype == object`` so that
        # string / category dtypes are recognised as categorical as well.
        categorical_feature_mask = pd.Series(
            [not pd.api.types.is_numeric_dtype(dtype) for dtype in features.dtypes],
            index=features.columns,
        )

        label_encoded_features = self.label_encode_categorical_features(
            features, categorical_feature_mask
        )

        # Label encoding alone implies an ordering between the classes of a
        # multiclass feature; one-hot encoding removes that bias.
        one_hot_encoded_features = pd.DataFrame(
            self.one_hot_encode_categorical_features(
                label_encoded_features, categorical_feature_mask
            )
        )

        # Both frames still share the default row index from read_csv, so an
        # index join re-attaches every target value to its own row.
        features_and_labels = one_hot_encoded_features.join(labels)

        train_set, test_set = train_test_split(
            features_and_labels, test_size=0.2, random_state=42
        )

        return train_set, test_set

    def label_encode_categorical_features(self, features, categorical_feature_mask):
        """Return a copy of *features* with categorical columns integer coded.

        Args:
            features: DataFrame holding all feature columns.
            categorical_feature_mask: One boolean per column of *features*,
                true for the columns that must be label encoded.

        Returns:
            A new DataFrame; *features* itself is left unmodified.
        """
        categorical_columns = [
            column
            for column, is_categorical in zip(features.columns, categorical_feature_mask)
            if is_categorical
        ]

        # Encode on a copy: the frame passed in is usually a column subset of
        # another frame, and assigning into it would mutate the caller's data
        # and raise pandas' SettingWithCopyWarning.
        encoded_features = features.copy()
        for column in categorical_columns:
            # A fresh encoder per column; each column has its own classes.
            encoded_features[column] = LabelEncoder().fit_transform(
                encoded_features[column]
            )

        print("After lable encoding, the categorical features appear as below: ")
        print(encoded_features[categorical_columns].head(10))
        # info() writes its report itself and returns None, so it must not be
        # wrapped in print().
        encoded_features.info()

        return encoded_features

    def one_hot_encode_categorical_features(self, features_with_label_encoded_categorical_features, categorical_feature_mask):
        """One-hot encode the masked columns and return a dense 2-D array.

        Args:
            features_with_label_encoded_categorical_features: Feature table
                whose categorical columns are already label encoded.
            categorical_feature_mask: One boolean per column, true for the
                columns to expand.

        Returns:
            ``numpy.ndarray`` with the one-hot columns first, followed by
            the untouched columns in their original order. When the mask
            selects nothing the input is returned unchanged (as an array).
        """
        import numpy as np

        data = np.asarray(features_with_label_encoded_categorical_features)
        mask = np.asarray(categorical_feature_mask, dtype=bool)

        if not mask.any():
            return data

        # Only the masked columns are handed to the encoder, and the default
        # (sparse) output is densified afterwards. This avoids the
        # ``categorical_features`` and ``sparse`` keyword arguments, which
        # newer scikit-learn releases no longer accept.
        one_hot_columns = OneHotEncoder().fit_transform(data[:, mask])
        if hasattr(one_hot_columns, "toarray"):
            one_hot_columns = one_hot_columns.toarray()

        return np.hstack([one_hot_columns, data[:, ~mask]])
    

