In [None]:
class DataPipeline:
    """Prepare the raw data frame: impute gaps, tame outliers, add features.

    Usage: call :meth:`fit` on the training frame to remember the column
    medians, then call :meth:`transform` on any frame with the same columns.
    """

    # Two-level categorical columns that are re-encoded as integers.
    _BINARY_COLUMNS = ('Shops_2', 'Ecology_2', 'Ecology_3')
    _BINARY_CODES = {'A': 1, 'B': 0}

    def __init__(self):
        """Create an unfitted pipeline; all statistics start as ``None``."""
        self.medians = None
        self.Rooms_median = None
        self.Square_median = None
        self.LifeSquare_median = None
        self.KitchenSquare_median = None
        self.HouseFloor_median = None
        self.HouseYear_median = None

    def fit(self, df):
        """Store the medians that :meth:`transform` uses as replacements.

        Args:
            df: Frame containing at least the columns ``LifeSquare``,
                ``Healthcare_1``, ``Rooms``, ``Square``, ``KitchenSquare``,
                ``HouseFloor`` and ``HouseYear``.

        Returns:
            The pipeline itself, so calls can be chained.
        """
        # Medians used to fill missing values (a Series indexed by column).
        self.medians = df[['LifeSquare', 'Healthcare_1']].median()

        # Medians used to overwrite outliers.
        self.Rooms_median = df['Rooms'].median()
        self.Square_median = df['Square'].median()
        self.LifeSquare_median = df['LifeSquare'].median()
        self.KitchenSquare_median = df['KitchenSquare'].median()
        self.HouseFloor_median = df['HouseFloor'].median()
        self.HouseYear_median = df['HouseYear'].median()
        return self

    @staticmethod
    def _replace_outliers(df, column, condition, replacement):
        """Mark rows matching ``condition`` in ``<column>_outlier`` and overwrite them."""
        flag = column + '_outlier'
        df[flag] = 0
        df.loc[condition, flag] = 1
        # ``replacement`` may be a scalar or a Series; a Series is aligned on
        # the index, so only the flagged rows receive their own value.
        df.loc[condition, column] = replacement

    def transform(self, df):
        """Clean ``df`` and add engineered features.

        Steps, in order: fill missing ``LifeSquare``/``Healthcare_1`` with the
        fitted medians; drop ``Id``; flag and replace outliers (adding one
        ``<column>_outlier`` indicator per processed column); encode the
        ``A``/``B`` columns as ``1``/``0``; add derived features and drop
        ``Social_1``/``Social_2``.

        Args:
            df: Frame with the raw columns. It is modified in place.

        Returns:
            The same ``df`` object after all modifications.

        Raises:
            ValueError: If called before :meth:`fit`, or if a binary column
                holds a value other than ``'A'``/``'B'``.
        """
        if self.medians is None:
            raise ValueError('DataPipeline.transform() called before fit()')

        # 1. Missing values
        fill_columns = ['LifeSquare', 'Healthcare_1']
        df[fill_columns] = df[fill_columns].fillna(self.medians)

        # 2. Drop columns that carry no signal
        df.drop(columns=['Id'], inplace=True)

        # 3. Outliers
        # NOTE: the quantile thresholds below are computed from the frame being
        # transformed, not from the frame passed to fit(); only the replacement
        # medians come from fit().

        # Rooms
        rooms = df['Rooms']
        condition = (rooms > np.quantile(rooms, q=0.99)) | (rooms < np.quantile(rooms, q=0.025))
        self._replace_outliers(df, 'Rooms', condition, self.Rooms_median)

        # Square
        square_max_value = np.quantile(df['Square'], q=0.997)
        square_min_value = np.quantile(df['Square'], q=0.002)
        condition = (df['Square'] > square_max_value) | (df['Square'] < square_min_value)
        self._replace_outliers(df, 'Square', condition, self.Square_median)

        # KitchenSquare
        kitchen_max_value = np.quantile(df['KitchenSquare'], q=0.99)
        condition = df['KitchenSquare'] > kitchen_max_value
        self._replace_outliers(df, 'KitchenSquare', condition, self.KitchenSquare_median)

        # LifeSquare: besides the quantile bounds, living area may not reach
        # the total area minus the kitchen (both already cleaned above).
        lifesquare_max_value = np.quantile(df['LifeSquare'], q=0.975)
        lifesquare_min_value = np.quantile(df['LifeSquare'], q=0.025)
        condition = (
            (df['LifeSquare'] < lifesquare_min_value)
            | (df['LifeSquare'] > lifesquare_max_value)
            | (df['LifeSquare'] >= (df['Square'] - df['KitchenSquare']))
        )
        # Replacement keeps the fitted median LifeSquare/Square proportion.
        life_to_square = self.LifeSquare_median / self.Square_median
        self._replace_outliers(df, 'LifeSquare', condition, life_to_square * df['Square'])

        # HouseFloor
        housefloor_max_value = np.quantile(df['HouseFloor'], q=0.99)
        condition = (df['HouseFloor'] > housefloor_max_value) | (df['HouseFloor'] == 0)
        self._replace_outliers(df, 'HouseFloor', condition, self.HouseFloor_median)

        # Floor: a flat cannot be above the top floor of its house.
        condition = df['HouseFloor'] < df['Floor']
        self._replace_outliers(df, 'Floor', condition, df['HouseFloor'])

        # HouseYear
        house_year_min_value = np.quantile(df['HouseYear'], q=0.025)
        condition = (df['HouseYear'] < house_year_min_value) | (df['HouseYear'] > 2021)
        self._replace_outliers(df, 'HouseYear', condition, self.HouseYear_median)

        # 4. Type conversion: 'A' -> 1, 'B' -> 0
        for column in self._BINARY_COLUMNS:
            # Unmapped values become NaN, so the int cast fails on them
            # instead of encoding them silently.
            df[column] = df[column].map(self._BINARY_CODES).astype(int)

        # 5. New features

        # Merge Social_1 and Social_2 into one column
        df['Social_1_2'] = (df['Social_2'] / 200) + df['Social_1']
        df.drop(['Social_1', 'Social_2'], axis=1, inplace=True)

        # LifeSquare / Square ratio
        df['LifeSquare/Square'] = df['LifeSquare'] / df['Square']

        # HouseFloor / Floor ratio
        df['HouseFloor/Floor'] = df['HouseFloor'] / df['Floor']

        # Non-residential area
        df['NonresidentialSquare'] = df['Square'] - df['LifeSquare'] - df['KitchenSquare']

        return df