-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_cleaning.py
39 lines (31 loc) · 1.56 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from sklearn.base import BaseEstimator, TransformerMixin
from urls_and_paths.path import PROD_ID_AND_LINK
from logs.logger import App_Logger
class DropDuplicates(BaseEstimator, TransformerMixin):
    """
    Description: Transformer that removes duplicate rows from the dataset
    Parameters: transform(X) where X is expected to be a pandas Dataframe
                (anything exposing ``drop_duplicates`` works)
    Return: a new Dataframe without duplicate rows; X itself is not modified
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a de-duplicated copy of X and log how many rows remain."""
        # drop_duplicates() without inplace returns a new object, so the
        # caller's data is left untouched.
        deduplicated = X.drop_duplicates()
        total_records = len(deduplicated)

        logger = App_Logger()
        logger.log(
            module='preprocessing',
            msg_type='success',
            message=f'Duplicate rows dropped - after dropping duplicates total records saved is {total_records}',
        )
        return deduplicated
class DropColumns(BaseEstimator, TransformerMixin):
    """
    Description: Drop columns from Dataframe and return the Dataframe
    Parameters: transform(X) where X is a Dataframe
    Return: a new Dataframe without the dropped columns; X itself is not modified
    Columns to be dropped: product_id, product_link, product_description,
        product_image, Processor_Generation, Refresh_Rate
    product_id, product_link, product_description and product_image columns
        will be stored in a separate CSV file (PROD_ID_AND_LINK) for future use
    Raises: KeyError (from pandas) if any of the listed columns is missing in X
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Archive the product reference columns to CSV, then drop them and
        the other unused columns from a copy of X."""
        # Single source of truth for the archived columns: the drop list is
        # derived from it so the two can never drift apart.
        archived_columns = ['product_id', 'product_link', 'product_description', 'product_image']
        dropped_columns = archived_columns + ['Processor_Generation', 'Refresh_Rate']

        # Save the product reference columns before dropping them.
        X[archived_columns].to_csv(PROD_ID_AND_LINK, index=False)

        # drop() without inplace returns a new Dataframe, so no explicit
        # copy of X is needed and the caller's data stays untouched.
        return X.drop(columns=dropped_columns)