-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_data.py
29 lines (22 loc) · 892 Bytes
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pandas as pd
import re
# Matches every character that is NOT an ASCII letter, an ASCII digit, or
# whitespace. Compiled once because clean_text is called for every cell.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text):
    """Return *text* with every disallowed character removed.

    Only ASCII letters (a-z, A-Z), ASCII digits (0-9) and whitespace are
    kept. Everything else is deleted, not replaced: punctuation, symbols,
    underscores, and non-ASCII letters such as accented characters.

    Args:
        text: The string to clean.

    Returns:
        A new string containing only the characters that were kept.

    Raises:
        TypeError: If *text* is not a string (or bytes-like object).
    """
    return _DISALLOWED_CHARS.sub('', text)
def clean_csv(input_file, output_file):
    """Clean every text column of a CSV file and write the result.

    The file is loaded with pandas, ``clean_text`` is applied to each
    non-missing cell of every object-dtype column, and the frame is written
    to *output_file* without the index column. A confirmation line is
    printed once the file has been saved.

    Args:
        input_file: Path (or file-like object) of the CSV to read.
        output_file: Path (or file-like object) the cleaned CSV is written to.
    """
    df = pd.read_csv(input_file)

    # Only object-dtype columns are cleaned; other dtypes are left untouched.
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        # Missing cells are passed through unchanged: str(NaN) is the text
        # "nan", which would otherwise be written to the output as real data.
        df[col] = df[col].map(
            lambda value: value if pd.isna(value) else clean_text(str(value))
        )

    df.to_csv(output_file, index=False)
    print(f"Cleaned CSV saved to {output_file}")
if __name__ == "__main__":
    # The same path is used for reading and writing, so the cleaned data
    # overwrites the original CSV in place.
    csv_path = 'extracted_product_data.csv'
    clean_csv(csv_path, csv_path)