In [3]:
import pandas as pd


INPUT_FILE = "/Users/anushnabinde/Downloads/sales_data.txt"   
OUTPUT_FILE = "cleaned_transactions.csv"


def clean_data():
    # read input file
    df = pd.read_csv(
        INPUT_FILE,
        sep="|",
        skip_blank_lines=True
    )

    total_parsed = len(df)

    # drop rows with missing CustomerID or Region
    df_clean = df.dropna(subset=["CustomerID", "Region"])

    # remove commas from ProductName
    df_clean["ProductName"] = (
        df_clean["ProductName"]
        .astype(str)
        .str.replace(",", "", regex=False)
    )

    # remove commas from numeric features
    df_clean["Quantity"] = (
        df_clean["Quantity"]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(int)
    )

    df_clean["UnitPrice"] = (
        df_clean["UnitPrice"]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

    # applying filter rules
    df_clean = df_clean[df_clean["TransactionID"].str.startswith("T")]
    df_clean = df_clean[df_clean["Quantity"] > 0]
    df_clean = df_clean[df_clean["UnitPrice"] > 0]

    total_after_cleaning = len(df_clean)
    total_deleted = total_parsed - total_after_cleaning

    
    df_clean.to_csv(OUTPUT_FILE, index=False)

    
    print("Data Cleaning Summary")
    print("---------------------")
    print(f"Total records parsed         : {total_parsed}")
    print(f"Total records after cleaning : {total_after_cleaning}")
    print(f"Total records deleted        : {total_deleted}")
    print(f"Cleaned file saved as        : {OUTPUT_FILE}")


if __name__ == "__main__":
    clean_data()


Data Cleaning Summary
---------------------
Total records parsed       : 80
Total records after clean  : 70
Total records deleted      : 10
Cleaned file saved as      : cleaned_transactions.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["ProductName"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["Quantity"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["UnitPrice"] = (
