In [1]:
import pandas as pd
import matplotlib.pyplot as plt


# Loading and cleaning the customer data

In [2]:
# Read the raw customer export; its fields are separated by '|' rather than commas.
customer_list = pd.read_csv('customer_list_updated.csv', sep='|')


In [3]:
# Show the first ten rows exactly as loaded, before any cleaning is applied.
print(customer_list.head(n=10))

   cust_id        date      time           name  \
0        1  2023-03-15  08:45:12         Rachel   
1        2  2023-05-22  12:30:45     R&! Geller   
2        3  2023-07-09  18:15:27  Monica Geller   
3        4  2023-09-01  21:05:33  Chandler Bing   
4        5  2023-11-18  14:22:10           Joey   
5        6  2024-01-05  10:55:49     P&! Buffay   
6        7  2024-02-14  16:40:05        Gunther   
7        8  2023-04-20  09:15:30         Janice   
8        9  2023-06-30  13:50:55       Mike H.^   
9       10  2023-08-25  17:25:10          Emily   

                            email         phone sms-opt-out   
0       rachel@centralperk.coffee  212-555-1001            N  
1        rossg@centralperk.coffee  212-555-1002            N  
2   chefmonica@centralperk.coffee  212-555-1003            N  
3    chandlerb@centralperk.coffee  212-555-1004            Y  
4  howyoudoing@centralperk.coffee  212-555-1005            N  
5    smellycat@centralperk.coffee  212-555-1006            N

In [4]:
# Trim leading/trailing whitespace from every column label so the cells below
# can address columns by their plain names.
customer_list.columns = [label.strip() for label in customer_list.columns]


In [5]:
# Clean the 'name' column: drop stray symbols while keeping real name characters.
#
# Step 1 removes every character that is not a letter, whitespace, '.', '-' or
#        an apostrophe. `\w` is used instead of `a-zA-Z` so that (under Python's
#        `re`, which is Unicode-aware for str patterns) accented letters such as
#        'é' survive; the `[\d_]` alternative then removes the digits and
#        underscores that `\w` would otherwise let through.
# Step 2 collapses whitespace runs left behind by removed symbols into a single
#        space, and step 3 trims the ends (e.g. '& Joey' -> 'Joey', not ' Joey').
# Missing names stay missing: the `.str` methods propagate NaN.
customer_list['name'] = (
    customer_list['name']
    .str.replace(r"[^\w\s.'\-]|[\d_]", '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)


In [6]:
# Normalise phone numbers to NNN-NNN-NNNN.
#
# The number is rebuilt from its digits, so any separator style is handled
# ('(212) 555-1001', '212.555.1001', '212 555 1001', '2125551001'), and an
# 11-digit value with a leading 1 is treated as a country-code prefix and
# reduced to 10 digits (assumes North American numbers, as the target format
# implies). Values that do not come down to exactly 10 digits, and missing
# values, are left untouched instead of having hyphens inserted mid-number.
phone_raw = customer_list['phone']
phone_digits = (
    phone_raw
    .str.replace(r'[^0-9]', '', regex=True)
    .str.replace(r'^1([0-9]{10})$', r'\1', regex=True)
)
phone_formatted = phone_digits.str.replace(
    r'^([0-9]{3})([0-9]{3})([0-9]{4})$', r'\1-\2-\3', regex=True
)
# `where` keeps the formatted value only for rows with exactly 10 digits;
# every other row (including NaN, whose length compares False) keeps its raw value.
customer_list['phone'] = phone_formatted.where(phone_digits.str.len() == 10, phone_raw)


In [7]:
# Default every missing 'sms-opt-out' entry to 'N'; existing values are kept.
opt_out = customer_list['sms-opt-out']
customer_list['sms-opt-out'] = opt_out.mask(opt_out.isna(), 'N')


In [8]:
# Show the first five rows after cleaning to spot-check the result.
print(customer_list.head(n=5))


   cust_id        date      time           name  \
0        1  2023-03-15  08:45:12         Rachel   
1        2  2023-05-22  12:30:45       R Geller   
2        3  2023-07-09  18:15:27  Monica Geller   
3        4  2023-09-01  21:05:33  Chandler Bing   
4        5  2023-11-18  14:22:10           Joey   

                            email         phone sms-opt-out  
0       rachel@centralperk.coffee  212-555-1001           N  
1        rossg@centralperk.coffee  212-555-1002           N  
2   chefmonica@centralperk.coffee  212-555-1003           N  
3    chandlerb@centralperk.coffee  212-555-1004           Y  
4  howyoudoing@centralperk.coffee  212-555-1005           N  


In [9]:
# Write the cleaned customer frame to a comma-separated file, without the row index.
customer_list.to_csv(path_or_buf='cleaned_customer_list.csv', index=False)


# Comparing the original customer list with the cleaned version

In [10]:
# Read both versions back from disk for a before/after comparison:
# the cleaned file written above (comma-separated) and the untouched raw
# export (pipe-separated).
cleaned_customer_list = pd.read_csv('cleaned_customer_list.csv')
original_customer_list = pd.read_csv('customer_list_updated.csv', sep='|')


In [11]:
# Build a side-by-side view of each cleaned column before and after cleaning.
#
# The raw file is re-read without the header clean-up that was applied to
# `customer_list`, and the raw preview suggests the 'sms-opt-out' label is
# padded with whitespace. Looking it up by its plain name on the raw frame
# would miss, and `.get` would silently hand back its 'N/A' fallback for every
# row. Resolve columns on a header-trimmed view instead; `rename` returns a
# new frame, so `original_customer_list` itself is left unchanged.
original_trimmed = original_customer_list.rename(columns=str.strip)

comparison = pd.DataFrame({
    "Original Name": original_trimmed['name'],
    "Cleaned Name": cleaned_customer_list['name'],
    "Original Phone": original_trimmed['phone'],
    "Cleaned Phone": cleaned_customer_list['phone'],
    # Fallback kept for a raw file that genuinely has no opt-out column.
    "Original SMS Opt-Out": original_trimmed.get('sms-opt-out', 'N/A'),
    "Cleaned SMS Opt-Out": cleaned_customer_list['sms-opt-out']
})

# Display the first 10 rows for comparison
comparison.head(10)


Unnamed: 0,Original Name,Cleaned Name,Original Phone,Cleaned Phone,Original SMS Opt-Out,Cleaned SMS Opt-Out
0,Rachel,Rachel,212-555-1001,212-555-1001,,N
1,R&! Geller,R Geller,212-555-1002,212-555-1002,,N
2,Monica Geller,Monica Geller,212-555-1003,212-555-1003,,N
3,Chandler Bing,Chandler Bing,212-555-1004,212-555-1004,,Y
4,Joey,Joey,212-555-1005,212-555-1005,,N
5,P&! Buffay,P Buffay,212-555-1006,212-555-1006,,N
6,Gunther,Gunther,212-555-1007,212-555-1007,,N
7,Janice,Janice,212-555-1008,212-555-1008,,N
8,Mike H.^,Mike H.,212-555-1009,212-555-1009,,N
9,Emily,Emily,212-555-1010,212-555-1010,,N


In [12]:



# Re-read the raw and cleaned customer files so this cell works on fresh copies.
original_customer_list = pd.read_csv('customer_list_updated.csv', sep='|')
cleaned_customer_list = pd.read_csv('cleaned_customer_list.csv')

# Print the last 10 'phone' values of each version, raw first, under its heading.
for heading, frame in (
    ("Original Phone - Last 10 Entries:", original_customer_list),
    ("\nCleaned Phone - Last 10 Entries:", cleaned_customer_list),
):
    print(heading)
    print(frame['phone'].tail(10))


Original Phone - Last 10 Entries:
511    555-555-3201
512    555-555-3202
513    555-555-3203
514    555-555-3204
515    555-555-3205
516    555-555-3206
517    555-555-3207
518    555-555-3208
519    555-555-3209
520    555-555-3210
Name: phone, dtype: object

Cleaned Phone - Last 10 Entries:
511    555-555-3201
512    555-555-3202
513    555-555-3203
514    555-555-3204
515    555-555-3205
516    555-555-3206
517    555-555-3207
518    555-555-3208
519    555-555-3209
520    555-555-3210
Name: phone, dtype: object


# Loading and cleaning the transaction data

In [13]:
# Read the transaction export (comma-separated, pandas defaults).
transaction_data = pd.read_csv(filepath_or_buffer='transaction_data.csv')


In [14]:
# Show the first ten transaction rows exactly as loaded.
print(transaction_data.head(n=10))

   TransactionID  CustID       Date       Employee  ProductID  \
0          10258       1  7/17/2023  Nancy Davolio         32   
1          10275       2   8/7/2023  Nancy Davolio         24   
2          10275       2   8/7/2023  Nancy Davolio         59   
3          10292       3  8/28/2023  Nancy Davolio         20   
4          10293       4  8/29/2023  Nancy Davolio         18   
5          10293       4  8/29/2023  Nancy Davolio         24   
6          10293       4  8/29/2023  Nancy Davolio         63   
7          10293       4  8/29/2023  Nancy Davolio         75   
8          10304       4  9/12/2023  Nancy Davolio         59   
9          10304       4  9/12/2023  Nancy Davolio         71   

              ProductName  VendorID                    ProductVendor  \
0      Mascarpone Fabioli        14          Formaggi Fortini s.r.l.   
1        Guaran Fantstica        10        Refrescos Americanas LTDA   
2    Raclette Courdavault        28                      Gai pturage

In [15]:
# Parse 'Date' into datetime64 values. With errors='coerce', any entry that
# cannot be parsed becomes NaT instead of raising.
transaction_data = transaction_data.assign(
    Date=pd.to_datetime(transaction_data['Date'], errors='coerce')
)


In [16]:
# Drop rows that repeat an earlier row in every column, keeping the first occurrence.
transaction_data = transaction_data.drop_duplicates(keep='first')


In [17]:
# Count missing values in each column (this also counts dates that the
# errors='coerce' parse turned into NaT).
missing_data = transaction_data.isnull().sum()

# Print the heading and the counts separately: passing both to one print()
# call puts the default separator space after the newline, which indents the
# first row and misaligns the table.
print("Missing values per column:")
print(missing_data)



Missing values per column:
 TransactionID    0
CustID           0
Date             0
Employee         0
ProductID        0
ProductName      0
VendorID         0
ProductVendor    0
UnitPrice        0
Quantity         0
Subtotal         0
Tax(8%)          0
OrderTotal       0
dtype: int64


In [18]:
# Show the first five rows after date parsing and de-duplication.
print(transaction_data.head(n=5))

   TransactionID  CustID       Date       Employee  ProductID  \
0          10258       1 2023-07-17  Nancy Davolio         32   
1          10275       2 2023-08-07  Nancy Davolio         24   
2          10275       2 2023-08-07  Nancy Davolio         59   
3          10292       3 2023-08-28  Nancy Davolio         20   
4          10293       4 2023-08-29  Nancy Davolio         18   

              ProductName  VendorID              ProductVendor  UnitPrice  \
0      Mascarpone Fabioli        14    Formaggi Fortini s.r.l.       25.6   
1        Guaran Fantstica        10  Refrescos Americanas LTDA        3.6   
2    Raclette Courdavault        28                Gai pturage       44.0   
3  Sir Rodney's Marmalade         8   Specialty Biscuits, Ltd.       64.8   
4        Carnarvon Tigers         7              Pavlova, Ltd.       50.0   

   Quantity  Subtotal  Tax(8%)  OrderTotal  
0         6     153.6    12.29      165.89  
1        12      43.2     3.46       46.66  
2         6

In [19]:
# Write the cleaned transactions to a new comma-separated file, without the row index.
transaction_data.to_csv(path_or_buf='cleaned_transaction_data.csv', index=False)
