In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
customer=pd.read_csv("../data/raw/customers.csv")
product=pd.read_csv("../data/raw/products.csv")
transactions=pd.read_csv("../data/raw/sales_transactions.csv")

In [5]:
customer.head()


Unnamed: 0,CustomerID,Name,Region,Signup_Date
0,CUST-001,Customer_1,east,2020-01-05
1,CUST-002,Customer_2,East,2020-01-12
2,CUST-003,Customer_3,West,2020-01-19
3,CUST-004,Customer_4,midwest,2020-01-26
4,CUST-005,Customer_5,midwest,2020-02-02


In [36]:
customer.describe()

Unnamed: 0,CustomerID,Name,Region,Signup_Date
count,105,105,105,105
unique,100,100,10,100
top,CUST-008,Customer_8,South,2020-02-23
freq,2,2,23,2


In [37]:
customer.info()

<class 'pandas.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   CustomerID   105 non-null    str  
 1   Name         105 non-null    str  
 2   Region       105 non-null    str  
 3   Signup_Date  105 non-null    str  
dtypes: str(4)
memory usage: 3.4 KB


In [38]:
#sign up date should be string
customer['Signup_Date']=pd.to_datetime(customer['Signup_Date'], errors='coerce')

In [39]:
product.describe()

Unnamed: 0,Price
count,20.0
mean,283.633
std,121.485899
min,63.69
25%,205.18
50%,303.745
75%,362.3275
max,464.98


In [40]:
product.info()

<class 'pandas.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ProductID  20 non-null     str    
 1   Category   20 non-null     str    
 2   Price      20 non-null     float64
dtypes: float64(1), str(2)
memory usage: 612.0 bytes


In [41]:
transactions.describe()

Unnamed: 0,TransactionID,Quantity,Discount
count,1000.0,1000.0,1000.0
mean,10500.5,9.882,0.0858
std,288.819436,70.281801,0.072998
min,10001.0,-9.0,0.0
25%,10250.75,3.0,0.05
50%,10500.5,5.0,0.05
75%,10750.25,7.0,0.1
max,11000.0,1000.0,0.2


In [42]:
transactions.info()


<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TransactionID  1000 non-null   int64  
 1   CustomerID     1000 non-null   str    
 2   ProductID      1000 non-null   str    
 3   Date           995 non-null    str    
 4   Quantity       1000 non-null   int64  
 5   Discount       1000 non-null   float64
 6   Total_Amount   971 non-null    str    
dtypes: float64(1), int64(2), str(4)
memory usage: 54.8 KB


In [None]:
#problems with data( transactions)
# (incorrect data types )
#transaction Date type is str
#total amount is str
#missing values in Date and total amount
#quantity min is negative

In [None]:
#Data Cleaning

In [43]:
product['Category'] = product['Category'].str.strip().str.title()
customer['Region'] = customer['Region'].str.strip().str.title()
customer['Name'] = customer['Name'].str.strip()
customer['Signup_Date'] = pd.to_datetime(customer['Signup_Date'], errors='coerce')

In [30]:
# handling missing values for Date ?


In [44]:
transactions['Quantity'].skew()
#since >1, then it's skewed
#since there is outlier or skewness in the dataset we will fill the missing values with median not mean

np.float64(14.021126545817783)

In [46]:
#fixing data types problem (Numerical cleaning )
transactions['Date']=pd.to_datetime(transactions['Date'], errors='coerce')
transactions['Total_Amount'] = pd.to_numeric(
    transactions['Total_Amount'],
    errors='coerce')


In [47]:
#Remove negative Quantity
transactions = transactions[transactions['Quantity'] >= 0]


In [48]:
transactions.info()


<class 'pandas.DataFrame'>
Index: 985 entries, 0 to 999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   TransactionID  985 non-null    int64         
 1   CustomerID     985 non-null    str           
 2   ProductID      985 non-null    str           
 3   Date           970 non-null    datetime64[us]
 4   Quantity       985 non-null    int64         
 5   Discount       985 non-null    float64       
 6   Total_Amount   669 non-null    float64       
dtypes: datetime64[us](1), float64(2), int64(2), str(2)
memory usage: 61.6 KB


In [None]:
#Phase 2

In [49]:
# Merge sales with customers
sales_customers = transactions.merge(customer, on='CustomerID', how='left')

# Merge the result with products
full_data = sales_customers.merge(product, on='ProductID', how='left')

print(full_data.head())
print(full_data.info())

   TransactionID CustomerID ProductID                Date  Quantity  Discount  \
0          10001   CUST-011  PROD-017 2023-01-01 00:00:00         1      0.10   
1          10001   CUST-011  PROD-017 2023-01-01 00:00:00         1      0.10   
2          10003   CUST-091  PROD-017 2023-01-01 02:00:00         7      0.05   
3          10003   CUST-091  PROD-017 2023-01-01 02:00:00         7      0.05   
4          10004   CUST-008  PROD-006 2023-01-01 03:00:00         1      0.00   

   Total_Amount         Name Region Signup_Date   Category   Price  
0           NaN  Customer_11  South  2020-03-15  Furniture  403.27  
1           NaN  Customer_11  South  2020-03-15  Furniture  403.27  
2           NaN  Customer_91  North  2021-09-26  Furniture  403.27  
3           NaN  Customer_91  North  2021-09-26  Furniture  403.27  
4         148.0   Customer_8   West  2020-02-23  Furniture  148.00  
<class 'pandas.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 12 columns):
 # 

In [None]:
#Phase 3


In [64]:
full_data[['Quantity', 'Price', 'Discount', 'Total_Amount', 'Calculated_Revenue']].head(10)

Unnamed: 0,Quantity,Price,Discount,Total_Amount,Calculated_Revenue
0,1,403.27,0.1,403.17,362.943
1,1,403.27,0.1,403.17,362.943
2,7,403.27,0.05,2822.54,2681.7455
3,7,403.27,0.05,2822.54,2681.7455
4,1,148.0,0.0,148.0,148.0
5,1,148.0,0.0,148.0,148.0
6,5,180.34,0.05,856.62,856.615
7,1,63.69,0.1,57.32,57.321
8,3,403.27,0.1,1088.83,1088.829
9,4,180.34,0.2,577.09,577.088


In [63]:
#Feature Engineering

# calculated revenue
full_data['Calculated_Revenue'] = full_data['Quantity'] * full_data['Price'] * (1 - full_data['Discount'])

#compare with the total amount (original dirty )
full_data['Revenue_Difference'] = full_data['Total_Amount'] - full_data['Calculated_Revenue']
print(full_data[['Total_Amount','Calculated_Revenue', 'Revenue_Difference']].head(10))

   Total_Amount  Calculated_Revenue  Revenue_Difference
0        403.17            362.9430             40.2270
1        403.17            362.9430             40.2270
2       2822.54           2681.7455            140.7945
3       2822.54           2681.7455            140.7945
4        148.00            148.0000              0.0000
5        148.00            148.0000              0.0000
6        856.62            856.6150              0.0050
7         57.32             57.3210             -0.0010
8       1088.83           1088.8290              0.0010
9        577.09            577.0880              0.0020


In [65]:
#fill total amount with the calculated amount
full_data['Total_Amount'] = full_data['Total_Amount'].fillna(full_data['Calculated_Revenue'])

# Show full data after filling Total_Amount
full_data[['CustomerID','ProductID','Quantity','Price','Discount','Total_Amount','Calculated_Revenue']].head(20)