In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation and FutureWarning
# notices; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [16]:
# pandas display defaults for the notebook: no column limit, at most 100 rows,
# and floats rendered with two decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

# Plot styling. The palette is applied after the style sheet on purpose: the
# style sheet sets its own colour cycle, which the palette then replaces.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

In [17]:
# Load the four raw CSV extracts (paths are relative to the working directory).
# The files are read in the order listed and bound positionally.
customers, campaigns, spend, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'customers.csv',
        'marketing_campaigns.csv',
        'marketing_spend.csv',
        'transactions.csv',
    )
)

In [18]:
# Report (rows, columns) for each freshly loaded frame, one per line.
print('Data Shapes:\n')
print('\n'.join(
    f'{label}: {frame.shape}'
    for label, frame in (
        ('customers', customers),
        ('campaigns', campaigns),
        ('spend', spend),
        ('transactions', transactions),
    )
))

Data Shapes:

customers: (31, 13)
campaigns: (30, 10)
spend: (30, 9)
transactions: (45, 16)


In [19]:
# Missing-value report: for every dataset print its name, then either the
# columns that contain nulls (with their counts) or a "no missing" note.
missings = {
    label: frame.isna().sum()
    for label, frame in (
        ('customers', customers),
        ('campaigns', campaigns),
        ('spend', spend),
        ('transactions', transactions),
    )
}
print('Missing Values:')
for name, series in missings.items():
    # Keep only the columns whose null count is positive.
    nonzero = series.loc[series.gt(0)]
    print(f'{name}:')
    print('  No missing values' if nonzero.empty else nonzero.to_string())
    print()

Missing Values:
customers:
  No missing values

campaigns:
  No missing values

spend:
  No missing values

transactions:
store_id           29
card_bank          41
shipping_amount     3
promo_code_used    26
device_type        16



In [20]:
# card_bank is null on 41 of the 45 transaction rows, so the column is dropped.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
transactions = transactions.drop(columns=['card_bank'], errors='ignore')

In [21]:
# Fill the remaining gaps in transactions with one per-column mapping:
#   - shipping_amount: column mean (computed before any filling)
#   - promo_code_used / store_id / device_type: sentinel category labels
# NOTE(review): store_id is null on 29 rows and device_type on 16, which sum to
# the 45 transactions. That is consistent with online orders having no store
# and in-store orders having no device, but confirm against the channel column.
transactions = transactions.fillna({
    'shipping_amount': transactions['shipping_amount'].mean(),
    'promo_code_used': 'NoPromo',
    'store_id': 'OnlinePurchase',
    'device_type': 'InStore',
})

In [22]:
# Re-run the missing-value report after the column drop and imputation to
# confirm that no nulls remain in any dataset.
missings = {
    label: frame.isna().sum()
    for label, frame in (
        ('customers', customers),
        ('campaigns', campaigns),
        ('spend', spend),
        ('transactions', transactions),
    )
}
print('Missing Values:')
for name, series in missings.items():
    # Keep only the columns whose null count is positive.
    nonzero = series.loc[series.gt(0)]
    print(f'{name}:')
    print('  No missing values' if nonzero.empty else nonzero.to_string())
    print()

Missing Values:
customers:
  No missing values

campaigns:
  No missing values

spend:
  No missing values

transactions:
  No missing values



In [23]:
# Duplicate-row report: for each frame, show the rows that exactly repeat an
# earlier row, or a short note when there are none.
datasets = dict(
    customers=customers,
    campaigns=campaigns,
    spend=spend,
    transactions=transactions,
)
for name, df in datasets.items():
    # duplicated() flags every repeat after the first occurrence.
    dup = df[df.duplicated()]
    print(f'{name}:')
    print('  No duplicate rows' if dup.empty else dup)
    print()

customers:
    customer_id first_name last_name             email     city state  age  \
30         1030       Eric     Young  eric.y@email.com  Houston    TX   46   

   gender customer_segment acquisition_date acquisition_channel  \
30      M          Dormant          12/1/20               Email   

    lifetime_value churn_risk  
30          267.90       High  

campaigns:
  No duplicate rows

spend:
  No duplicate rows

transactions:
  No duplicate rows



In [24]:
# Remove exact duplicate customer rows, keeping the first occurrence.
# Rebinding the name (instead of inplace=True) leaves any frame object that was
# captured earlier untouched, so previously displayed state is not silently
# mutated. The original index labels are kept (no reset).
customers = customers.drop_duplicates(keep='first')

In [25]:
# Confirm the duplicates are gone: rebuild the name -> frame mapping so it
# refers to the current frames, then repeat the duplicate-row report.
datasets = dict(
    customers=customers,
    campaigns=campaigns,
    spend=spend,
    transactions=transactions,
)

for name, df in datasets.items():
    # duplicated() flags every repeat after the first occurrence.
    dup = df[df.duplicated()]
    print(f'{name}:')
    print('  No duplicate rows' if dup.empty else dup)
    print()

customers:
  No duplicate rows

campaigns:
  No duplicate rows

spend:
  No duplicate rows

transactions:
  No duplicate rows



In [26]:
# Report (rows, columns) again after cleaning, one frame per line.
print('Data Shapes:\n')
print(
    *(
        f'{label}: {frame.shape}'
        for label, frame in {
            'customers': customers,
            'campaigns': campaigns,
            'spend': spend,
            'transactions': transactions,
        }.items()
    ),
    sep='\n',
)

Data Shapes:

customers: (30, 13)
campaigns: (30, 10)
spend: (30, 9)
transactions: (45, 15)


In [None]:
# Show the customers schema; acquisition_date is still a string column here.
print('Customers DataFrame:')
customers.info()

# Parse acquisition_date into datetime64. The column holds slash-separated
# dates with a two-digit year (e.g. '12/1/20'), which the ISO pattern
# '%Y-%m-%d' cannot parse (to_datetime raises ValueError), so the format must
# match the stored text.
# NOTE(review): assumes month-first (US) ordering; confirm against the CSV.
customers['acquisition_date'] = pd.to_datetime(customers['acquisition_date'], format='%m/%d/%y')

Customers DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 0 to 29
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_id          30 non-null     int64  
 1   first_name           30 non-null     object 
 2   last_name            30 non-null     object 
 3   email                30 non-null     object 
 4   city                 30 non-null     object 
 5   state                30 non-null     object 
 6   age                  30 non-null     int64  
 7   gender               30 non-null     object 
 8   customer_segment     30 non-null     object 
 9   acquisition_date     30 non-null     object 
 10  acquisition_channel  30 non-null     object 
 11  lifetime_value       30 non-null     float64
 12  churn_risk           30 non-null     object 
dtypes: float64(1), int64(2), object(10)
memory usage: 3.3+ KB


In [32]:
# Print a labelled schema summary (columns, non-null counts, dtypes) for campaigns.
print('\nCampaigns DataFrame:')
campaigns.info()


Campaigns DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   campaign_id      30 non-null     int64 
 1   campaign_name    30 non-null     object
 2   campaign_type    30 non-null     object
 3   channel          30 non-null     object
 4   start_date       30 non-null     object
 5   end_date         30 non-null     object
 6   budget           30 non-null     int64 
 7   target_audience  30 non-null     object
 8   campaign_goal    30 non-null     object
 9   status           30 non-null     object
dtypes: int64(2), object(8)
memory usage: 2.5+ KB


In [33]:
# Print a labelled schema summary (columns, non-null counts, dtypes) for spend.
print('\nSpend DataFrame:')
spend.info()


Spend DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   spend_id            30 non-null     int64  
 1   campaign_id         30 non-null     int64  
 2   spend_date          30 non-null     object 
 3   channel             30 non-null     object 
 4   impressions         30 non-null     int64  
 5   clicks              30 non-null     int64  
 6   spend_amount        30 non-null     float64
 7   conversions         30 non-null     int64  
 8   revenue_attributed  30 non-null     float64
dtypes: float64(2), int64(5), object(2)
memory usage: 2.2+ KB


In [34]:
# Print a labelled schema summary (columns, non-null counts, dtypes) for the
# cleaned transactions frame.
print('\nTransactions DataFrame:')
transactions.info()


Transactions DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   transaction_id      45 non-null     int64  
 1   customer_id         45 non-null     int64  
 2   order_date          45 non-null     object 
 3   channel             45 non-null     object 
 4   store_id            45 non-null     object 
 5   payment_method      45 non-null     object 
 6   subtotal            45 non-null     float64
 7   discount_amount     45 non-null     float64
 8   tax_amount          45 non-null     float64
 9   shipping_amount     45 non-null     float64
 10  total_amount        45 non-null     float64
 11  promo_code_used     45 non-null     object 
 12  device_type         45 non-null     object 
 13  fulfillment_method  45 non-null     object 
 14  return_flag         45 non-null     bool   
dtypes: bool(1), float64(5), int64(2), 