In [1]:
import pandas as pd
from prophet import Prophet
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import numpy as np
import datetime as dt
import hvplot.pandas
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy.stats import linregress
import holoviews as hv
hv.extension('bokeh')
from bokeh.models import NumeralTickFormatter



In [2]:
data = pd.read_csv('Resources/credit_card_transactions.csv')
data.head()
data.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
1296670,1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,...,-112.4777,258,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0,
1296671,1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.7,Jeffrey,White,M,8617 Holmes Terrace Suite 651,...,-77.5101,100,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0,22630.0
1296672,1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,...,-105.8189,899,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0,88351.0
1296673,1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.9,Joseph,Murray,M,42933 Ryan Underpass,...,-102.5411,1126,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.78894,-103.24116,0,69367.0
1296674,1296674,2020-06-21 12:13:37,4292902571056973207,"fraud_Langosh, Wintheiser and Hyatt",food_dining,4.3,Jeffrey,Smith,M,135 Joseph Mountains,...,-113.8748,218,"Therapist, horticultural",1995-08-16,8f7c8e4ab7f25875d753b422917c98c9,1371816817,46.565983,-114.18611,0,59870.0


In [3]:
data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')

In [4]:
unique_categoreis = data['category'].unique()
print(unique_categoreis)

['misc_net' 'grocery_pos' 'entertainment' 'gas_transport' 'misc_pos'
 'grocery_net' 'shopping_net' 'shopping_pos' 'food_dining' 'personal_care'
 'health_fitness' 'travel' 'kids_pets' 'home']


In [5]:
data['cc_num'] = data['cc_num'].astype(str)
data['amt'] = data['amt'].astype(float)

In [6]:
# Select relevant columns for correlation analysis
# Replace these columns with the ones you want to analyze
relevant_data = data[['amt', 'city_pop']]

# Check for missing values
print(relevant_data.isnull().sum())

# Fill missing values or drop them if necessary
# For instance, you can drop rows with missing values in selected columns
relevant_data = relevant_data.dropna()

# Compute the correlation matrix
correlation_matrix = relevant_data.corr()

# Display the correlation matrix
print(correlation_matrix)


amt         0
city_pop    0
dtype: int64
               amt  city_pop
amt       1.000000  0.005818
city_pop  0.005818  1.000000


In [7]:
# Group by 'cc_num' to calculate total transactions and total spent for each credit card
cc_aggregated = data.groupby('cc_num').agg(
    total_transactions=('cc_num', 'size'),
    total_spent=('amt', 'sum')
).reset_index()

# Calculate the linear regression line
slope, intercept, _, _, _ = linregress(cc_aggregated['total_transactions'], cc_aggregated['total_spent'])
cc_aggregated['trend_line'] = intercept + slope * cc_aggregated['total_transactions']

# Create the scatter plot with the trend line
scatter_plot = cc_aggregated.hvplot.scatter(
    x='total_transactions', 
    y='total_spent', 
    title="Correlation between Total Transactions and Total Amount Spent with Trend Line",
    xlabel="Total Transactions",
    ylabel="Total Amount Spent",
    width=800,
    height=500,
    color='blue',
    alpha=0.6,
    size=8
)

# Add the trend line plot
trend_line = cc_aggregated.hvplot.line(
    x='total_transactions', 
    y='trend_line', 
    color='red', 
    line_width=2, 
    label='Trend Line'
)

# Display the correlation value
correlation = cc_aggregated[['total_transactions', 'total_spent']].corr()
print(correlation)

# Combine scatter plot and trend line
final_plot = scatter_plot * trend_line
final_plot



                    total_transactions  total_spent
total_transactions            1.000000     0.925058
total_spent                   0.925058     1.000000


In [8]:
# Convert transaction time to datetime format
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

# Find the most recent transaction date in the dataset
most_recent_date = data['trans_date_trans_time'].max()

# Calculate recency for each credit card
recency = data.groupby('cc_num')['trans_date_trans_time'].max().reset_index()
recency['recency'] = (most_recent_date - recency['trans_date_trans_time']).dt.days

# Calculate total spend for each credit card
total_spent = data.groupby('cc_num')['amt'].sum().reset_index()

# Merge recency and total spend data
recency_data = pd.merge(recency, total_spent, on='cc_num')
recency_data = recency_data.rename(columns={'amt': 'total_spent'})

# Calculate the linear regression line
slope, intercept, _, _, _ = linregress(recency_data['recency'], recency_data['total_spent'])
recency_data['trend_line'] = intercept + slope * recency_data['recency']

# Create the scatter plot with the trend line
scatter_plot = recency_data.hvplot.scatter(
    x='recency', 
    y='total_spent', 
    title="Correlation between Recency and Total Amount Spent with Trend Line",
    xlabel="Recency (Days since last transaction)",
    ylabel="Total Amount Spent",
    width=800,
    height=500,
    color='blue',
    alpha=0.6,
    size=8
)

# Add the trend line plot
trend_line = recency_data.hvplot.line(
    x='recency', 
    y='trend_line', 
    color='red', 
    line_width=2, 
    label='Trend Line'
)

# Display the correlation value
correlation = recency_data[['recency', 'total_spent']].corr()
print(correlation)

# Combine scatter plot and trend line
final_plot = scatter_plot * trend_line
final_plot


             recency  total_spent
recency      1.00000     -0.35378
total_spent -0.35378      1.00000


In [9]:
# Convert transaction time to datetime format
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

# Find the most recent transaction date in the dataset
most_recent_date = data['trans_date_trans_time'].max()

# Calculate recency for each credit card
recency = data.groupby('cc_num')['trans_date_trans_time'].max().reset_index()
recency['recency'] = (most_recent_date - recency['trans_date_trans_time']).dt.days

# Calculate total spend for each credit card
total_spent = data.groupby('cc_num')['amt'].sum().reset_index()

# Merge recency and total spend data
recency_data = pd.merge(recency, total_spent, on='cc_num')
recency_data = recency_data.rename(columns={'amt': 'total_spent'})

# Filter out anyone who hasn't made a transaction in the last 30 days
recency_data = recency_data[recency_data['recency'] <= 30]

# Create the scatter plot without the trend line
scatter_plot = recency_data.hvplot.scatter(
    x='recency', 
    y='total_spent', 
    title="Correlation between Recency and Total Amount Spent (Last 30 Days)",
    xlabel="Recency (Days since last transaction)",
    ylabel="Total Amount Spent",
    width=800,
    height=500,
    color='blue',
    alpha=0.6,
    size=8,
    yformatter=NumeralTickFormatter(format='0,0')  # Format the y-axis to show whole numbers
)

# Display the correlation value
correlation = recency_data[['recency', 'total_spent']].corr()
print(correlation)

# Display the scatter plot
scatter_plot


              recency  total_spent
recency      1.000000    -0.216881
total_spent -0.216881     1.000000


In [10]:
# Calculate age based on date of birth and most recent transaction date
data['dob'] = pd.to_datetime(data['dob'])
most_recent_date = data['trans_date_trans_time'].max()
data['age'] = data['dob'].apply(lambda x: most_recent_date.year - x.year - ((most_recent_date.month, most_recent_date.day) < (x.month, x.day)))

# Define age bins
age_bins = [0, 18, 28, 38, 48, 58, 68, 78, np.inf]
age_labels = ['0-18', '19-28', '29-38', '39-48', '49-58', '59-68', '69-78', '79-up']
data['age_bin'] = pd.cut(data['age'], bins=age_bins, labels=age_labels, right=False)

# Aggregate by 'cc_num' to get unique ages and total spend per credit card
# Then group by age bins to find the average spending per age group
cc_aggregated = data.groupby('cc_num').agg(
    age=('age', 'first'),  # Since each cc_num corresponds to one age
    total_spent=('amt', 'sum')
).reset_index()

# Bin ages for aggregated data
cc_aggregated['age_bin'] = pd.cut(cc_aggregated['age'], bins=age_bins, labels=age_labels, right=False)

# Calculate average spending by age bin
age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()

# Calculate the correlation between age bins and spending
# Convert age bins to numerical values for correlation
age_spending['age_bin_numeric'] = age_spending['age_bin'].apply(lambda x: age_labels.index(x))
correlation = age_spending[['age_bin_numeric', 'total_spent']].corr()

# Display the correlation value
print(correlation)

# Plot spending per age bin
plot = age_spending.hvplot.bar(
    x='age_bin',
    y='total_spent',
    title="Average Spending by Age Bin",
    xlabel="Age Bin",
    ylabel="Average Total Spent",
    width=800,
    height=500,
    color='green'
)

plot


                 age_bin_numeric  total_spent
age_bin_numeric         1.000000    -0.836794
total_spent            -0.836794     1.000000


  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()


In [11]:
# Calculate age based on date of birth and most recent transaction date
data['dob'] = pd.to_datetime(data['dob'])
most_recent_date = data['trans_date_trans_time'].max()
data['age'] = data['dob'].apply(lambda x: most_recent_date.year - x.year - ((most_recent_date.month, most_recent_date.day) < (x.month, x.day)))

# Define age bins
age_bins = [0, 18, 28, 38, 48, 58, 68, 78, np.inf]
age_labels = ['0-18', '19-28', '29-38', '39-48', '49-58', '59-68', '69-78', '79-up']
data['age_bin'] = pd.cut(data['age'], bins=age_bins, labels=age_labels, right=False)

# List of categories
categories = ['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos',
              'grocery_net', 'shopping_net', 'shopping_pos', 'food_dining', 
              'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home']

# Create an empty list to store results
results = []

# Loop through each category to calculate spending by age bin
for category in categories:
    # Filter data for the current category
    category_data = data[data['category'] == category]
    
    # Aggregate by 'cc_num' to get unique ages and total spend per credit card for the category
    cc_aggregated = category_data.groupby('cc_num').agg(
        age=('age', 'first'),  # Since each cc_num corresponds to one age
        total_spent=('amt', 'sum')
    ).reset_index()

    # Bin ages for aggregated data
    cc_aggregated['age_bin'] = pd.cut(cc_aggregated['age'], bins=age_bins, labels=age_labels, right=False)

    # Calculate average spending by age bin
    age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()

    # Convert age bins to numerical values for correlation
    age_spending['age_bin_numeric'] = age_spending['age_bin'].apply(lambda x: age_labels.index(x))

    # Calculate the correlation between age bins and spending
    correlation = age_spending[['age_bin_numeric', 'total_spent']].corr().iloc[0, 1]  # Get the correlation value

    # Store the results
    results.append({'category': category, 'correlation': correlation, 'age_spending': age_spending})

# Create a DataFrame from results
correlation_df = pd.DataFrame(results)

# Display the correlations
print(correlation_df)

# Create plots for each category
plots = []
for result in results:
    category = result['category']
    age_spending = result['age_spending']

    # Create a bar plot for the category
    plot = age_spending.hvplot.bar(
        x='age_bin',
        y='total_spent',
        title=f"Average Spending by Age Bin for {category}",
        xlabel="Age Bin",
        ylabel="Average Total Spent",
        width=800,
        height=500,
        color='green'
    )
    plots.append(plot)

# Combine all plots into a layout
layout = hv.Layout(plots).cols(2)  # Adjust the number of columns as needed

# Display the combined layout
layout


  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['tot

          category  correlation  \
0         misc_net    -0.828774   
1      grocery_pos     0.087487   
2    entertainment    -0.920825   
3    gas_transport    -0.740955   
4         misc_pos    -0.889444   
5      grocery_net    -0.828854   
6     shopping_net    -0.905898   
7     shopping_pos    -0.908315   
8      food_dining    -0.014856   
9    personal_care    -0.658808   
10  health_fitness    -0.899522   
11          travel    -0.100034   
12       kids_pets     0.097861   
13            home    -0.103355   

                                         age_spending  
0     age_bin   total_spent age_bin_numeric
0    0...  
1     age_bin   total_spent age_bin_numeric
0    0...  
2     age_bin   total_spent age_bin_numeric
0    0...  
3     age_bin   total_spent age_bin_numeric
0    0...  
4     age_bin   total_spent age_bin_numeric
0    0...  
5     age_bin  total_spent age_bin_numeric
0    0-...  
6     age_bin   total_spent age_bin_numeric
0    0...  
7     age_bin   total_spen

In [12]:
# Drop columns and turn transation time into datetime
kade_data = data.drop(columns=['Unnamed: 0','first', 'last', 'street', 'city', 'zip', 'lat', 'long', 'city_pop', 'job','merch_zipcode', 'dob', 'unix_time','trans_num', 'merch_lat', 'merch_long'])
kade_data["trans_date_trans_time"] = pd.to_datetime(
    kade_data["trans_date_trans_time"],
    utc = True 
)
kade_data = kade_data.set_index('trans_date_trans_time')
kade_data.head()

Unnamed: 0_level_0,cc_num,merchant,category,amt,gender,state,is_fraud,age,age_bin
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-01 00:00:18+00:00,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,NC,0,32,29-38
2019-01-01 00:00:44+00:00,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,WA,0,42,39-48
2019-01-01 00:00:51+00:00,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,ID,0,58,59-68
2019-01-01 00:01:16+00:00,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,MT,0,53,49-58
2019-01-01 00:03:06+00:00,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,VA,0,34,29-38


In [13]:
kade_data['cc_num'] = kade_data['cc_num'].astype(str)
kade_data['amt'] = kade_data['amt'].astype(float)

In [14]:
# Function to determine the season 
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

# Create a new column for the season based on the index
kade_data['season'] = kade_data.index.to_series().apply(get_season)

# List of categories
categories = ['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos',
              'grocery_net', 'shopping_net', 'shopping_pos', 'food_dining', 
              'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home']

# Create an empty list to store results
results = []

# Loop through each category to calculate spending by season
for category in categories:
    # Filter data for the current category
    category_data = kade_data[kade_data['category'] == category]
    
    # Aggregate by 'cc_num' to get unique seasons and total spend per credit card for the category
    cc_aggregated = category_data.groupby('cc_num').agg(
        season=('season', 'first'),  # Get the season associated with the credit card
        total_spent=('amt', 'sum')
    ).reset_index()

    # Calculate average spending by season
    season_spending = cc_aggregated.groupby('season')['total_spent'].mean().reset_index()

    # Convert seasons to numerical values for correlation
    season_spending['season_numeric'] = season_spending['season'].astype('category').cat.codes

    # Calculate the correlation between season spending and season codes
    correlation = season_spending[['season_numeric', 'total_spent']].corr().iloc[0, 1]  # Corrected line

    # Store the results
    results.append({'category': category, 'correlation': correlation, 'season_spending': season_spending})

# Create a DataFrame from results
correlation_df = pd.DataFrame(results)

# Display the correlations
print(correlation_df)

# Create plots for each category
plots = []
for result in results:
    category = result['category']
    season_spending = result['season_spending']

    # Create a bar plot for the category
    plot = season_spending.hvplot.bar(
        x='season',
        y='total_spent',
        title=f"Average Spending by Season for {category}",
        xlabel="Season",
        ylabel="Average Total Spent",
        width=800,
        height=500,
        color='orange'
    )
    plots.append(plot)

# Combine all plots into a layout
layout = hv.Layout(plots).cols(2)  # Adjust the number of columns as needed

# Display the combined layout
layout


          category  correlation  \
0         misc_net     0.701440   
1      grocery_pos     0.776836   
2    entertainment     0.751157   
3    gas_transport     0.771572   
4         misc_pos     0.747545   
5      grocery_net     0.795443   
6     shopping_net     0.756602   
7     shopping_pos     0.792620   
8      food_dining     0.746322   
9    personal_care     0.831629   
10  health_fitness     0.844347   
11          travel     0.745332   
12       kids_pets     1.000000   
13            home     0.941528   

                                      season_spending  
0      season  total_spent  season_numeric
0    Fa...  
1      season   total_spent  season_numeric
0    F...  
2      season  total_spent  season_numeric
0    Fa...  
3      season  total_spent  season_numeric
0    Fa...  
4      season  total_spent  season_numeric
0    Fa...  
5      season  total_spent  season_numeric
0    Fa...  
6      season  total_spent  season_numeric
0    Fa...  
7      season   total_spen

In [15]:

###DONT USE  DATA MAKES NO SENSE

# Function to get the month from a date
def get_month(date):
    return date.month_name()  # Returns the name of the month

# Create a new column for the month based on the index
kade_data['month'] = kade_data.index.to_series().apply(get_month)

# Define the order of months
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June', 
    'July', 'August', 'September', 'October', 'November', 'December'
]

# Convert 'month' column to a categorical type with ordered categories
kade_data['month'] = pd.Categorical(kade_data['month'], categories=month_order, ordered=True)

# List of categories
categories = ['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos',
              'grocery_net', 'shopping_net', 'shopping_pos', 'food_dining', 
              'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home']

# Create an empty list to store results
results = []

# Loop through each category to calculate spending by month
for category in categories:
    # Filter data for the current category
    category_data = kade_data[kade_data['category'] == category]
    
    # Aggregate by 'cc_num' to get unique months and total spend per credit card for the category
    cc_aggregated = category_data.groupby('cc_num').agg(
        month=('month', 'first'),  # Get the month associated with the credit card
        total_spent=('amt', 'sum')
    ).reset_index()

    # Calculate average spending by month
    month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()

    # Convert months to numerical values for correlation (January=0, February=1, etc.)
    month_spending['month_numeric'] = month_spending['month'].astype('category').cat.codes

    # Calculate the correlation between month spending and month codes
    correlation = month_spending[['month_numeric', 'total_spent']].corr().iloc[0, 1]  # Get the correlation value

    # Store the results
    results.append({'category': category, 'correlation': correlation, 'month_spending': month_spending})

# Create a DataFrame from results
correlation_df = pd.DataFrame(results)

# Display the correlations
print(correlation_df)

# Create plots for each category
plots = []
for result in results:
    category = result['category']
    month_spending = result['month_spending']

    # Create a bar plot for the category
    plot = month_spending.hvplot.bar(
        x='month',
        y='total_spent',
        title=f"Average Spending by Month for {category}",
        xlabel="Month",
        ylabel="Average Total Spent",
        width=800,
        height=500,
        color='blue'
    )
    plots.append(plot)

# Combine all plots into a layout
layout = hv.Layout(plots).cols(2)  # Adjust the number of columns as needed

# Display the combined layout
layout


  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['total_spent'].mean().reset_index()
  month_spending = cc_aggregated.groupby('month')['tot

          category  correlation  \
0         misc_net    -0.657763   
1      grocery_pos    -0.641069   
2    entertainment    -0.777125   
3    gas_transport    -0.637635   
4         misc_pos    -0.731922   
5      grocery_net    -0.725241   
6     shopping_net    -0.427856   
7     shopping_pos    -0.619946   
8      food_dining    -0.720390   
9    personal_care    -0.761520   
10  health_fitness    -0.725359   
11          travel    -0.786184   
12       kids_pets    -0.799252   
13            home    -0.720007   

                                       month_spending  
0           month  total_spent  month_numeric
0   ...  
1           month   total_spent  month_numeric
0  ...  
2           month  total_spent  month_numeric
0   ...  
3           month  total_spent  month_numeric
0   ...  
4           month  total_spent  month_numeric
0   ...  
5           month  total_spent  month_numeric
0   ...  
6           month  total_spent  month_numeric
0   ...  
7           month   total_

In [16]:
import pandas as pd
import numpy as np
import hvplot.pandas

# Sample DataFrame assuming 'data' is already loaded with transaction data
# Ensure 'trans_date_trans_time' is in datetime format and in UTC
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

# Find the most recent transaction date
most_recent_date = data['trans_date_trans_time'].max()

# Extract the most recent week (you can adjust this if needed)
start_of_week = most_recent_date - pd.DateOffset(days=most_recent_date.weekday())  # Start of the week
end_of_week = most_recent_date  # End of the week is the most recent date

# Filter data for the most recent week
weekly_data = data[(data['trans_date_trans_time'] >= start_of_week) & (data['trans_date_trans_time'] <= end_of_week)]

# Calculate recency in hours for the weekly data
recency = weekly_data.groupby('cc_num')['trans_date_trans_time'].max().reset_index()
recency['recency'] = (most_recent_date - recency['trans_date_trans_time']).dt.total_seconds() / 3600  # Convert to hours

# Calculate frequency: number of transactions per customer for the weekly data
frequency = weekly_data.groupby('cc_num').size().reset_index(name='transaction_count')

# Merge recency and frequency data
customer_engagement = pd.merge(recency[['cc_num', 'recency']], frequency, on='cc_num')

# Categorize recency into bins: 0-6, 6-12, 12-24 hours
recency_bins = [0, 6, 12, 24, np.inf]  # Bins for recency in hours
recency_labels = ['0-6 Hours', '6-12 Hours', '12-24 Hours', '24+ Hours']
customer_engagement['recency_category'] = pd.cut(customer_engagement['recency'], bins=recency_bins, labels=recency_labels, right=False)

# Adjusted frequency bins for weekly transactions: 10 to 70 by 10
frequency_bins = np.arange(0, 40, 5)  # Bins for frequency: 10 to 70 transactions
frequency_labels = [f'{i} Transactions' for i in range(0, 35, 5)]  # Labels for 10 to 60 transactions
frequency_labels.append('70+ Transactions')  # Add label for counts above 70

# Fix the label issue
customer_engagement['frequency_category'] = pd.cut(customer_engagement['transaction_count'], 
                                                    bins=frequency_bins.tolist() + [np.inf], 
                                                    labels=frequency_labels, 
                                                    right=False)

# Analyze engagement levels
engagement_summary = customer_engagement.groupby(['recency_category', 'frequency_category']).size().reset_index(name='customer_count')

# Display the summary
print(engagement_summary)

# Key Insight: Identify highly engaged customers
high_engaged = customer_engagement[(customer_engagement['recency'] < 6) & (customer_engagement['transaction_count'] >= 10)]
print("Highly Engaged Customers:")
print(high_engaged)

# Visualizing the engagement levels as a heatmap
engagement_plot = engagement_summary.hvplot.heatmap(
    x='frequency_category',
    y='recency_category',
    C='customer_count',
    title="Customer Engagement Heatmap for the Most Recent Week",
    xlabel="Frequency of Transactions",
    ylabel="Recency of Transactions",
    colorbar=True,
    width=900,
    height=500,
    cmap='viridis'  # You can change the colormap if desired
)

# Display the heatmap
engagement_plot


   recency_category frequency_category  customer_count
0         0-6 Hours     0 Transactions               4
1         0-6 Hours     5 Transactions              50
2         0-6 Hours    10 Transactions              67
3         0-6 Hours    15 Transactions              89
4         0-6 Hours    20 Transactions              83
5         0-6 Hours    25 Transactions              55
6         0-6 Hours    30 Transactions              45
7         0-6 Hours   70+ Transactions              51
8        6-12 Hours     0 Transactions               6
9        6-12 Hours     5 Transactions              36
10       6-12 Hours    10 Transactions              44
11       6-12 Hours    15 Transactions              38
12       6-12 Hours    20 Transactions              30
13       6-12 Hours    25 Transactions              17
14       6-12 Hours    30 Transactions              17
15       6-12 Hours   70+ Transactions              15
16      12-24 Hours     0 Transactions               8
17      12

  engagement_summary = customer_engagement.groupby(['recency_category', 'frequency_category']).size().reset_index(name='customer_count')
  df = obj.data.set_index(index_cols).groupby(index_cols, sort=False).first()
  dataset.data.groupby(group_by, sort=False)]
