In [1]:
import pandas as pd
from prophet import Prophet
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import numpy as np
import datetime as dt
import hvplot.pandas
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy.stats import linregress
import holoviews as hv
hv.extension('bokeh')
from bokeh.models import NumeralTickFormatter

In [2]:
data = pd.read_csv('Resources/credit_card_transactions.csv')
data.head()
data.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
1296670,1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,...,-112.4777,258,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0,
1296671,1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.7,Jeffrey,White,M,8617 Holmes Terrace Suite 651,...,-77.5101,100,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0,22630.0
1296672,1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,...,-105.8189,899,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0,88351.0
1296673,1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.9,Joseph,Murray,M,42933 Ryan Underpass,...,-102.5411,1126,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.78894,-103.24116,0,69367.0
1296674,1296674,2020-06-21 12:13:37,4292902571056973207,"fraud_Langosh, Wintheiser and Hyatt",food_dining,4.3,Jeffrey,Smith,M,135 Joseph Mountains,...,-113.8748,218,"Therapist, horticultural",1995-08-16,8f7c8e4ab7f25875d753b422917c98c9,1371816817,46.565983,-114.18611,0,59870.0


In [3]:
data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')

In [4]:
unique_categoreis = data['category'].unique()
print(unique_categoreis)

['misc_net' 'grocery_pos' 'entertainment' 'gas_transport' 'misc_pos'
 'grocery_net' 'shopping_net' 'shopping_pos' 'food_dining' 'personal_care'
 'health_fitness' 'travel' 'kids_pets' 'home']


In [5]:
data['cc_num'] = data['cc_num'].astype(str)
data['amt'] = data['amt'].astype(float)

In [6]:
# Convert transaction time to datetime format
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

# Calculate age based on date of birth and most recent transaction date
data['dob'] = pd.to_datetime(data['dob'])
most_recent_date = data['trans_date_trans_time'].max()
data['age'] = data['dob'].apply(lambda x: most_recent_date.year - x.year - ((most_recent_date.month, most_recent_date.day) < (x.month, x.day)))

# Define age bins
age_bins = [0, 18, 28, 38, 48, 58, 68, 78, np.inf]
age_labels = ['0-18', '19-28', '29-38', '39-48', '49-58', '59-68', '69-78', '79-up']
data['age_bin'] = pd.cut(data['age'], bins=age_bins, labels=age_labels, right=False)

# Aggregate by 'cc_num' to get unique ages and total spend per credit card
# Then group by age bins to find the average spending per age group
cc_aggregated = data.groupby('cc_num').agg(
    age=('age', 'first'),  # Since each cc_num corresponds to one age
    total_spent=('amt', 'sum')
).reset_index()

# Bin ages for aggregated data
cc_aggregated['age_bin'] = pd.cut(cc_aggregated['age'], bins=age_bins, labels=age_labels, right=False)

# Calculate average spending by age bin
age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()

# Calculate the correlation between age bins and spending
# Convert age bins to numerical values for correlation
age_spending['age_bin_numeric'] = age_spending['age_bin'].apply(lambda x: age_labels.index(x))
correlation = age_spending[['age_bin_numeric', 'total_spent']].corr()

# Display the correlation value
print(correlation)

# Plot spending per age bin
plot1 = age_spending.hvplot.bar(
    x='age_bin',
    y='total_spent',
    title="Average Spending by Age Bin",
    xlabel="Age Bin",
    ylabel="Average Total Spent",
    width=800,
    height=500,
    color='green'
)

plot1

                 age_bin_numeric  total_spent
age_bin_numeric         1.000000    -0.836794
total_spent            -0.836794     1.000000


  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()


In [7]:
# Calculate age based on date of birth and most recent transaction date
data['dob'] = pd.to_datetime(data['dob'])
most_recent_date = data['trans_date_trans_time'].max()
data['age'] = data['dob'].apply(lambda x: most_recent_date.year - x.year - ((most_recent_date.month, most_recent_date.day) < (x.month, x.day)))

# Define age bins
age_bins = [0, 18, 28, 38, 48, 58, 68, 78, np.inf]
age_labels = ['0-18', '19-28', '29-38', '39-48', '49-58', '59-68', '69-78', '79-up']
data['age_bin'] = pd.cut(data['age'], bins=age_bins, labels=age_labels, right=False)

# List of categories
categories = ['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos',
              'grocery_net', 'shopping_net', 'shopping_pos', 'food_dining', 
              'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home']

# Create an empty list to store results
results = []

# Loop through each category to calculate spending by age bin
for category in categories:
    # Filter data for the current category
    category_data = data[data['category'] == category]
    
    # Aggregate by 'cc_num' to get unique ages and total spend per credit card for the category
    cc_aggregated = category_data.groupby('cc_num').agg(
        age=('age', 'first'),  # Since each cc_num corresponds to one age
        total_spent=('amt', 'sum')
    ).reset_index()

    # Bin ages for aggregated data
    cc_aggregated['age_bin'] = pd.cut(cc_aggregated['age'], bins=age_bins, labels=age_labels, right=False)

    # Calculate average spending by age bin
    age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()

    # Convert age bins to numerical values for correlation
    age_spending['age_bin_numeric'] = age_spending['age_bin'].apply(lambda x: age_labels.index(x))

    # Calculate the correlation between age bins and spending
    correlation = age_spending[['age_bin_numeric', 'total_spent']].corr().iloc[0, 1]  # Get the correlation value

    # Store the results
    results.append({'category': category, 'correlation': correlation, 'age_spending': age_spending})

# Create a DataFrame from results
correlation_df = pd.DataFrame(results)

# Display the correlations
print(correlation_df)

# Create plots for each category
plots = []
for result in results:
    category = result['category']
    age_spending = result['age_spending']

    # Create a bar plot for the category
    plot = age_spending.hvplot.bar(
        x='age_bin',
        y='total_spent',
        title=f"Average Spending by Age Bin for {category}",
        xlabel="Age Bin",
        ylabel="Average Total Spent",
        width=800,
        height=500,
        color='green'
    )
    plots.append(plot)

# Combine all plots into a layout
layout1 = hv.Layout(plots).cols(2)  # Adjust the number of columns as needed

# Display the combined layout
layout1

  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['total_spent'].mean().reset_index()
  age_spending = cc_aggregated.groupby('age_bin')['tot

          category  correlation  \
0         misc_net    -0.828774   
1      grocery_pos     0.087487   
2    entertainment    -0.920825   
3    gas_transport    -0.740955   
4         misc_pos    -0.889444   
5      grocery_net    -0.828854   
6     shopping_net    -0.905898   
7     shopping_pos    -0.908315   
8      food_dining    -0.014856   
9    personal_care    -0.658808   
10  health_fitness    -0.899522   
11          travel    -0.100034   
12       kids_pets     0.097861   
13            home    -0.103355   

                                         age_spending  
0     age_bin   total_spent age_bin_numeric
0    0...  
1     age_bin   total_spent age_bin_numeric
0    0...  
2     age_bin   total_spent age_bin_numeric
0    0...  
3     age_bin   total_spent age_bin_numeric
0    0...  
4     age_bin   total_spent age_bin_numeric
0    0...  
5     age_bin  total_spent age_bin_numeric
0    0-...  
6     age_bin   total_spent age_bin_numeric
0    0...  
7     age_bin   total_spen

In [8]:
# Drop columns and turn transation time into datetime
kade_data = data.drop(columns=['Unnamed: 0','first', 'last', 'street', 'city', 'zip', 'lat', 'long', 'city_pop', 'job','merch_zipcode', 'dob', 'unix_time','trans_num', 'merch_lat', 'merch_long'])
kade_data["trans_date_trans_time"] = pd.to_datetime(
    kade_data["trans_date_trans_time"],
    utc = True 
)
kade_data = kade_data.set_index('trans_date_trans_time')
kade_data.head()

Unnamed: 0_level_0,cc_num,merchant,category,amt,gender,state,is_fraud,age,age_bin
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-01 00:00:18+00:00,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,NC,0,32,29-38
2019-01-01 00:00:44+00:00,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,WA,0,42,39-48
2019-01-01 00:00:51+00:00,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,ID,0,58,59-68
2019-01-01 00:01:16+00:00,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,MT,0,53,49-58
2019-01-01 00:03:06+00:00,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,VA,0,34,29-38


In [9]:
kade_data['cc_num'] = kade_data['cc_num'].astype(str)
kade_data['amt'] = kade_data['amt'].astype(float)

In [10]:
# Filter the data to only include transactions from the year 2019
kade_data = kade_data[kade_data.index.year == 2019]

# Function to determine the season 
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

# Create a new column for the season based on the index
kade_data['season'] = kade_data.index.to_series().apply(get_season)

# List of categories
categories = ['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos',
              'grocery_net', 'shopping_net', 'shopping_pos', 'food_dining', 
              'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home']

# Create an empty list to store results
results = []

# Loop through each category to calculate spending by season
for category in categories:
    # Filter data for the current category
    category_data = kade_data[kade_data['category'] == category]
    
    # Aggregate by 'cc_num' to get unique seasons and total spend per credit card for the category
    cc_aggregated = category_data.groupby('cc_num').agg(
        season=('season', 'first'),  # Get the season associated with the credit card
        total_spent=('amt', 'sum')
    ).reset_index()

    # Calculate average spending by season
    season_spending = cc_aggregated.groupby('season')['total_spent'].mean().reset_index()

    # Convert seasons to numerical values for correlation
    season_spending['season_numeric'] = season_spending['season'].astype('category').cat.codes

    # Calculate the correlation between season spending and season codes
    correlation = season_spending[['season_numeric', 'total_spent']].corr().iloc[0, 1]

    # Store the results
    results.append({'category': category, 'correlation': correlation, 'season_spending': season_spending})

# Create a DataFrame from results
correlation_df = pd.DataFrame(results)

# Display the correlations
print(correlation_df)

# Create plots for each category
plots = []
for result in results:
    category = result['category']
    season_spending = result['season_spending']

    # Create a bar plot for the category
    plot = season_spending.hvplot.bar(
        x='season',
        y='total_spent',
        title=f"Average Spending by Season for {category}",
        xlabel="Season",
        ylabel="Average Total Spent",
        width=800,
        height=500,
        color='orange'
    )
    plots.append(plot)

# Combine all plots into a layout
layout2 = hv.Layout(plots).cols(2)  # Adjust the number of columns as needed

# Display the combined layout
layout2


          category  correlation  \
0         misc_net     0.620480   
1      grocery_pos     0.781363   
2    entertainment     0.743025   
3    gas_transport     0.769633   
4         misc_pos     0.748026   
5      grocery_net     0.791773   
6     shopping_net     0.701164   
7     shopping_pos     0.785024   
8      food_dining     0.969486   
9    personal_care     0.871982   
10  health_fitness     0.835412   
11          travel     0.744857   
12       kids_pets     1.000000   
13            home     0.947999   

                                      season_spending  
0      season  total_spent  season_numeric
0    Fa...  
1      season   total_spent  season_numeric
0    F...  
2      season  total_spent  season_numeric
0    Fa...  
3      season  total_spent  season_numeric
0    Fa...  
4      season  total_spent  season_numeric
0    Fa...  
5      season  total_spent  season_numeric
0    Fa...  
6      season  total_spent  season_numeric
0    Fa...  
7      season  total_spent

In [11]:
# Group by 'category' and calculate the total spending for each category
category_spending = kade_data.groupby('category')['amt'].sum().reset_index()

# Find the category with the highest total spending
highest_spending_category = category_spending.loc[category_spending['amt'].idxmax()]

# Display the result
print("Category with the highest total spending:")
print(highest_spending_category)


Category with the highest total spending:
category    grocery_pos
amt         10268432.18
Name: 4, dtype: object


In [12]:
import hvplot.pandas  # For plotting with hvPlot
import numpy as np

# Filter the data for 'grocery_pos' category
grocery_pos_data = data[data['category'] == 'grocery_pos']

# Calculate the season for each transaction
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

# Add a season column
grocery_pos_data['season'] = grocery_pos_data['trans_date_trans_time'].apply(get_season)

# Define age bins and labels
age_bins = [0, 18, 28, 38, 48, 58, 68, 78, np.inf]
age_labels = ['0-18', '19-28', '29-38', '39-48', '49-58', '59-68', '69-78', '79-up']

# Bin ages
grocery_pos_data['age_bin'] = pd.cut(grocery_pos_data['age'], bins=age_bins, labels=age_labels, right=False)

# Initialize an empty list for storing the plots
season_plots = []

# Loop through each season and create a plot for 'grocery_pos' spending by age bin
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    # Filter data for the current season
    season_data = grocery_pos_data[grocery_pos_data['season'] == season]
    
    # Aggregate by age bin and calculate average spending for each age bin
    age_spending = season_data.groupby('age_bin')['amt'].mean().reset_index()
    
    # Create a bar plot for the season
    plot = age_spending.hvplot.bar(
        x='age_bin',
        y='amt',
        title=f"Average Spending by Age Bin for Grocery POS in {season}",
        xlabel="Age Bin",
        ylabel="Average Total Spent",
        width=550,
        height=300,
        color='green'
    )
    
    # Append the plot to the list
    season_plots.append(plot)

# Combine all season plots into a layout
layout3 = hv.Layout(season_plots).cols(2)  # Adjust columns for layout

# Display the layout
layout3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grocery_pos_data['season'] = grocery_pos_data['trans_date_trans_time'].apply(get_season)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grocery_pos_data['age_bin'] = pd.cut(grocery_pos_data['age'], bins=age_bins, labels=age_labels, right=False)
  age_spending = season_data.groupby('age_bin')['amt'].mean().reset_index()
  age_spending = season_data.groupby('age_bin')['amt'].mean().reset_index()
  age_spending = season_data.groupby('age_bin')['amt'].mean().reset_index()
  age_spending = season_data.groupby('age_bi