In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('customer_data_raw.csv')
df.head()

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Payment Method,Customer Age,Returns,Customer Name,Age,Gender,Churn
0,46251,9/8/20 9:38,Electronics,12,3,740,Credit Card,37,0.0,Christine Hernandez,37,Male,0
1,46251,3/5/22 12:56,Home,468,4,2739,PayPal,37,0.0,Christine Hernandez,37,Male,0
2,46251,5/23/22 18:18,Home,288,2,3196,PayPal,37,0.0,Christine Hernandez,37,Male,0
3,46251,11/12/20 13:13,Clothing,196,1,3509,PayPal,37,0.0,Christine Hernandez,37,Male,0
4,13593,11/27/20 17:55,Home,449,1,3452,Credit Card,49,0.0,James Grant,49,Female,1


In [3]:
# Define age ranges
def categorize_age(age):
    if age < 18:
        return 'Under 18'
    elif 18 <= age < 25:
        return '18-24'
    elif 25 <= age < 35:
        return '25-34'
    elif 35 <= age < 45:
        return '35-44'
    elif 45 <= age < 55:
        return '45-54'
    elif 55 <= age < 65:
        return '55-64'
    else:
        return '65 and above'

# Create a new 'Age Range' column
df['Age Range'] = df['Customer Age'].apply(categorize_age)

# Display the updated DataFrame
df.head()

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Payment Method,Customer Age,Returns,Customer Name,Age,Gender,Churn,Age Range
0,46251,9/8/20 9:38,Electronics,12,3,740,Credit Card,37,0.0,Christine Hernandez,37,Male,0,35-44
1,46251,3/5/22 12:56,Home,468,4,2739,PayPal,37,0.0,Christine Hernandez,37,Male,0,35-44
2,46251,5/23/22 18:18,Home,288,2,3196,PayPal,37,0.0,Christine Hernandez,37,Male,0,35-44
3,46251,11/12/20 13:13,Clothing,196,1,3509,PayPal,37,0.0,Christine Hernandez,37,Male,0,35-44
4,13593,11/27/20 17:55,Home,449,1,3452,Credit Card,49,0.0,James Grant,49,Female,1,45-54


In [4]:
df.to_csv("customer_data_updated.csv", index=False)

In [5]:
# Analyze purchases
purchase_counts = df['Customer ID'].value_counts().reset_index()
purchase_counts.columns = ['Customer ID', 'Purchase Count']

print("Purchase counts by customer ID:")
purchase_counts

Purchase counts by customer ID:


Unnamed: 0,Customer ID,Purchase Count
0,36437,17
1,39817,17
2,3576,14
3,7442,13
4,379,13
...,...,...
2759,35264,1
2760,2481,1
2761,41217,1
2762,7244,1


In [6]:
df = pd.merge(df, purchase_counts, on='Customer ID', how='left')
df.to_csv("customer_data_updated.csv", index=False)

In [7]:
# Categorize customer types based on purchase count
def categorize_customer_type(purchase_count):
    if purchase_count >= 10:
        return 'Platinum'
    elif 10 < purchase_count >= 7:
        return 'Gold'
    elif 7 < purchase_count >= 4:
        return 'Silver'
    else:
        return 'Bronze'

# Create a new column 'Customer Type'
df['Customer Type'] = df['Purchase Count'].apply(categorize_customer_type)
df

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Payment Method,Customer Age,Returns,Customer Name,Age,Gender,Churn,Age Range,Purchase Count,Customer Type
0,46251,9/8/20 9:38,Electronics,12,3,740,Credit Card,37,0.0,Christine Hernandez,37,Male,0,35-44,4,Bronze
1,46251,3/5/22 12:56,Home,468,4,2739,PayPal,37,0.0,Christine Hernandez,37,Male,0,35-44,4,Bronze
2,46251,5/23/22 18:18,Home,288,2,3196,PayPal,37,0.0,Christine Hernandez,37,Male,0,35-44,4,Bronze
3,46251,11/12/20 13:13,Clothing,196,1,3509,PayPal,37,0.0,Christine Hernandez,37,Male,0,35-44,4,Bronze
4,13593,11/27/20 17:55,Home,449,1,3452,Credit Card,49,0.0,James Grant,49,Female,1,45-54,5,Bronze
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16378,33417,5/1/20 7:23,Electronics,294,2,242,Credit Card,31,1.0,Melanie Perez,31,Female,0,25-34,5,Bronze
16379,33417,4/16/20 9:34,Clothing,405,3,4057,Cash,31,,Melanie Perez,31,Female,0,25-34,5,Bronze
16380,33417,9/12/22 19:20,Books,127,2,3724,Cash,31,1.0,Melanie Perez,31,Female,0,25-34,5,Bronze
16381,33417,4/30/21 12:13,Home,190,5,2982,PayPal,31,1.0,Melanie Perez,31,Female,0,25-34,5,Bronze


In [8]:
df.to_csv("customer_data_updated.csv", index=False)

In [9]:
popular_categories = df['Product Category'].value_counts()
print("\nPopular product categories:")
print(popular_categories)


Popular product categories:
Clothing       5045
Books          4888
Home           3229
Electronics    3221
Name: Product Category, dtype: int64


In [10]:
age_distribution = df['Age Range'].value_counts()
print("\nAge distribution:")
print(age_distribution)


Age distribution:
25-34           3330
45-54           3166
35-44           2946
55-64           2920
18-24           2216
65 and above    1805
Name: Age Range, dtype: int64


In [11]:
gender_distribution = df['Gender'].value_counts()
print("\nGender distribution:")
print(gender_distribution)


Gender distribution:
Male      8326
Female    8057
Name: Gender, dtype: int64


In [12]:
# Calculate return rate
return_rate = df['Returns'].sum() / df['Purchase Count'].sum()
print("\nReturn rate: {:.2%}".format(return_rate))


Return rate: 6.00%
