In [0]:
# Load the customers CSV twice: once as a Spark DataFrame (`df`) and once as a
# pandas DataFrame (`pdf`). Both reads use the same file on the volume.
import pandas as pd

customers_csv = "/Volumes/workspace/customers/data/data.csv"

# Spark: treat the first line as the header and infer column types from the data.
df = spark.read.csv(customers_csv, header=True, inferSchema=True)
df.show(5)

# Pandas: read the same file directly and preview the first rows.
pdf = pd.read_csv(customers_csv)
print(pdf.head())


In [0]:
# Re-read the customers CSV from the volume into the Spark DataFrame `df`
# (rebinding the `df` created by the first cell), then preview it.
csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv("/Volumes/workspace/customers/data/data.csv", **csv_options)
df.show(5)         # first five rows
df.printSchema()   # column names and inferred types


In [0]:
# Convert the Spark DataFrame `df` to pandas so the later cells can loop over rows easily.
# This rebinds `pdf`, replacing the frame the first cell loaded with pd.read_csv.

import pandas as pd

# NOTE(review): toPandas() brings the whole DataFrame into local memory — presumably
# fine for this dataset; confirm its size before reusing this on larger data.
pdf = df.toPandas()
print(pdf.head())


In [0]:
# Build a plain dict describing one customer: the row with index label 0.
# `.at` is the scalar accessor; each lookup reads exactly one cell of `pdf`.

customer = {
    "id": pdf.at[0, "Customer Id"],
    # Full name is first and last name joined by a single space.
    "name": pdf.at[0, "First Name"] + " " + pdf.at[0, "Last Name"],
    "city": pdf.at[0, "City"],
    "country": pdf.at[0, "Country"],
    "email": pdf.at[0, "Email"],
}
print("Customer Dictionary:", customer)


In [0]:
# Take the first five entries of the "Company" column as a Python list,
# then print each one upper-cased on its own line.
companies = pdf["Company"].iloc[:5].tolist()

print("\nCompanies in Uppercase:")
for shouted_name in map(str.upper, companies):
    print(shouted_name)

In [0]:
# If-else logic: label every customer "Long-Term" (subscribed before 2020) or "New".

print("\nCustomer Subscription Check:")

# Parse the year for the whole column at once: the text before the first "-"
# (assumes YYYY-MM-DD). Missing or malformed dates become NaN instead of raising
# ValueError and aborting the cell part-way through the output.
subscription_years = pd.to_numeric(
    pdf["Subscription Date"].astype(str).str.split("-").str[0],
    errors="coerce",
)

# Walk the needed columns directly; iterrows() would build a Series per row.
for first_name, last_name, year in zip(pdf["First Name"], pdf["Last Name"], subscription_years):
    if pd.isna(year):
        status = "Unknown"  # no usable date, so do not guess a tenure
    else:
        status = "Long-Term" if year < 2020 else "New"
    print(f"{first_name} {last_name} → {status} Customer")


In [0]:
# Discount Function (simulate subscription discount)
#
# NOTE(review): the "Subscription Renewal Logic" cell further down rebinds the name
# `apply_discount` to a different function taking (year, fee), so which version a
# call resolves to depends on which cell ran last. Consider giving them distinct names.

def apply_discount(amount, rate=0.10):
    """Return ``amount`` reduced by the fraction ``rate``.

    Args:
        amount: Price before the discount.
        rate: Fraction of ``amount`` to take off, between 0 and 1 inclusive.
            Defaults to 0.10 (10%), the value this cell originally hard-coded.

    Returns:
        ``amount - amount * rate``.

    Raises:
        ValueError: If ``rate`` is not within [0, 1].
    """
    if not 0 <= rate <= 1:
        raise ValueError(f"rate must be between 0 and 1, got {rate!r}")
    return amount - (amount * rate)

print("\nDiscount Example:")
subscription_fee = 1000
final_price = apply_discount(subscription_fee)
print(f"Original Fee: {subscription_fee} | After Discount: {final_price}")


In [0]:
# ------------------------------
# 1. Customer Segmentation
# ------------------------------
print("\nCustomer Segmentation:")

# Parse the year for the whole column at once: the text before the first "-"
# (assumes format YYYY-MM-DD). Missing or malformed dates become NaN instead of
# raising ValueError and aborting the cell part-way through the output.
subscription_years = pd.to_numeric(
    pdf["Subscription Date"].astype(str).str.split("-").str[0],
    errors="coerce",
)

# Walk the needed columns directly; iterrows() would build a Series per row.
for first_name, last_name, year in zip(pdf["First Name"], pdf["Last Name"], subscription_years):
    # Business rules: before 2020 -> Loyal, 2020 through 2022 -> Regular, later -> New.
    if pd.isna(year):
        segment = "Unknown"  # no usable date, so do not guess a segment
    elif year < 2020:
        segment = "Loyal Customer"
    elif year <= 2022:
        segment = "Regular Customer"
    else:
        segment = "New Customer"

    print(f"{first_name} {last_name} → {segment}")

In [0]:
# ------------------------------
# 2. Country-Based Business Insights
# ------------------------------
print("\nCustomer Count by Country:")

# Tally customers per country, most frequent first. Rows whose Country is
# missing are left out of the tally (dropna=True is the pandas default).
country_counts = pdf["Country"].value_counts(dropna=True)

# Summary for regional marketing or support planning.
print(country_counts)

In [0]:
# ------------------------------
# 3. Company Insights
# ------------------------------
print("\nTop Companies by Customer Count:")

# value_counts() orders companies by number of customers, largest first;
# slicing the first five entries keeps the top 5.
company_counts = pdf["Company"].value_counts().iloc[:5]

# Summary for B2B targeting.
print(company_counts)


In [0]:
# ------------------------------
# 4. Contact Validation
# ------------------------------
print("\nMissing Contact Info:")

# Keep the customers where at least one of Email / Phone 1 is missing.
missing_contacts = pdf[pdf[["Email", "Phone 1"]].isna().any(axis=1)]

# Print only the name and contact columns for those customers.
print(missing_contacts.loc[:, ["First Name", "Last Name", "Email", "Phone 1"]])

In [0]:
# ------------------------------
# 5. Subscription Renewal Logic
# ------------------------------
print("\nSubscription Renewal Pricing:")

# NOTE(review): this rebinds `apply_discount`, which the earlier "Discount Function"
# cell defined with a different signature (amount only). Consider distinct names.
def apply_discount(year, fee=1000):
    """Return the renewal fee after the discount tied to the subscription year.

    Args:
        year: Year the customer subscribed.
        fee: Undiscounted renewal fee. Defaults to 1000.

    Returns:
        ``fee`` less 20% when ``year`` is before 2021, otherwise ``fee`` less 10%.
    """
    # Older customers (subscribed before 2021) get the larger discount.
    rate = 0.20 if year < 2021 else 0.10
    return fee - (fee * rate)

# Apply the renewal discount customer by customer.
# Parse the year for the whole column at once: the text before the first "-"
# (assumes YYYY-MM-DD). Missing or malformed dates become NaN instead of raising
# ValueError and aborting the cell part-way through the output.
subscription_years = pd.to_numeric(
    pdf["Subscription Date"].astype(str).str.split("-").str[0],
    errors="coerce",
)

# Walk the needed columns directly; iterrows() would build a Series per row.
for first_name, last_name, year in zip(pdf["First Name"], pdf["Last Name"], subscription_years):
    if pd.isna(year):
        # Without a subscription year no discount tier can be chosen; report and move on.
        print(f"{first_name} {last_name} | Year: unknown | Renewal Fee: n/a")
        continue
    year = int(year)  # plain int so it prints as e.g. 2020, not 2020.0
    discounted_price = apply_discount(year)
    print(f"{first_name} {last_name} | Year: {year} | Renewal Fee: {discounted_price}")