# FinMarks — Dataset Preprocessing (Cross‑Platform Paths)

This notebook loads and cleans three CSV inputs, engineers features, merges them, and writes **`merged_cleaned_dataset.csv`**.
It uses a **generic path resolver** that works across Windows, macOS, and Linux without any user‑specific paths.


In [1]:
# --- Imports & Settings ---
import os
from pathlib import Path
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 100)


In [2]:
# --- Robust path resolver (generic) ---
# You can optionally set RAW_DIR as an environment variable to point to a folder to search first.
#   e.g., in a terminal before launching Jupyter:  export RAW_DIR=/path/to/data
# Otherwise the resolver will search common locations.

RAW_DIR = os.environ.get("RAW_DIR")

SEARCH_DIRS = []
if RAW_DIR:
    SEARCH_DIRS.append(Path(RAW_DIR))

# Common cross‑platform locations to search (no user-specific paths)
SEARCH_DIRS += [
    Path.cwd(),                          # current notebook directory
    Path.cwd().parent,                   # project root (one level up)
    Path.cwd() / "data",                 # ./data
    Path.home(),                         # home directory
    Path.home() / "Downloads",           # Downloads
]

FILENAMES = {
    "demographics": "customer_demographics_contaminated.csv",
    "transactions": "customer_transactions_contaminated.csv",
    "social":       "social_media_interactions_contaminated.csv",
}

def resolve_path(name: str) -> Path:
    fname = FILENAMES[name]
    # 1) direct check
    for d in SEARCH_DIRS:
        p = d / fname
        if p.exists():
            return p
    # 2) shallow recursive search (bounded)
    for d in SEARCH_DIRS:
        try:
            matches = list(d.glob(f"**/{fname}"))
        except Exception:
            matches = []
        if matches:
            return matches[0]
    raise FileNotFoundError(
        f"Could not find '{fname}'.\n"
        "Tip: Place the CSVs beside this notebook, in a ./data folder, in your Downloads, "
        "or set environment variable RAW_DIR to the directory containing the files."
    )

demographics_path = resolve_path("demographics")
transactions_path = resolve_path("transactions")
social_path       = resolve_path("social")

print("Resolved paths:")
print("  demographics:", demographics_path)
print("  transactions:", transactions_path)
print("  social      :", social_path)


Resolved paths:
  demographics: /Users/arenriquez1/Downloads/customer_demographics_contaminated.csv
  transactions: /Users/arenriquez1/Downloads/customer_transactions_contaminated.csv
  social      : /Users/arenriquez1/Downloads/social_media_interactions_contaminated.csv


In [3]:
# --- Load ---
demographics = pd.read_csv(demographics_path)
transactions = pd.read_csv(transactions_path)
social       = pd.read_csv(social_path)

# Standardize id column name across frames early
for df in (demographics, transactions, social):
    if "CustomerID" in df.columns:
        df["CustomerID"] = df["CustomerID"].astype("string")


In [4]:
# --- Basic Cleaning ---
# Drop exact duplicates
demographics = demographics.drop_duplicates()
transactions = transactions.drop_duplicates()
social       = social.drop_duplicates()

# Coerce selected numeric columns if present
if "Age" in demographics.columns:
    demographics["Age"] = pd.to_numeric(demographics["Age"], errors="coerce")

if "Amount" in transactions.columns:
    transactions["Amount"] = pd.to_numeric(transactions["Amount"], errors="coerce")

# Handle missing values (lightweight defaults; adjust as needed)
if "gender" in demographics.columns:
    demographics["gender"] = demographics["gender"].fillna("Unknown")
if "Age" in demographics.columns:
    demographics["Age"] = demographics["Age"].fillna(demographics["Age"].median())

if "Amount" in transactions.columns:
    transactions["Amount"] = transactions["Amount"].fillna(transactions["Amount"].median())

# For social metrics, fill numeric nulls with 0
num_cols_social = social.select_dtypes(include="number").columns
social[num_cols_social] = social[num_cols_social].fillna(0)


In [5]:
# --- Feature Engineering ---
# Transactions: aggregate spend metrics per customer
if "Amount" in transactions.columns:
    transactions["Amount"] = pd.to_numeric(transactions["Amount"], errors="coerce").fillna(0)

transaction_features = (
    transactions.groupby("CustomerID", observed=True)["Amount"]
    .agg(total_spent="sum", avg_spent="mean", num_transactions="count",
         max_transaction="max", min_transaction="min")
    .reset_index()
)

# Social: compute engagement as sum of numeric columns (excluding CustomerID)
social_numeric = social.drop(columns=[c for c in ["CustomerID"] if c in social.columns]).select_dtypes(include="number")
social["engagement_score"] = social_numeric.sum(axis=1)

agg_dict = {col: "sum" for col in social_numeric.columns}
agg_dict["engagement_score"] = "sum"

social_features = social.groupby("CustomerID", observed=True).agg(agg_dict).reset_index()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social["engagement_score"] = social_numeric.sum(axis=1)


In [6]:
# --- Merge ---
# Normalize key name
for df in (demographics, transaction_features, social_features):
    df.rename(columns={"CustomerID": "customer_id"}, inplace=True)

merged = demographics.merge(transaction_features, on="customer_id", how="inner") \
                     .merge(social_features, on="customer_id", how="inner")

# Lowercase columns
merged.columns = merged.columns.str.lower()

print("Merged dataset shape:", merged.shape)
print("Columns:", merged.columns.tolist())
print("Unique customers:", merged["customer_id"].nunique())


Merged dataset shape: (1192, 12)
Columns: ['customer_id', 'age', 'gender', 'location', 'incomelevel', 'signupdate', 'total_spent', 'avg_spent', 'num_transactions', 'max_transaction', 'min_transaction', 'engagement_score']
Unique customers: 1181


In [7]:
# --- Save ---
out_path = Path("merged_cleaned_dataset.csv")
merged.to_csv(out_path, index=False)
print(f"Saved: {out_path.resolve()}")


Saved: /Users/arenriquez1/NetBeansProjects/MO-IT162-S3103-Group-2/Milestone 2: Data Visualization on Machine Learning Solution Project/merged_cleaned_dataset.csv
