<a href="https://colab.research.google.com/github/adhithyyaa/23BCS050_DATA_PROCESSING_CHALLENGE/blob/main/23BCS050_Incremental_Data_Processing_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q pandas scikit-learn joblib


In [None]:
import pandas as pd
import random

# Base dataset: 100 rows
base_data = pd.DataFrame({
    "customer_id": range(1, 101),
    "transaction_amount": [round(random.uniform(10, 500), 2) for _ in range(100)],
    "age": [random.randint(18, 70) for _ in range(100)],
    "loyalty_score": [random.randint(0, 100) for _ in range(100)]
})

base_path = "/content/base_dataset.csv"
base_data.to_csv(base_path, index=False)
print("Base dataset saved:", base_path)
base_data.head()


Base dataset saved: /content/base_dataset.csv


Unnamed: 0,customer_id,transaction_amount,age,loyalty_score
0,1,474.28,61,17
1,2,342.56,65,86
2,3,456.23,46,1
3,4,201.97,26,59
4,5,438.25,67,30


In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from joblib import dump, load

X = base_data[["age", "loyalty_score"]]
y = base_data["transaction_amount"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Incremental ML model (supports partial_fit)
model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
model.partial_fit(X_train, y_train)

print("Initial model trained. Test score:", model.score(X_test, y_test))
dump(model, "/content/incremental_model.joblib")
print("Model saved: /content/incremental_model.joblib")


Initial model trained. Test score: -1.2195953462522405e+21
Model saved: /content/incremental_model.joblib


In [None]:
import os

cdc_dir = "/content/cdc_events"
os.makedirs(cdc_dir, exist_ok=True)

# Example: 3 new events (insert/update)
events = [
    {"customer_id": 101, "transaction_amount": 200.0, "age": 30, "loyalty_score": 50},  # new row
    {"customer_id": 5, "transaction_amount": 350.0, "age": 25, "loyalty_score": 60},   # update row 5
    {"customer_id": 20, "transaction_amount": 0.0, "age": 40, "loyalty_score": 10},    # mark delete
]

for i, event in enumerate(events):
    pd.DataFrame([event]).to_csv(f"{cdc_dir}/cdc_{i+1}.csv", index=False)
    print(f"CDC event written: {cdc_dir}/cdc_{i+1}.csv")


CDC event written: /content/cdc_events/cdc_1.csv
CDC event written: /content/cdc_events/cdc_2.csv
CDC event written: /content/cdc_events/cdc_3.csv


In [None]:
import glob

# Load base dataset
data = pd.read_csv(base_path)
model = load("/content/incremental_model.joblib")

# Process CDC events
cdc_files = sorted(glob.glob(f"{cdc_dir}/*.csv"))
for file in cdc_files:
    cdc = pd.read_csv(file)
    for _, row in cdc.iterrows():
        # Delete if transaction_amount == 0
        if row["transaction_amount"] == 0:
            data = data[data.customer_id != row["customer_id"]]
            print(f"Deleted customer_id {row['customer_id']}")
        # Update if customer exists
        elif row["customer_id"] in data["customer_id"].values:
            data.loc[data.customer_id == row["customer_id"], ["transaction_amount","age","loyalty_score"]] = \
                row[["transaction_amount","age","loyalty_score"]].values
            print(f"Updated customer_id {row['customer_id']}")
        # Insert new row
        else:
            data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)
            print(f"Inserted new customer_id {row['customer_id']}")

    # Incrementally update the model
    X_cdc = cdc[["age","loyalty_score"]]
    y_cdc = cdc["transaction_amount"]
    model.partial_fit(X_cdc, y_cdc)
    print(f"Updated model with CDC file: {file}")

# Save updated dataset and model
data.to_csv("/content/updated_dataset.csv", index=False)
dump(model, "/content/incremental_model_updated.joblib")
print("✅ Incremental update complete. Dataset and model saved.")


Inserted new customer_id 101.0
Updated model with CDC file: /content/cdc_events/cdc_1.csv
Updated customer_id 5.0
Updated model with CDC file: /content/cdc_events/cdc_2.csv
Deleted customer_id 20.0
Updated model with CDC file: /content/cdc_events/cdc_3.csv
✅ Incremental update complete. Dataset and model saved.


In [None]:
data = pd.read_csv("/content/updated_dataset.csv")
print("Updated dataset (last 10 rows):")
print(data.tail(10))

from joblib import load
model = load("/content/incremental_model_updated.joblib")
print("Updated model coefficients:", model.coef_)


Updated dataset (last 10 rows):
    customer_id  transaction_amount   age  loyalty_score
90         92.0               76.48  28.0           72.0
91         93.0              132.96  63.0            2.0
92         94.0               38.34  58.0           48.0
93         95.0              113.10  35.0           43.0
94         96.0               36.89  25.0           28.0
95         97.0              464.49  30.0           16.0
96         98.0               51.98  31.0           76.0
97         99.0               88.60  19.0           98.0
98        100.0              193.91  55.0           93.0
99        101.0              200.00  30.0           50.0
Updated model coefficients: [-1.29469957e+11 -6.78314592e+10]
