In [None]:
import warnings
# Silence every warning category for the rest of the session; note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# Dataset source page: https://github.com/Engg-Abhinav/Feature-Engineering/blob/main/blackFriday_train.csv
import os
import urllib
def fetch_dataset(link="https://raw.githubusercontent.com/Engg-Abhinav/Feature-Engineering/refs/heads/main/blackFriday_train.csv", name="blackfriday_train.csv"):
    """Download ``link`` into ``data/<name>`` unless that file already exists.

    Parameters
    ----------
    link : str
        URL of the file to download.
    name : str
        File name to use inside the ``data`` directory (created if missing,
        relative to the current working directory).

    Returns
    -------
    None

    Raises
    ------
    Any exception raised by ``urllib.request.urlretrieve`` (for example
    ``urllib.error.URLError``) propagates to the caller.

    Notes
    -----
    The payload is written to ``<name>.part`` and renamed into place only
    after the download finishes, so an interrupted download cannot leave a
    truncated file that a later run would mistake for the complete dataset.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so import
    # it explicitly instead of relying on another library having loaded it.
    import urllib.request

    target_dir = os.path.join("data")
    os.makedirs(target_dir, exist_ok=True)
    target_file = os.path.join(target_dir, name)

    if os.path.exists(target_file):
        print(f"{name} already exists in {target_dir}, skipping download.")
        return

    print(f"Downloading {name} from {link}...")
    partial_file = target_file + ".part"
    try:
        urllib.request.urlretrieve(link, partial_file)
        # os.replace is atomic on the same filesystem: the final name appears
        # only once the whole file has been written.
        os.replace(partial_file, target_file)
    finally:
        # After a successful rename the .part file no longer exists; after a
        # failure this removes whatever was partially written.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    print(f"File saved to {target_file}")
# Download the Black Friday training CSV to data/blackfriday_train.csv using the
# default link and file name (skipped when the file is already present).
fetch_dataset()

# Experiment 3 — Dask Implementation
## Section 1: Basic Dask Array Creation and Chunking

In [None]:
import dask.array as da

# Lazy 1-D array holding the integers 0..100 (101 elements), split into blocks of 5 elements.
X = da.arange(101, chunks=5)
# compute() evaluates the task graph and returns the values as a NumPy array.
print(X.compute())

# Per-axis tuple of block sizes; 101 is not a multiple of 5, so the final block holds a single element.
print(X.chunks)

## Section 2: Convert NumPy array to Dask Array

In [None]:
import numpy as np

# In-memory NumPy array holding 0..20 (21 elements).
X2 = np.arange(21)
# Wrap it as a Dask array with blocks of 5 elements (the last block holds the remaining 1).
y = da.from_array(X2, chunks=5)

result = y.compute() # result is stored in ndarray format
print(result)

## Section 3: Calculate Mean of Large Array Using Dask

In [None]:
# np.arange is called with a float stop value (1e5), so X3 is a float64 array of 100,000 values.
X3 = np.arange(1e5)
# chunks=100 splits the array into 1,000 blocks of 100 elements each.
y2 = da.from_array(X3, chunks=100)

# mean() only builds the task graph; the reduction over all blocks runs when compute() is called.
print(y2.mean().compute())

## Section 4: Reading CSV with Pandas vs Dask (Time comparison)

In [None]:
import pandas as pd
# Baseline: time how long pandas takes to read and parse the whole CSV into `temp`.
%time temp = pd.read_csv("./data/blackfriday_train.csv")

In [None]:
import dask.dataframe as dd
# NOTE: dd.read_csv is lazy — it inspects a sample of the file to infer dtypes and builds a
# task graph, but does not parse the full file until a compute()/persist() is requested.
# The time measured here is therefore not a like-for-like comparison with pd.read_csv above.
%time df = dd.read_csv("./data/blackfriday_train.csv")

## Section 5: Dask Dataframe Basic Operations

In [None]:
# Number of rows per Gender value; compute() executes the lazy graph and returns the result.
df.Gender.value_counts().compute() # type: ignore

In [None]:
# Largest Purchase value within each Gender group.
df.groupby(df.Gender).Purchase.max().compute() # type: ignore

## Section 6: Set Up a Dask Parallel Backend for Scikit-Learn

In [None]:
from dask.distributed import Client
# Called without arguments, Client() starts a local cluster and connects to it.
# The following cells use this `client` for the dashboard link and as the joblib backend.
client = Client()

In [None]:
from joblib import parallel_backend
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Reuses the `client` created in the previous cell rather than starting another one.
print("Dashboard:", client.dashboard_link)

# Synthetic binary-classification data: 1,000 samples, 20 features (10 informative, 5 redundant).
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=5, random_state=42)
# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inside this context joblib dispatches its parallel work to the Dask cluster, so the
# forest's n_jobs=-1 tree fitting is executed by the Dask workers.
with parallel_backend('dask'):
    model = RandomForestClassifier(verbose=2, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)

print("Model trained successfully!")
# score() reports mean accuracy on the held-out split.
print("Accuracy: ", model.score(X_test, y_test))

## Section 7: Dask ML Native Algorithms Example

In [None]:
from dask_ml.linear_model import LogisticRegression
from dask_ml.preprocessing import OneHotEncoder
from dask_ml.cluster import KMeans

# Reuses the `client` created in Section 6 rather than starting another one.
print("Dashboard:", client.dashboard_link)

# Draw all synthetic data from one seeded generator so the cell produces the same
# arrays (and therefore the same fitted models) on every run, matching the
# random_state=42 convention used by the estimators in this notebook.
rng = da.random.RandomState(42)

# 10,000 samples x 20 features, split row-wise into 10 blocks of 1,000 rows.
X = rng.random_sample((10000, 20), chunks=(1000, 20))
# Binary labels (0 or 1), chunked to line up with the rows of X.
y = rng.randint(0, 2, size=(10000,), chunks=(1000,))

print("\n--- Logistic Regression ---")
log_reg = LogisticRegression()
log_reg.fit(X, y)
print("Logistic Regression coefficients shape:", log_reg.coef_.shape)

print("\n--- One Hot Encoder ---")
# Three categorical columns, each taking integer values 0..4.
cat_data = rng.randint(0, 5, size=(10000, 3), chunks=(1000, 3))
encoder = OneHotEncoder(sparse_output=True)
encoder_result = encoder.fit_transform(cat_data)
print("Encoded shape:", encoder_result.shape)

print("\n--- KMeans ---")
kmeans = KMeans(n_clusters=5, init_max_iter=5, random_state=42)
kmeans.fit(X)
print("KMeans cluster centers shape:", kmeans.cluster_centers_.shape)

## Section 8: Full Example: Dask DataFrame to ML Pipeline on Black Friday Dataset

In [None]:
!pip install dask-ml

In [None]:
# Obtain a Dask client for this section and display the dashboard link for real-time monitoring.
from dask.distributed import Client

# Reuse the client that is already running (e.g. the one started in Section 6) instead of
# launching a second local cluster that would compete for the same CPU and memory.
# Client.current() raises ValueError when no client exists, in which case a new one is started
# so this section can also be run on its own.
try:
    client2 = Client.current()
except ValueError:
    client2 = Client()
print("Dashboard link for section 8:", client2.dashboard_link)

In [None]:
# Loads the Black Friday dataset as a lazy Dask DataFrame, which is read partition by
# partition rather than all at once.
from dask import dataframe as dd

df = dd.read_csv("./data/blackfriday_train.csv")
print(df.columns)
# head() reads only the first rows, so it is cheap compared with a full compute().
print(df.head())
# Summary statistics require a pass over the full dataset.
df.describe().compute()

In [None]:
# Counts the missing values in every column (the filling itself happens in the next cell).
missing_values = df.isnull().sum().compute()
print("Missing values in each column:\n", missing_values)

In [None]:
# Filling missing values in features Product_Category_2, Product_Category_3 with the
# column mean. Both assignments are lazy: nothing is computed until a later
# compute()/persist().
# NOTE(review): these columns look like category identifiers, so a mean fill yields
# non-integer values — confirm that this is the intended imputation.
df['Product_Category_2'] = df['Product_Category_2'].fillna(df['Product_Category_2'].mean())
df['Product_Category_3'] = df['Product_Category_3'].fillna(df['Product_Category_3'].mean())

In [None]:
# Inspect the distinct values (with their row counts) of the columns that are encoded or
# mapped in the next cell. Each value_counts() is computed and printed on its own line block.
print(
    *(
        df[column].value_counts().compute()
        for column in ("City_Category", "Stay_In_Current_City_Years", "Age")
    ),
    sep="\n",
)

In [None]:
# Convert these columns to category dtype (categorize scans the data to learn the
# categories), then keep the result in cluster memory.
df = df.categorize(columns=['City_Category', 'Stay_In_Current_City_Years']).persist()

# One-hot encode the two categorical columns (one indicator column per category).
from dask_ml.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=True)
# NOTE: the same `encoder` object is fitted twice, so after this cell it only holds the
# fit for Stay_In_Current_City_Years.
encoded_city = encoder.fit_transform(df[['City_Category']])
encoded_stay = encoder.fit_transform(df[['Stay_In_Current_City_Years']])
# Replace each age-bracket label with one representative number: the bracket midpoint
# for closed ranges, and 60 for the open-ended '55+' bracket.
age_map = {
    '0-17': 8.5,
    '18-25': 21.5,
    '26-35': 30.5,
    '36-45': 40.5,
    '46-50': 48,
    '51-55': 53,
    '55+': 60
}
# NOTE: this overwrites the Age column, so re-running this cell on its own would look up
# the already-numeric values, which are not keys of age_map.
df['Age'] = df['Age'].map(age_map)

In [None]:
# Persists the dataframe (now including the numeric Age column) in cluster memory so the
# following cells do not recompute the steps above on every access.
df = df.persist()

In [None]:
# Splits data into train and test sets using Dask ML’s train_test_split.
# This import rebinds the name `train_test_split`, replacing the scikit-learn function
# imported in Section 6.
from dask_ml.model_selection import train_test_split

# Features: numerical + encoded categoricals, joined column-wise.
# NOTE(review): User_ID is included as a numeric feature — confirm that is intended.
X = dd.concat([df[['User_ID', 'Age', 'Occupation', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']],
			   encoded_city, encoded_stay], axis=1)  # Features
y = df['Purchase']  # Target variable

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Keep all four splits in cluster memory before they are used repeatedly.
X_train, X_test, y_train, y_test = (
    split.persist() for split in (X_train, X_test, y_train, y_test)
)
# Converts dataframes to Dask arrays for compatibility with Dask ML models;
# lengths=True computes the partition lengths so the arrays have known chunk sizes.
X_train, X_test, y_train, y_test = (
    split.to_dask_array(lengths=True)
    for split in (X_train, X_test, y_train, y_test)
)

In [None]:
# Trains a Linear Regression model using Dask ML on the Dask arrays built in the previous cell.
from dask_ml.linear_model import LinearRegression

# Rebinds `model`, replacing the RandomForestClassifier from Section 6.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Makes predictions and computes R² score using sklearn metrics on computed numpy arrays.
from sklearn.metrics import r2_score

y_pred = model.predict(X_test).compute()
score = r2_score(y_test.compute(), y_pred)
print("R² score:", score)