In [143]:
# pip install "dask[complete]"

In [144]:
# pip install dask-ml

Starting Dask Client

In [145]:
from dask.distributed import Client 
client = Client() # local Dask Client
print("Dashboard: ", client.dashboard_link)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 51424 instead


Dashboard:  http://127.0.0.1:51424/status


Loading Black Friday Dataset w Dask Dataframe

In [146]:
import dask.dataframe as dd

In [147]:
df = dd.read_csv(r"C:\Users\hp\OneDrive\Desktop\SEM7PRACS\BDA\exp2\blackfriday_train.csv")

In [148]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


Checking for missing values

In [149]:
missing_counts = df.isnull().sum()
print(missing_counts.compute())

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64


Filling up missing values

In [150]:
df['Product_Category_2'] = df['Product_Category_2'].fillna(0)
df['Product_Category_3'] = df['Product_Category_3'].fillna(0)

Converting categorical columns into categorical type

In [151]:
categorical_cols = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
numeric_cols = [col for col in df.columns if col not in categorical_cols + ['Purchase', 'User_ID', 'Product_ID']]

In [152]:
for col in categorical_cols:
    df[col] = df[col].astype('category')

Persisting keeps the transformed data in distributed memory for faster later access

In [153]:
df = df.persist()

Train-test (80/20)

In [154]:
from dask_ml.model_selection import train_test_split # works lazily on large datasets

In [155]:
X = df.drop(['Purchase', 'User_ID', 'Product_ID'], axis=1)
y = df['Purchase']

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [157]:
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 440339
Test set size: 109729


In [158]:
import dask.array as da

In [159]:
# X_train_codes = X_train[categorical_cols].cat.codes.to_dask_array(lengths=True)
# X_test_codes = X_test[categorical_cols].cat.codes.to_dask_array(lengths=True)

Code them into integers (One Hot Encoding)

In [160]:
from dask_ml.preprocessing import OneHotEncoder

In [162]:
encoder = OneHotEncoder(sparse_output=False) 
X_train_cat = encoder.fit_transform(X_train[categorical_cols].categorize())
X_test_cat = encoder.transform(X_test[categorical_cols].categorize())
# print(f"Encoded categorical features shape: {X_train_cat.shape}")

Combining encoded categorical + numeric features

In [163]:
X_train_num = X_train[numeric_cols].to_dask_array(lengths=True)
X_test_num = X_test[numeric_cols].to_dask_array(lengths=True)
print(f"Numeric features shape: {X_train_num.shape}")

Numeric features shape: (440339, 5)


In [164]:
X_train_da = da.hstack([X_train_num, X_train_cat])
X_test_da = da.hstack([X_test_num, X_test_cat])

ValueError: ('Shapes do not align: %s', [(440339, 5), (nan, 17)])

In [165]:
y_train_da = y_train.to_dask_array(lengths=True)
y_test_da = y_test.to_dask_array(lengths=True)

print(f"Final training features shape: {X_train_da.shape}")
print(f"Final test features shape: {X_test_da.shape}")

NameError: name 'X_train_da' is not defined

Converting dataframes to Dask arrays for compatibility with Dask ML models

In [None]:
# from dask_ml import preprocessing
# import dask.array as da

In [None]:
# X_train_da = X_train.to_dask_array(lengths=True)
# X_test_da = X_test.to_dask_array(lengths=True)
# y_train_da = y_train.to_dask_array(lengths=True)
# y_test_da = y_test.to_dask_array(lengths=True)

Train Linear Regression model

In [None]:
from dask_ml.linear_model import LinearRegression

In [None]:
model = LinearRegression()
model.fit(X_train_da, y_train_da)

Predictions

In [None]:
y_pred_da = model.predict(X_test_da)

R2 score

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Converting Dask arrays to NumPy for sklearn
y_pred_np = y_pred_da.compute()
y_test_np = y_test_da.compute()
r2 = r2_score(y_test_np, y_pred_np)

In [None]:
print("R² score:", r2)

In [None]:
print(f"\nDask Dashboard was available at: {client.dashboard_link}")
client.close() # to prevent the port being open forever
print("Dask client closed successfully!")

In [None]:
# hwhe