<a href="https://colab.research.google.com/github/adityashah841/Feature-engineering-and-parallelization-for-polynomial-regression-models-using-CART/blob/main/modified_decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [51]:
import psutil
import multiprocessing as mp
import time

def print_cpu_utilization(func, *args):
    
    start = time.time()
    
    # Start a separate process to run the given function
    p = mp.Process(target=func, args=args)
    p.start()

    # Keep printing CPU utilization while the function is running
    while p.is_alive():
        cpu_percent = psutil.cpu_percent(percpu=True)
        print("CPU utilization:", cpu_percent)
        time.sleep(1)

    # Wait for the function to finish and join the process
    p.join()
    
    print(f"Time taken by process: {time.time() - start}")


In [None]:
from sklearn.tree import DecisionTreeRegressor, _tree
def filter_dataset_by_tree(model, X_train, y_train):
    """
    Filters the training set X_train into multiple subsets by going through the decision tree model and
    filtering the dataset, and returns a list of these subsets created at each leaf node. It also concatenates
    the corresponding y_train values with each subset.

    Args:
    - model: A trained scikit-learn decision tree regressor model.
    - X_train: The training set as a numpy array or pandas DataFrame.
    - y_train: The target variable as a numpy array or pandas Series.

    Returns:
    - A list of subsets, where each subset contains concatenated subset of the training set (as a numpy array
      or pandas DataFrame) and the corresponding target variable values (as a numpy array or pandas Series).
      Each subset corresponds to a leaf node in the decision tree.
    """
    # Get the leaf node indices for each data point in the training set
    leaf_node_indices = model.apply(X_train)

    # Get the unique leaf node indices
    unique_leaf_node_indices = np.unique(leaf_node_indices)

    # Create a list to store the subsets of the training set and their corresponding target variable values
    subsets = []

    # For each leaf node, filter the training set and target variable and append the resulting subset and target variable values to the list
    for node_idx in unique_leaf_node_indices:
        mask = leaf_node_indices == node_idx
        subset_X = X_train[mask]
        subset_y = y_train[mask]
        subsets.append(np.concatenate((subset_X, subset_y),axis=1))

    return subsets


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [None]:
import multiprocessing

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores

2

In [None]:
from sklearn.datasets import fetch_covtype

# Load the dataset
data = fetch_covtype(as_frame=True)

# Separate the data into features and target
X = data.data
y = data.target

In [None]:
X,y

(        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
 0          2596.0    51.0    3.0                             258.0   
 1          2590.0    56.0    2.0                             212.0   
 2          2804.0   139.0    9.0                             268.0   
 3          2785.0   155.0   18.0                             242.0   
 4          2595.0    45.0    2.0                             153.0   
 ...           ...     ...    ...                               ...   
 581007     2396.0   153.0   20.0                              85.0   
 581008     2391.0   152.0   19.0                              67.0   
 581009     2386.0   159.0   17.0                              60.0   
 581010     2384.0   170.0   15.0                              60.0   
 581011     2383.0   165.0   13.0                              60.0   
 
         Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
 0                                  0.0                            510

In [25]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=3)
regressor.fit(X,y)

In [27]:
subsets = filter_dataset_by_tree(regressor,X,y.to_numpy().reshape(-1,1))
for s in subsets:
  print(len(s))

39726
12886
4485
412
153065
344797
12740
12901


In [34]:
Xy = subsets[0]
Xy.shape
y = Xy[:,-1]
X = Xy[:,:-1]

X.shape, y.shape

((39726, 54), (39726,))

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7,random_state=41)


In [36]:
len(X_train)

27808

In [None]:
# Define the training function for each model
def train_model(X, y, model):
    model.fit(X, y)

In [None]:
from sklearn.metrics import r2_score
def get_r2(model,X,y):
  pred = model.predict(X)
  print(f"Model {model} accuracy:", r2_score(y, pred))

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [55]:
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_X_train = poly.fit_transform(X_train)
poly_X_test = poly.fit_transform(X_test)

In [53]:
model = LinearRegression()
model.fit(poly_X_train,y_train)
# print_cpu_utilization(train_model,poly_X_train,y_train,model)
print('R2:', get_r2(model,poly_X_test,y_test))

Model LinearRegression() accuracy: 0.28139085715726275
R2: None


In [54]:
from sklearn.linear_model import SGDRegressor

model = SGDRegressor(early_stopping = False,
                     validation_fraction = 0.3)

model.fit(poly_X_train,y_train)
# print_cpu_utilization(train_model,poly_X_train,y_train,model)
print('R2:', get_r2(model,poly_X_test,y_test))

Model SGDRegressor(validation_fraction=0.3) accuracy: -6.715379812218542e+41
R2: None


In [42]:
model = SGDRegressor(early_stopping = True,
                     validation_fraction = 0.3)

model.fit(poly_X_train,y_train)
#print_cpu_utilization(train_model,poly_X_train,y_train,model)
print('R2:', get_r2(model,poly_X_test,y_test))

Model SGDRegressor(early_stopping=True, validation_fraction=0.3) accuracy: -1.4235912304056333e+30
R2: None
