# Regression Decision Tree

To implement a Decision Tree with continuous variables, you will use the LibSVM data. Such a dataset contains many missing values, but Decision Trees can perform well with missing data too. Hence, you will implement a Decision Tree on this dataset.

In [1]:
# Installing necessary dependent packages
!pip install -q ipython-autotime
!pip install -q pyspark
!pip install -q tqdm

# Loading autotime for the notebook
%load_ext autotime

[K     |████████████████████████████████| 281.3 MB 42 kB/s 
[K     |████████████████████████████████| 199 kB 37.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
time: 2.12 ms (started: 2022-07-25 13:04:20 +00:00)


In [3]:
def download_file_google_drive(id: str, f_name: str, f_save_path: str=""):
    """
    Download a file from Google Drive and write it to disk.

    The file is requested once; if the response carries a
    ``download_warning`` cookie, its value is sent back as the ``confirm``
    parameter in a second request, and that second response is the one saved.

    Args:
        id (str): Google Drive File ID
        f_name (str): File name to save the download as
        f_save_path (str): Directory to save the file into.
            default: Current working directory

    Raises:
        requests.HTTPError: If either request is answered with a 4xx/5xx
            status, so that an error page is never written to disk as if it
            were the requested file.
    """
    import os, requests

    URL = "https://docs.google.com/uc?export=download"
    CHUNK_SIZE = 32768

    def get_confirm_token(response):
        # The confirmation token is the value of the cookie whose name starts
        # with 'download_warning'; no such cookie means no token.
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        # Stream the body to disk chunk by chunk instead of loading it whole.
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

    f_save_path = os.getcwd() if not f_save_path else f_save_path.rstrip("/")
    destination = f"{f_save_path}/{f_name}"

    # The context manager closes the session (and its pooled connections)
    # even when a request or the file write fails.
    with requests.Session() as session:
        response = session.get(URL, params = { 'id' : id }, stream = True)
        response.raise_for_status()
        token = get_confirm_token(response)

        if token:
            # Release the unread first response before issuing the confirmed request.
            response.close()
            params = { 'id' : id, 'confirm' : token }
            response = session.get(URL, params = params, stream = True)
            response.raise_for_status()
        save_response_content(response, destination)

time: 20.1 ms (started: 2022-07-25 13:04:20 +00:00)


In [4]:
# Maps each Google Drive file ID to the local file name it is saved under
data_files = {"1JpQa3QOTe_GObKjg_mFpgi0BgXhfzc9N": "libsvm_data.txt"}

# Fetch every listed file into the current working directory
for file_id in data_files:
    file_name = data_files[file_id]
    download_file_google_drive(id=file_id, f_name=file_name)

time: 706 ms (started: 2022-07-25 13:05:12 +00:00)


In [5]:
# Load the libraries required for loading libSVM files
# SparkSession
from pyspark.sql import SparkSession
from pyspark.mllib.util import MLUtils

time: 1.47 ms (started: 2022-07-25 13:05:15 +00:00)


In [6]:
# Build (or reuse) a Spark session running with the "local" master;
# the Spark UI port is configured as 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("LibSVM")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# Restrict Spark's log output to errors
spark.sparkContext.setLogLevel("ERROR")

time: 6.5 s (started: 2022-07-25 13:05:17 +00:00)


In [8]:
# Load the LibSVM file. The download step saves it into the current working
# directory, so resolve it from there instead of hard-coding a Colab-only
# absolute path (on Colab this is still /content/libsvm_data.txt).
import os

data = MLUtils.loadLibSVMFile(spark.sparkContext,
                              os.path.join(os.getcwd(), "libsvm_data.txt"))

time: 3.32 s (started: 2022-07-25 14:18:18 +00:00)


In [9]:
# Peek at the data: take(1) returns a one-element list holding the first
# LabeledPoint (a label plus a sparse feature vector)
data.take(1)

[LabeledPoint(0.0, (692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0

time: 766 ms (started: 2022-07-25 14:20:22 +00:00)


In [10]:
# Count the records that were loaded from the file
data.count()

100

time: 476 ms (started: 2022-07-25 14:20:27 +00:00)


## Split data into train and test sets

In [12]:
# Use randomSplit to split the data roughly 70/30 into training and test sets.
# A fixed seed makes the split, and every number computed from it below,
# reproducible across runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

time: 1.45 ms (started: 2022-07-25 14:22:35 +00:00)


In [13]:
# Count the records that landed in the training split
trainingData.count()

69

time: 718 ms (started: 2022-07-25 14:22:36 +00:00)


In [14]:
# Count the records that landed in the test split
testData.count()

31

time: 135 ms (started: 2022-07-25 14:22:46 +00:00)


## Train Decision Tree Regression Model

In [15]:
# Import the libraries required for Decision Tree
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel


time: 5.75 ms (started: 2022-07-25 14:22:57 +00:00)


In [36]:
# Fit a regression tree on the training split.
#   categoricalFeaturesInfo={} -> no feature is declared categorical
#   impurity="variance"        -> impurity measure used to score splits
#   maxDepth=2                 -> upper bound on the depth of the tree
model = DecisionTree.trainRegressor(
    trainingData,
    maxDepth=2,
    impurity="variance",
    categoricalFeaturesInfo={},
)

time: 1.32 s (started: 2022-07-25 15:45:08 +00:00)


In [37]:
# Display the trained model's summary (its depth and node count)
model

DecisionTreeModel regressor of depth 1 with 3 nodes

time: 15.1 ms (started: 2022-07-25 15:45:09 +00:00)


## Prediction for Test data

In [38]:
# Score the test split: only the feature vectors (not the labels) are
# passed to the model
predictions = model.predict(
    testData.map(lambda point: point.features)
)

time: 47 ms (started: 2022-07-25 15:45:09 +00:00)


In [39]:
# Show the first 10 predicted values
predictions.take(10)

[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]

time: 600 ms (started: 2022-07-25 15:45:09 +00:00)


## Actual labels and predictions together

In [40]:
# Pair each test record's actual label with the model's prediction for it,
# giving (label, prediction) tuples
labelAndPreds = (
    testData
    .map(lambda point: point.label)
    .zip(predictions)
)

time: 19.8 ms (started: 2022-07-25 15:45:10 +00:00)


In [41]:
# Show the first 10 (actual label, predicted value) pairs
labelAndPreds.take(10)

[(0.0, 0.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0)]

time: 131 ms (started: 2022-07-25 15:45:10 +00:00)


## Calculate MSE

In [42]:
# Calculate and view the mean squared error on the test split.
# RDD.mean() averages the squared errors in a single Spark job, rather than
# running a sum() job followed by a separate testData.count() job.
mse_value = labelAndPreds.map(lambda lp: (lp[0] - lp[1]) ** 2).mean()
mse_value

0.03225806451612903

time: 672 ms (started: 2022-07-25 15:45:10 +00:00)


## Inspect the model with toDebugString

In [43]:
# Print the learned tree as text: one If/Else rule per split, with the
# predicted value at each leaf
print(model.toDebugString())

DecisionTreeModel regressor of depth 1 with 3 nodes
  If (feature 434 <= 70.5)
   Predict: 0.0
  Else (feature 434 > 70.5)
   Predict: 1.0

time: 2.23 ms (started: 2022-07-25 15:45:12 +00:00)
