<a href="https://colab.research.google.com/github/amien1410/colab-notebooks/blob/main/Colab_Pyspark_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Download the dataset
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
!unzip -q "/content/bank+marketing.zip"
!unzip -q "/content/bank.zip"

--2025-06-26 14:57:05--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [ <=>                ] 999.85K  6.00MB/s    in 0.2s    

2025-06-26 14:57:05 (6.00 MB/s) - ‘bank+marketing.zip’ saved [1023843]



In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
import pandas as pd

# 1. Start Spark session and read the data
# getOrCreate() reuses an already-running session when there is one.
spark = SparkSession.builder.getOrCreate()

# Reader settings: first line is the header, ';' is the field separator,
# and column types are inferred from the data.
filename = "/content/bank-full.csv"
csv_options = {'header': True, 'inferSchema': True, 'sep': ';'}
data = spark.read.csv(filename, **csv_options)
data.show()

# 2. Define a function to assemble feature vectors
def assemble_vectors(df, features_list, target_variable_name):
    """Add a 'features' vector column built from the given input columns.

    Args:
        df: DataFrame holding the feature columns and the target column.
        features_list: list of column names to pack into the vector.
        target_variable_name: name of the target column to carry through.

    Returns:
        The transformed DataFrame restricted to the target column, the
        'features' column, and each individual feature column, in that order.
    """
    vector_stage = VectorAssembler(inputCols=features_list, outputCol='features')

    # One-stage pipeline: fit on df, then apply the fitted model to the same df.
    fitted = Pipeline(stages=[vector_stage]).fit(df)
    assembled = fitted.transform(df)

    # Keep the target, the assembled vector, and the raw feature columns.
    keep = [target_variable_name, 'features']
    keep = keep + features_list
    return assembled.select(keep)

# 3. Select numeric columns for regression
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])
target_variable_name = 'balance'
# Every selected column except the target is used as a predictor.
features_list = [col for col in linear_df.columns if col != target_variable_name]

# 4. Assemble the feature vectors
df = assemble_vectors(linear_df, features_list, target_variable_name)

# 5. Fit a linear regression model
# The label column comes from target_variable_name so that changing the target
# above cannot leave the model training on a stale, hard-coded column.
reg = LinearRegression(featuresCol='features', labelCol=target_variable_name)
reg_model = reg.fit(df)

# 6. View coefficients and intercept
# The 'features' column metadata lists the vector slots grouped by attribute
# type; each entry carries its slot position in 'idx'. Look coefficients up by
# that position so every row gets the coefficient of its own slot, even when
# there are several groups or the entries are not in slot order.
coefficients = reg_model.coefficients.toArray()
for attr_type, attrs in df.schema['features'].metadata['ml_attr']['attrs'].items():
    features_df = pd.DataFrame(attrs)
    features_df['coefficients'] = coefficients[features_df['idx'].to_numpy()]
    print(features_df)

print(f"Intercept: {reg_model.intercept}")

# 7. Predict results on the same dataset
# These are in-sample predictions: the model is applied to the data it was fit on.
pred_result = reg_model.transform(df)
pred_result.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may