<a href="https://colab.research.google.com/github/amien1410/colab-notebooks/blob/main/Colab_Pyspark_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Download the dataset
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
!unzip -q "/content/bank+marketing.zip"
!unzip -q "/content/bank.zip"

--2025-06-26 14:57:05--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [ <=>                ] 999.85K  6.00MB/s    in 0.2s    

2025-06-26 14:57:05 (6.00 MB/s) - ‘bank+marketing.zip’ saved [1023843]



In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
import pandas as pd

# 1. Get (or create) the Spark session, then load the semicolon-separated CSV
spark = SparkSession.builder.getOrCreate()
filename = "/content/bank-full.csv"
data = spark.read.csv(
    filename,
    sep=';',
    header=True,
    inferSchema=True,
)
data.show()

# 2. Define a function to assemble feature vectors
def assemble_vectors(df, features_list, target_variable_name):
    """Append a 'features' vector column built from ``features_list``.

    Args:
        df: DataFrame holding the feature columns and the target column.
        features_list: list of column names to pack into the vector.
        target_variable_name: name of the target column to keep in the result.

    Returns:
        A DataFrame with the target column, the 'features' column and the
        individual feature columns, in that order.
    """
    # Single-stage pipeline: the assembler is the only transformation applied.
    vector_stage = VectorAssembler(inputCols=features_list, outputCol='features')
    fitted_pipeline = Pipeline(stages=[vector_stage]).fit(df)
    assembled = fitted_pipeline.transform(df)

    # Keep the label, the assembled vector and the raw feature columns.
    columns_to_keep = [target_variable_name, 'features'] + features_list
    return assembled.select(columns_to_keep)

# 3. Select numeric columns for regression
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])
target_variable_name = 'balance'
features_list = linear_df.columns
features_list.remove(target_variable_name)  # Exclude the target

# 4. Assemble the feature vectors
df = assemble_vectors(linear_df, features_list, target_variable_name)

# 5. Fit a linear regression model
# Use the target variable defined above instead of repeating the literal name.
reg = LinearRegression(featuresCol='features', labelCol=target_variable_name)
reg_model = reg.fit(df)

# 6. View coefficients and intercept
# Each attribute record in the 'features' metadata carries its position in the
# vector as 'idx'. Look every coefficient up by that index; slicing the first
# len(group) coefficients would mislabel any attribute group after the first.
coefficient_values = reg_model.coefficients.toArray()
for k, v in df.schema['features'].metadata['ml_attr']['attrs'].items():
    features_df = pd.DataFrame(v)
    features_df['coefficients'] = coefficient_values[features_df['idx'].to_numpy()]
    print(features_df)

print(f"Intercept: {reg_model.intercept}")

# 7. Predict results on the same dataset
pred_result = reg_model.transform(df)
pred_result.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

# 1. Initialize Spark
spark = SparkSession.builder.appName("BankLinearRegression").getOrCreate()

# 2. Load the data (semicolon-separated CSV with a header row)
filename = "/content/bank-full.csv"
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
data.printSchema()
data.show(5)

# 3. Select relevant numeric columns for regression
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
target_col = 'balance'
features_list = [col for col in numeric_cols if col != target_col]

# 4. Assemble features into vector
assembler = VectorAssembler(inputCols=features_list, outputCol="features")
pipeline = Pipeline(stages=[assembler])

# 5. Transform the data, keeping only the label and the feature vector
pipeline_model = pipeline.fit(data)
processed_data = pipeline_model.transform(data).select(target_col, "features")

# 6. Split data into train and test sets (fixed seed for a repeatable split)
train_data, test_data = processed_data.randomSplit([0.8, 0.2], seed=42)

# 7. Fit Linear Regression model on the training portion only
lr = LinearRegression(featuresCol="features", labelCol=target_col)
lr_model = lr.fit(train_data)

# 8. Show coefficients and intercept
# Each attribute record in the 'features' metadata carries its position in the
# vector as 'idx'. Look every coefficient up by that index; slicing the first
# len(group) coefficients would mislabel any attribute group after the first.
print("\n📊 Coefficients and Intercept:")
coefficient_values = lr_model.coefficients.toArray()
for k, v in processed_data.schema["features"].metadata["ml_attr"]["attrs"].items():
    features_df = pd.DataFrame(v)
    features_df["coefficient"] = coefficient_values[features_df["idx"].to_numpy()]
    print(features_df)
print(f"Intercept: {lr_model.intercept}")

# 9. Evaluate model on test data
predictions = lr_model.transform(test_data)
# Select the label through target_col so this line follows the configured target.
predictions.select(target_col, "prediction").show(5)

evaluator_rmse = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"\n✅ Model Performance on Test Set:\n- RMSE: {rmse:.2f}\n- R²: {r2:.4f}")


root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)

+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Get or create the Spark session
spark = (
    SparkSession.builder
    .appName("ImprovedBankRegression")
    .getOrCreate()
)

# 2. Read the semicolon-separated CSV and print the inferred schema
filename = "/content/bank-full.csv"
data = spark.read.csv(filename, sep=';', header=True, inferSchema=True)
data.printSchema()

# 3. Column roles: raw numeric inputs, raw string inputs, regression label
numeric_cols = ['age', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
target_col = 'balance'

# 4. One StringIndexer and one OneHotEncoder per categorical column
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index", handleInvalid='keep')
    for name in categorical_cols
]
encoders = [
    OneHotEncoder(inputCol=f"{name}_index", outputCol=f"{name}_encoded")
    for name in categorical_cols
]

# 5. Feature vector = numeric columns followed by the encoded categoricals
feature_cols = [*numeric_cols, *(f"{name}_encoded" for name in categorical_cols)]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# 6. Chain the stages: all indexers, then all encoders, then the assembler
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])

# 7. Fit the preprocessing pipeline and keep only label + feature vector
pipeline_model = pipeline.fit(data)
processed_data = pipeline_model.transform(data).select(target_col, "features")

# 8. 80/20 train-test split with a fixed seed
train_data, test_data = processed_data.randomSplit([0.8, 0.2], seed=42)

# 9. Train the linear regression on the training portion
lr = LinearRegression(featuresCol="features", labelCol=target_col)
lr_model = lr.fit(train_data)

# 10. Score the held-out portion with RMSE and R²
predictions = lr_model.transform(test_data)
evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName=metric)
    for metric in ("rmse", "r2")
)

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"\n✅ Improved Model Performance:\n- RMSE: {rmse:.2f}\n- R²: {r2:.4f}")

# 11. Preview actual vs. predicted values
predictions.select(target_col, "prediction").show(5)


root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)


✅ Improved Model Performance:
- RMSE: 2792.43
- R²: 0.0534
+-------+------------------+
|balance|        prediction|
+-------+------------------+
|  -4057|1951.3662453085174|
|  -2827|-359.0293431723794|
|  -2604|  1191.64433128908|
|  -2049|1609.2305426714872|
|  -1884|1473.9955527153568|
+-------+------------------+
only showing top 5 row

In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Start Spark session
spark = SparkSession.builder.appName("ImprovedBankRegression").getOrCreate()

# 2. Load dataset
# Use the same absolute path as the other cells so this cell does not depend on
# the kernel's current working directory.
filename = "/content/bank-full.csv"
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
data.printSchema()

# 3. Define columns
numeric_cols = ['age', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
target_col = 'balance'

# 4. Index and encode categorical columns (one indexer + one encoder per column)
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid='keep') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_index", outputCol=f"{col}_encoded") for col in categorical_cols]

# 5. Assemble features: numeric columns first, then the encoded categoricals
assembler_inputs = numeric_cols + [f"{col}_encoded" for col in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# 6. Build pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler])

# 7. Transform data, keeping only the label and the feature vector
pipeline_model = pipeline.fit(data)
processed_data = pipeline_model.transform(data).select(target_col, "features")

# 8. Train/test split (fixed seed for a repeatable split)
train_data, test_data = processed_data.randomSplit([0.8, 0.2], seed=42)

# 9. Fit Linear Regression on the training portion only
lr = LinearRegression(featuresCol="features", labelCol=target_col)
lr_model = lr.fit(train_data)

# 10. Evaluate on the held-out portion
predictions = lr_model.transform(test_data)

evaluator_rmse = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

# Plain string: this line has no placeholders, so no f-prefix is needed.
print("\n✅ Improved Linear Regression Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# 11. Show predictions
predictions.select(target_col, "prediction").show(5)

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)


✅ Improved Linear Regression Performance:
RMSE: 2792.43
R²: 0.0534
+-------+------------------+
|balance|        prediction|
+-------+------------------+
|  -4057|1951.3662453085174|
|  -2827|-359.0293431723794|
|  -2604|  1191.64433128908|
|  -2049|1609.2305426714872|
|  -1884|1473.9955527153568|
+-------+------------------+
only showing t