Importing the libraries and Creating SparkSession

In [1]:
import pandas as pd
from pyspark.sql.functions import col, sum
import pyspark.sql.functions as F
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Income Prediction").config("spark.memory.offHeap.enabled", "true").config("spark.memory.offHeap.size", "10g").getOrCreate()

Reading my data and putting it into a dataframe

In [2]:
df1 = spark.read.csv(
    "adult test.csv",
    header=True,
    inferSchema=True,
    sep=",",
    quote="\"",
    escape="\""
)
df1.show(5, truncate=False)
df1.printSchema()

+---+----------+------+-------------+-------------+-------------------+------------------+------------+------+-------+------------+-------------+--------------+--------------+-------+
|Age|workclass |fnlwgt|education    |education-num|marital-status     |occupation        |relationship|race  |sex    |capital-gain|capital-loss |hours-per-week|native-country|label  |
+---+----------+------+-------------+-------------+-------------------+------------------+------------+------+-------+------------+-------------+--------------+--------------+-------+
|25 | Private  |226802| 11th        |7            | Never-married     | Machine-op-inspct| Own-child  | Black| Male  |0           |0            |40            | United-States| <=50K.|
|38 | Private  |89814 | HS-grad     |9            | Married-civ-spouse| Farming-fishing  | Husband    | White| Male  |0           |0            |50            | United-States| <=50K.|
|28 | Local-gov|336951| Assoc-acdm  |12           | Married-civ-spouse| Protecti

In [None]:
#from pyspark.sql.types import StructType, StructField, IntegerType, StringType

#schema = StructType([
 #   StructField("age", IntegerType(), True),
 #   StructField("workclass", StringType(), True),
 #   StructField("fnlwgt", IntegerType(), True),
 #   StructField("education", StringType(), True),
 #   StructField("education_num", IntegerType(), True),
 #   StructField("marital_status", StringType(), True),
 #   StructField("occupation", StringType(), True),
 #   StructField("relationship", StringType(), True),
 #   StructField("race", StringType(), True),
 #   StructField("sex", StringType(), True),
 #   StructField("capital_gain", IntegerType(), True),
 #   StructField("capital_loss", IntegerType(), True),
 #   StructField("hours_per_week", IntegerType(), True),
 #   StructField("native_country", StringType(), True),
 #   StructField("income", StringType(), True)
#])

#df = spark.read.csv(
 #   "adult test.csv",
  #  header=True,
 #   #schema=schema,
  #  sep=",",
  #  quote="\"",
  #  escape="\""
#)
#pandas_df = df.toPandas()
#pandas_df

In [3]:
df2 = spark.read.csv(
    "adult train.csv",
    header=True,
    inferSchema=True,
    sep=",",
    quote="\"",
    escape="\""
)
df2.show(5, truncate=False)
df2.printSchema()

+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|Age|workclass        |fnlwgt|education |education-num|marital-status     |occupation        |relationship  |race  |sex    |capital-gain|capital-loss|hours-per-week|native-country|label |
+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|39 | State-gov       |77516 | Bachelors|13           | Never-married     | Adm-clerical     | Not-in-family| White| Male  |2174        |0           |40            | United-States| <=50K|
|50 | Self-emp-not-inc|83311 | Bachelors|13           | Married-civ-spouse| Exec-managerial  | Husband      | White| Male  |0           |0           |13            | United-States| <=50K|
|38 | Private         |215646| HS-grad  |9            | Divo

EDA

In [4]:
df1.describe().show()

+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+------------------+------------------+--------------+-------+
|summary|               Age|   workclass|            fnlwgt|    education|    education-num|marital-status|       occupation|relationship|               race|    sex|      capital-gain|     capital-loss |    hours-per-week|native-country|  label|
+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+------------------+------------------+--------------+-------+
|  count|             16281|       16281|             16281|        16281|            16281|         16281|            16281|       16281|              16281|  16281|             16281|             16281|             16281|         16281|  16281|
|   mean| 38

In [5]:
df2.describe().show()

+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+-------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|summary|               Age|   workclass|            fnlwgt|    education|    education-num|marital-status|       occupation|relationship |               race|    sex|      capital-gain|    capital-loss|    hours-per-week|native-country| label|
+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+-------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|  count|             32561|       32561|             32561|        32561|            32561|         32561|            32561|        32561|              32561|  32561|             32561|           32561|             32561|         32561| 32561|
|   mean| 38.5816467

Data Preprocessing

Checking for missing values in dataframe 1 (adult test) & dataframe 2 (adult train)

In [6]:
df1.select([(F.sum(F.col(c).isNull().cast("int")).alias(c)) for c in df1.columns]).show()

+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+-------------+--------------+--------------+-----+
|Age|workclass|fnlwgt|education|education-num|marital-status|occupation|relationship|race|sex|capital-gain|capital-loss |hours-per-week|native-country|label|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+-------------+--------------+--------------+-----+
|  0|        0|     0|        0|            0|             0|         0|           0|   0|  0|           0|            0|             0|             0|    0|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+-------------+--------------+--------------+-----+



We can see from here that there are no missing values in df1

In [7]:
df2.select([(F.sum(F.col(c).isNull().cast("int")).alias(c)) for c in df2.columns]).show()

+---+---------+------+---------+-------------+--------------+----------+-------------+----+---+------------+------------+--------------+--------------+-----+
|Age|workclass|fnlwgt|education|education-num|marital-status|occupation|relationship |race|sex|capital-gain|capital-loss|hours-per-week|native-country|label|
+---+---------+------+---------+-------------+--------------+----------+-------------+----+---+------------+------------+--------------+--------------+-----+
|  0|        0|     0|        0|            0|             0|         0|            0|   0|  0|           0|           0|             0|             0|    0|
+---+---------+------+---------+-------------+--------------+----------+-------------+----+---+------------+------------+--------------+--------------+-----+



We can see from here that there are no missing values in df2 as well

Encoding categorical variables in both data frames

In [17]:
data = df1

# List of categorical columns to encode - remove 'label' from this list
categorical_columns = ["workclass", "education", "marital-status", "occupation",
                      "relationship", "race", "sex", "native-country"]

# Creating StringIndexers for each categorical column
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_index")
    for col in categorical_columns
]

# Creating OneHotEncoders for each indexed column
encoders = [
    OneHotEncoder(inputCol=f"{col}_index", outputCol=f"{col}_encoded")
    for col in categorical_columns
]

# Add StringIndexer for the label column separately
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")

# Combining indexers and encoders to a pipeline
pipeline = Pipeline(stages=indexers + [label_indexer] + encoders)

# Fitting the pipeline to the data and transform it
pipeline_model = pipeline.fit(data)
transformed_data = pipeline_model.transform(data)

# Create a list of the encoded feature column names for later use
encoded_feature_cols = [f"{col}_encoded" for col in categorical_columns]

# Setting pandas display option to show all columns
pd.set_option("display.max_columns", None)

# Displaying the transformed dataframe
x = transformed_data.toPandas()
x.head()

Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,workclass_index,education_index,marital-status_index,occupation_index,relationship_index,race_index,sex_index,native-country_index,label_index,workclass_encoded,education_encoded,marital-status_encoded,occupation_encoded,relationship_encoded,race_encoded,sex_encoded,native-country_encoded
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.,0.0,5.0,1.0,6.0,2.0,1.0,0.0,0.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.,2.0,6.0,0.0,12.0,0.0,0.0,0.0,0.0,1.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.,0.0,1.0,0.0,6.0,0.0,1.0,0.0,0.0,1.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.,3.0,1.0,1.0,7.0,2.0,0.0,1.0,0.0,0.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)",(0.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [11]:
x.columns

Index(['Age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss ', 'hours-per-week', 'native-country',
       'label', 'workclass_index', 'education_index', 'marital-status_index',
       'occupation_index', 'relationship_index', 'race_index', 'sex_index',
       'native-country_index', 'label_index', 'workclass_encoded',
       'education_encoded', 'marital-status_encoded', 'occupation_encoded',
       'relationship_encoded', 'race_encoded', 'sex_encoded',
       'native-country_encoded'],
      dtype='object')

In [16]:
df2

DataFrame[Age: int, workclass: string, fnlwgt: int, education: string, education-num: int, marital-status: string, occupation: string, relationship : string, race: string, sex: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, label: string]

In [19]:
data1 = df2

# List of categorical columns - MODIFY THIS based on your actual columns
categorical_columns = [col for col in [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country"
] if col in data1.columns]

# Creating StringIndexers for each categorical column
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_index")
    for col in categorical_columns
]

# Creating OneHotEncoders for each indexed column
encoders = [
    OneHotEncoder(inputCol=f"{col}_index", outputCol=f"{col}_encoded")
    for col in categorical_columns
]

# Add StringIndexer for the label column separately
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")

# Combining indexers and encoders to a pipeline
pipeline = Pipeline(stages=indexers + [label_indexer] + encoders)

# Fitting the pipeline to the data and transform it
pipeline_model1 = pipeline.fit(data1)
transformed_data1 = pipeline_model1.transform(data1)

# Create a list of the encoded feature column names for later use
encoded_feature_cols = [f"{col}_encoded" for col in categorical_columns]

# Setting pandas display option to show all columns
pd.set_option("display.max_columns", None)

# Displaying the transformed dataframe
x = transformed_data1.toPandas()
x.head()

Categorical columns found: ['workclass', 'education', 'marital-status', 'occupation', 'race', 'sex', 'native-country']


Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,workclass_index,education_index,marital-status_index,occupation_index,race_index,sex_index,native-country_index,label_index,workclass_encoded,education_encoded,marital-status_encoded,occupation_encoded,race_encoded,sex_encoded,native-country_encoded
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,4.0,2.0,1.0,3.0,0.0,0.0,0.0,0.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0.0,0.0,2.0,9.0,0.0,0.0,0.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0.0,5.0,0.0,9.0,1.0,0.0,0.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0.0,2.0,0.0,0.0,1.0,1.0,9.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0)",(0.0),"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Scaling our numerical features in both dataframes to a common scale

In [24]:
#list of numerical columns to scale
numerical_columns = ["Age", "fnlwgt", "education-num", "capital-gain","hours-per-week"]

#Assembling numerical columns into a feature vector
assembler = VectorAssembler(inputCols = numerical_columns, outputCol = "numerical_features")
df_vector = assembler.transform(data)

#Initialize the StandardScaler
scaler = StandardScaler(inputCol = "numerical_features", outputCol = "scaled_features", withMean = True, withStd  = True)

#Fit the scaler to the first dataframe and transform
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

#Converting vector to an array column
df_final = df_scaled.withColumn("scaled_features_array", vector_to_array("scaled_features"))
for i, col_name in enumerate(numerical_columns):
    df_final = df_final.withColumn(f"scaled_{col_name}", df_final["scaled_features_array"][i])

#Drop intermediate columns
df_final = df_final.drop("numerical features", "scaled_features", "scaled_features_array")

In [21]:
test_set = df_final.toPandas()
test_set

Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,numerical_features,scaled_Age,scaled_fnlwgt,scaled_education-num,scaled_capital-gain,scaled_hours-per-week
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.,"[25.0, 226802.0, 7.0, 0.0, 40.0]",-0.994099,0.353463,-1.196827,-0.142657,-0.031431
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.,"[38.0, 89814.0, 9.0, 0.0, 50.0]",-0.055415,-0.942362,-0.417873,-0.142657,0.769894
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.,"[28.0, 336951.0, 12.0, 0.0, 40.0]",-0.777480,1.395407,0.750559,-0.142657,-0.031431
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.,"[44.0, 160323.0, 10.0, 7688.0, 40.0]",0.377823,-0.275389,-0.028396,0.871064,-0.031431
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.,"[18.0, 103497.0, 10.0, 0.0, 30.0]",-1.499544,-0.812929,-0.028396,-0.142657,-0.832756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.,"[39.0, 215419.0, 13.0, 0.0, 36.0]",0.016791,0.245787,1.140036,-0.142657,-0.351961
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K.,"[64.0, 321403.0, 9.0, 0.0, 40.0]",1.821951,1.248332,-0.417873,-0.142657,-0.031431
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.,"[38.0, 374983.0, 13.0, 0.0, 50.0]",-0.055415,1.755167,1.140036,-0.142657,0.769894
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.,"[44.0, 83891.0, 13.0, 5455.0, 40.0]",0.377823,-0.998390,1.140036,0.576626,-0.031431


In [30]:
# List of numerical columns to scale
numerical_columns = ["Age", "fnlwgt", "education-num", "capital-gain", "hours-per-week"]

# Assembling numerical columns into a feature vector for data1
assembler = VectorAssembler(inputCols=numerical_columns, outputCol="numerical_features")
df_vector1 = assembler.transform(data1)

# Initialize the StandardScaler (reusing the same scaler as before)
scaler = StandardScaler(inputCol="numerical_features", outputCol="scaled_features", withMean=True, withStd=True)

# Fit the scaler to data1 and transform
scaler_model1 = scaler.fit(df_vector1)
df_scaled1 = scaler_model1.transform(df_vector1)

# Converting vector to an array column for data1
df_final1 = df_scaled1.withColumn("scaled_features_array", vector_to_array("scaled_features"))
for i, col_name in enumerate(numerical_columns):
    df_final1 = df_final1.withColumn(f"scaled_{col_name}", df_final1["scaled_features_array"][i])

# Drop intermediate columns for data1
df_final1 = df_final1.drop("numerical_features", "scaled_features", "scaled_features_array")

In [None]:
df_final1.show(5, 0)

+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+--------------------+-------------------+--------------------+--------------------+---------------------+
|Age|workclass        |fnlwgt|education |education-num|marital-status     |occupation        |relationship  |race  |sex    |capital-gain|capital-loss|hours-per-week|native-country|label |scaled_Age          |scaled_fnlwgt      |scaled_education-num|scaled_capital-gain |scaled_hours-per-week|
+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+--------------------+-------------------+--------------------+--------------------+---------------------+
|39 | State-gov       |77516 | Bachelors|13           | Never-married     | Adm-clerical     | Not-in-family| White| Male

In [None]:
df_final1.columns

['Age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship ',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'label',
 'scaled_Age',
 'scaled_fnlwgt',
 'scaled_education-num',
 'scaled_capital-gain',
 'scaled_hours-per-week']

Perfroming cross validation on my train set to avoid the model from being high bias or high variance, meaning we want a our model to not just perform well on our training data but equally perform well on our test data

In [32]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer

# First, convert the label column from string to numeric using StringIndexer
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
data_indexed = label_indexer.fit(df_final1).transform(df_final1)

# Prepare the feature vector for logistic regression
assembler_final = VectorAssembler(
    inputCols=[f"scaled_{col}" for col in numerical_columns],
    outputCol="features"
)
data_ready = assembler_final.transform(data_indexed)

# Initialize Logistic Regression model
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label_index",  # Changed to use the indexed label
    maxIter=100
)

# Create parameter grid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# Create evaluator
evaluator = BinaryClassificationEvaluator(
    labelCol="label_index",  # Changed to use the indexed label
    metricName="areaUnderROC"
)

# Create CrossValidator
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2
)

# Split the data
train_data, test_data = data_ready.randomSplit([0.8, 0.2], seed=42)

# Fit the CrossValidator to find the best model
cv_model = crossval.fit(train_data)

# Get the best model
best_model = cv_model.bestModel

# Make predictions on test data
predictions = best_model.transform(test_data)

# Evaluate the model
auc_roc = evaluator.evaluate(predictions)
print(f"\nBest Model Performance:")
print(f"Area Under ROC: {auc_roc:.3f}")

# Print best parameters
print("\nBest Model Parameters:")
print(f"RegParam: {best_model.getRegParam()}")
print(f"ElasticNetParam: {best_model.getElasticNetParam()}")

# Get feature coefficients
coefficients = best_model.coefficients
print("\nFeature Coefficients:")
for feature, coef in zip(numerical_columns, coefficients):
    print(f"{feature}: {coef:.4f}")

# Calculate additional metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create evaluator for accuracy
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index",  # Changed to use the indexed label
    predictionCol="prediction"
)

# Calculate accuracy
accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
print(f"\nAccuracy: {accuracy:.3f}")

# Show confusion matrix
predictions.groupBy("label_index", "prediction").count().show()


Best Model Performance:
Area Under ROC: 0.817

Best Model Parameters:
RegParam: 0.01
ElasticNetParam: 0.0

Feature Coefficients:
Age: 0.5564
fnlwgt: 0.0581
education-num: 0.7806
capital-gain: 1.0982
hours-per-week: 0.4758

Accuracy: 0.801
+-----------+----------+-----+
|label_index|prediction|count|
+-----------+----------+-----+
|        1.0|       1.0|  514|
|        0.0|       1.0|  227|
|        1.0|       0.0| 1063|
|        0.0|       0.0| 4681|
+-----------+----------+-----+

