In [1]:
import os

import findspark

# Locate the Spark installation. SPARK_HOME takes precedence when it is set and
# non-empty; otherwise fall back to the path this notebook was written against,
# so the notebook is no longer tied to one machine's directory layout.
DEFAULT_SPARK_HOME = '/home/ubuntu/spark-3.2.1-bin-hadoop2.7'
findspark.init(os.environ.get('SPARK_HOME') or DEFAULT_SPARK_HOME)

# pyspark is imported only after findspark.init() has run.
import pyspark
from pyspark.sql import SparkSession

# Session handle used by every later cell; the application is named 'basics'.
spark = SparkSession.builder.appName('basics').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/23 16:04:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the CSV into a Spark DataFrame: the first row is the header and
# column types are inferred from the values.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
data = csv_reader.csv('Dataset/new_data.csv')

In [3]:
# Show the column names and the types Spark inferred when reading the CSV
data.printSchema()

root
 |-- Heart Attack Risk (1: Yes): integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- Country_indexed: integer (nullable = true)
 |-- Diastolic: integer (nullable = true)
 |-- Exercise Hours Per Week: double (nullable = true)
 |-- Heart Rate: integer (nullable = true)
 |-- Triglycerides: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Sedentary Hours Per Day: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Stress Level: integer (nullable = true)



## 1st Iteration

In [4]:
# Columns excluded from the 1st-iteration model
dropped_columns = (
    "Stress Level",
    "Sedentary Hours Per Day",
    "Cholesterol",
    "Heart Rate",
)
data1 = data.drop(*dropped_columns)

In [5]:
# Check which columns remain in data1
data1.printSchema()

root
 |-- Heart Attack Risk (1: Yes): integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Country_indexed: integer (nullable = true)
 |-- Diastolic: integer (nullable = true)
 |-- Exercise Hours Per Week: double (nullable = true)
 |-- Triglycerides: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Age: integer (nullable = true)



In [6]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler

# Label column; every other column still present in data1 is a model input
target_column = "Heart Attack Risk (1: Yes)"
feature_columns = [name for name in data1.columns if name != target_column]

# Pack the inputs into one vector column named "features" and keep only that
# column plus the label.
# NOTE: data1 is rebound here, so this cell is meant to be run once after the
# cell that creates data1.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
assembled = assembler.transform(data1)
data1 = assembled.select("features", target_column)

# 80/20 train/test split with a fixed seed
train_data, test_data = data1.randomSplit(weights=[0.8, 0.2], seed=42)

In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import expr

# Accuracy metric: compares the "prediction" column against the label column
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=target_column,
)

# Train logistic regression on the training split
lr = LogisticRegression(featuresCol="features", labelCol=target_column)
lr_model = lr.fit(train_data)

# Score both splits with the fitted model
train_predictions, test_predictions = (
    lr_model.transform(split) for split in (train_data, test_data)
)

# Accuracy on each split
train_accuracy = evaluator.evaluate(train_predictions)
test_accuracy = evaluator.evaluate(test_predictions)

# Report both scores rounded to two decimals
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

24/05/23 16:04:52 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/23 16:04:52 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


Training Accuracy: 0.53
Testing Accuracy: 0.47


In [8]:
# Confusion table for the test split: one row per predicted class, one column
# per actual label value, each cell holding the number of test rows.
cross_tab = (
    test_predictions
    .groupBy("prediction")
    .pivot(target_column)
    .count()
    .na.fill(0)             # a (prediction, label) pair with no rows shows 0, not null
    .orderBy("prediction")  # groupBy gives no row-order guarantee; sort for a stable display
)

# Display the cross-tabulation
cross_tab.show()

+----------+---+---+
|prediction|  0|  1|
+----------+---+---+
|       0.0| 29| 34|
|       1.0| 52| 47|
+----------+---+---+



## 2nd Iteration

In [12]:
# Resample the full dataset with replacement at fraction 2.0; the seed is
# fixed so the draw is repeatable.
boost_data = data.sample(
    withReplacement=True,
    fraction=2.0,
    seed=42,
)

In [13]:
# Shape of the resampled data
n_rows = boost_data.count()
n_cols = len(boost_data.columns)
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

Number of rows: 2022
Number of columns: 12


In [14]:
# Columns excluded from the 2nd-iteration model (applied to the resampled data)
dropped_columns = (
    "Stress Level",
    "Sedentary Hours Per Day",
    "Cholesterol",
    "Heart Rate",
)
data2 = boost_data.drop(*dropped_columns)

In [15]:
# Schema and shape of data2
data2.printSchema()
row_count, column_count = data2.count(), len(data2.columns)
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

root
 |-- Heart Attack Risk (1: Yes): integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Country_indexed: integer (nullable = true)
 |-- Diastolic: integer (nullable = true)
 |-- Exercise Hours Per Week: double (nullable = true)
 |-- Triglycerides: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Age: integer (nullable = true)

Number of rows: 2022
Number of columns: 8


In [16]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler

# Label column; all remaining data2 columns become model inputs
target_column = "Heart Attack Risk (1: Yes)"
feature_columns = list(filter(lambda name: name != target_column, data2.columns))

# Build the single "features" vector column and keep it alongside the label.
# data2 is rebound here, so run this cell once after data2 is created.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
data2 = assembler.transform(data2).select("features", target_column)

# 80/20 train/test split with a fixed seed.
# NOTE(review): data2 comes from boost_data, which was drawn with replacement
# (fraction=2.0), so copies of one original row can end up on both sides of
# this split — confirm whether the split should happen before resampling.
train_data, test_data = data2.randomSplit(weights=[0.8, 0.2], seed=42)

In [17]:
# Train logistic regression on the 2nd-iteration training split
lr = LogisticRegression(featuresCol="features", labelCol=target_column)
lr_model = lr.fit(train_data)

# Accuracy metric: compares the "prediction" column against the label column
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=target_column,
)

# Score the training split and measure its accuracy
train_predictions = lr_model.transform(train_data)
train_accuracy = evaluator.evaluate(train_predictions)

# Score the held-out split and measure its accuracy
test_predictions = lr_model.transform(test_data)
test_accuracy = evaluator.evaluate(test_predictions)

# Report both scores rounded to two decimals
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

24/05/23 16:11:01 WARN BlockManager: Asked to remove block broadcast_53_piece0, which does not exist


Training Accuracy: 0.53
Testing Accuracy: 0.51


In [18]:
# Confusion table for the test split: one row per predicted class, one column
# per actual label value, each cell holding the number of test rows.
cross_tab = (
    test_predictions
    .groupBy("prediction")
    .pivot(target_column)
    .count()
    .na.fill(0)             # a (prediction, label) pair with no rows shows 0, not null
    .orderBy("prediction")  # groupBy gives no row-order guarantee; sort for a stable display
)

# Display the cross-tabulation
cross_tab.show()

+----------+---+---+
|prediction|  0|  1|
+----------+---+---+
|       0.0| 65| 77|
|       1.0|101|117|
+----------+---+---+



## 3rd Iteration

In [19]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler

# Label column; this iteration uses every other boost_data column as an input
target_column = "Heart Attack Risk (1: Yes)"
feature_columns = [name for name in boost_data.columns if name != target_column]

# Build the single "features" vector column and keep it alongside the label
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
assembled = assembler.transform(boost_data)
data3 = assembled.select("features", target_column)

# 80/20 train/test split with a fixed seed.
# NOTE(review): boost_data was drawn with replacement (fraction=2.0), so copies
# of one original row can end up on both sides of this split — confirm whether
# the split should happen before resampling.
train_data, test_data = data3.randomSplit(weights=[0.8, 0.2], seed=42)

In [21]:
# Fit the Logistic Regression model on the 3rd-iteration training split
lr = LogisticRegression(labelCol=target_column, featuresCol="features")
lr_model = lr.fit(train_data)

# Make predictions on the training and testing sets
train_predictions = lr_model.transform(train_data)
test_predictions = lr_model.transform(test_data)

# Accuracy metric: compares the "prediction" column against the label column
evaluator = MulticlassClassificationEvaluator(
    labelCol=target_column, predictionCol="prediction", metricName="accuracy"
)

# Calculate accuracy scores for training and testing sets
train_accuracy = evaluator.evaluate(train_predictions)
test_accuracy = evaluator.evaluate(test_predictions)

# Print accuracy scores rounded to two decimals
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

# Confusion table for the test split: one row per predicted class, one column
# per actual label value, each cell holding the number of test rows.
cross_tab = (
    test_predictions
    .groupBy("prediction")
    .pivot(target_column)
    .count()
    .na.fill(0)             # a (prediction, label) pair with no rows shows 0, not null
    .orderBy("prediction")  # groupBy gives no row-order guarantee; sort for a stable display
)

# Display the cross-tabulation
cross_tab.show()

Training Accuracy: 0.55
Testing Accuracy: 0.51
+----------+---+---+
|prediction|  0|  1|
+----------+---+---+
|       0.0| 73| 84|
|       1.0| 93|110|
+----------+---+---+

