In [5]:
# Imports for the Spark session, DataFrame functions/types, and ML feature pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


In [6]:
# Start (or attach to) the Spark session used by the rest of this notebook.
spark = SparkSession.builder.appName("SparkStandaloneTest").getOrCreate()

# NOTE(review): getOrCreate() is documented to return an already-running
# session when one exists, so this second builder presumably reuses the
# session created above rather than starting a new one -- confirm which
# application name is actually intended and drop the other call.
spark = (
    SparkSession.builder
    .appName("GCSFilesRead")
    .getOrCreate()
)

# Path of the CSV file to load.
file_location = "/home/bx2051/glucose.csv"

def read_csv_with_inferred_schema(file_path):
    """
    Load a CSV file into a DataFrame, letting Spark infer the column types.

    The reader is configured with ``header=true`` and ``inferSchema=true``.
    No further transformation is applied to the loaded data. The
    module-level ``spark`` session is used for reading.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pyspark.sql.DataFrame: The loaded DataFrame with inferred schema.
    """
    # Configure the reader step by step, then load the file.
    reader = spark.read.option("header", "true")
    reader = reader.option("inferSchema", "true")
    return reader.csv(file_path)


# Load the CSV at `file_location` into a DataFrame with inferred column types.
df = read_csv_with_inferred_schema(file_location)

In [None]:
from pyspark.sql.functions import col, min, max, when

def normalize_columns(data):
    """
    Min-max normalizes the numeric columns of a DataFrame.

    Every numeric column except "glucose" and "sy_glucose" is rescaled with
    ``(value - min) / (max - min)``. A column whose minimum equals its
    maximum is shifted by its minimum instead, to avoid dividing by zero.
    Non-numeric columns (e.g. strings) and columns with no non-null values
    are returned unchanged, because arithmetic on them is not meaningful.

    All minima and maxima are computed in a single aggregation pass instead
    of two separate Spark jobs per column.

    Args:
        data (pyspark.sql.DataFrame): The input DataFrame.

    Returns:
        pyspark.sql.DataFrame: The DataFrame with normalized columns.
    """
    # Local imports keep this function independent of the module-level
    # ``min``/``max`` names, which shadow the Python builtins.
    from pyspark.sql import functions as F
    from pyspark.sql.types import NumericType

    excluded = {"glucose", "sy_glucose"}
    columns_to_normalize = [
        field.name
        for field in data.schema.fields
        if field.name not in excluded
        and isinstance(field.dataType, NumericType)
    ]

    # agg() needs at least one expression, so bail out early.
    if not columns_to_normalize:
        return data

    # Build [min(c0), max(c0), min(c1), max(c1), ...] and run one job.
    agg_exprs = []
    for col_name in columns_to_normalize:
        agg_exprs.extend((F.min(col_name), F.max(col_name)))
    stats = data.agg(*agg_exprs).collect()[0]

    for index, col_name in enumerate(columns_to_normalize):
        min_value = stats[2 * index]
        max_value = stats[2 * index + 1]

        # No non-null values in this column: nothing to rescale.
        if min_value is None or max_value is None:
            continue

        if min_value != max_value:  # Avoid division by zero
            scaled = (F.col(col_name) - min_value) / (max_value - min_value)
        else:
            scaled = F.col(col_name) - min_value
        data = data.withColumn(col_name, scaled)

    return data