In [None]:
# Azure Blob Storage account and container used by this notebook.
storage_account_name = "mlopsdatastore"
container_name = "titanic"

# The account key is pulled from a Databricks secret scope rather than being
# written into the notebook.
storage_account_key = dbutils.secrets.get(
  key="azuretoken",
  scope="creds",
)

In [None]:
# Make the storage account key available to Spark and mount the Blob Storage
# container under DBFS, unless that mount point already exists.
mount_point = "/mnt/datamount/data"
account_key_conf = "fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account_name)

# Set the Spark configuration
spark.conf.set(account_key_conf, storage_account_key)

# Mount the Blob Storage container.
# Check the existing mounts explicitly instead of wrapping the mount call in a
# catch-all `except`: that way a wrong key, a missing container or a network
# failure raises here instead of being reported as "Already Mounted" and only
# failing later when the data is read.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
  print("Already Mounted")
else:
  dbutils.fs.mount(
    source = "wasbs://{0}@{1}.blob.core.windows.net".format(container_name, storage_account_name),
    mount_point = mount_point,
    extra_configs = {account_key_conf: storage_account_key}
  )

In [None]:
# Location of the training CSV on the mounted storage
file_path = "/mnt/datamount/data/train.csv"

# Load the CSV into a Spark DataFrame: the first line is the header and column
# types are inferred from the data.
train_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load(file_path)
)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql import functions as F


# Clean the training data and remove 'Fare' outliers by Z-score.
#
# `spark` is the session the notebook already uses in earlier cells, so no new
# SparkSession is built (and `spark` is not rebound) here.

# 'train_df' is a PySpark DataFrame; it is re-read here so that this cell can be
# re-run on its own and always starts from the raw file.
train_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/mnt/datamount/data/train.csv")
)

# Drop the 'Cabin' column, then rows with any missing value, then duplicate rows
train_df = train_df.drop("Cabin").na.drop().dropDuplicates()

# Global mean and sample standard deviation of 'Fare' as a one-row DataFrame.
# Aggregating once and broadcasting the result avoids an un-partitioned window
# (Window.orderBy(F.lit(0))), which makes Spark move every row into a single
# partition just to compute two scalars.
fare_stats = train_df.agg(
    F.avg("Fare").alias("_fare_mean"),
    F.stddev("Fare").alias("_fare_stddev"),
)

# Calculate the Z-scores for 'Fare' column: |Fare - mean| / stddev.
# The helper columns are dropped again so 'train_df' ends up with exactly the
# original columns plus 'FareZScore'.
train_df = (
    train_df.crossJoin(F.broadcast(fare_stats))
    .withColumn(
        "FareZScore",
        F.abs(F.col("Fare") - F.col("_fare_mean")) / F.col("_fare_stddev"),
    )
    .drop("_fare_mean", "_fare_stddev")
)

# Set a Z-score threshold for outlier removal
# NOTE(review): 0.1 keeps only rows within a tenth of a standard deviation of
# the mean, which discards most of the data; outlier filters usually use a
# value around 3 -- confirm this is intended. Value left unchanged.
threshold = 0.1

# Create a new DataFrame with outliers removed. Rows whose Z-score is null
# (e.g. an undefined standard deviation) do not pass the filter.
df_no_outliers = train_df.filter(F.col("FareZScore") <= threshold).drop("FareZScore")

df_no_outliers.show()

In [None]:

# Persist the filtered data as a Delta table, replacing whatever was written
# there before.
# NOTE(review): this path is not under the "/mnt/datamount/data" mount point
# used for reading -- confirm the table is meant to live outside the mounted
# container.
(
    df_no_outliers.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/datamount/delta_table")
)

In [None]:
# Display the workspace path of this notebook, taken from the notebook context.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
notebook_context.notebookPath().get()