In [0]:
%sql
-- Peek at up to 10 rows of workspace.raw.protein whose blast_of_id is NULL.
SELECT *
FROM workspace.raw.protein
WHERE blast_of_id IS NULL
LIMIT 10

In [0]:
from pyspark.sql.functions import regexp_extract, col, lag, collect_list, concat_ws
from pyspark.sql.window import Window

# Read the FASTA file with the text source: one row per line, in a single
# string column named "value".
fasta_df = spark.read.format("text").load("/Volumes/workspace/raw/input/700.fasta")

# Render the raw lines in the notebook output.
display(fasta_df)


In [0]:
from pyspark.sql.functions import regexp_extract, col

# Read the FASTA file: one row per line, in a single "value" column.
fasta_df = spark.read.format("text").load("/Volumes/workspace/raw/input/700.fasta")

# Keep only the header lines (those starting with ">") and capture everything
# after the ">" marker as the accession.
accession_df = (
    fasta_df
    .where(col("value").startswith(">"))
    .withColumn("accession", regexp_extract(col("value"), ">(.+)", 1))
)

# Narrow to the accession column and render it.
accession_numbers_df = accession_df.select("accession")
display(accession_numbers_df)

In [0]:
from pyspark.sql.functions import (
    monotonically_increasing_id, regexp_extract, col, lag, lead, collect_list,
    concat_ws, sort_array, struct,
)
from pyspark.sql.window import Window

fasta_df = spark.read.text("/Volumes/workspace/raw/input/700.fasta")

# Tag every line with a unique, increasing id so lines can be ordered later.
# NOTE(review): this assumes the generated ids follow the line order of the
# file - confirm if the input ever becomes more than one file.
fasta_df = fasta_df.withColumn("line_num", monotonically_increasing_id())

# Header lines start with ">"; everything after the marker is the accession.
accession_df = fasta_df.filter(col("value").startswith(">"))
accession_df = accession_df.withColumn("accession", regexp_extract(col("value"), ">(.+)", 1))

# Global ordering by line number. There is no partitionBy, so Spark evaluates
# this window in a single partition; only header rows go through it.
window_spec = Window.orderBy("line_num")

# For each header, look up the line number of the following header (NULL for
# the last record). Together they delimit the record's sequence lines.
accession_line_df = accession_df.select("line_num", "accession")
accession_line_df = accession_line_df.withColumn("next_line_num", lead("line_num", 1).over(window_spec))

# Give the header's line number its own name so the join condition below is
# resolved by unique column names instead of through two DataFrames that share
# the same lineage (which can be ambiguous in a self-join).
record_bounds_df = accession_line_df.withColumnRenamed("line_num", "header_line_num")

# Attach each sequence line to the header whose range contains it. The header
# line itself is excluded (strict ">"); lines before the first header match no
# range and are dropped. line_num is kept so the lines can be re-ordered.
seq_df = fasta_df.join(
    record_bounds_df,
    (col("line_num") > col("header_line_num"))
    & (col("next_line_num").isNull() | (col("line_num") < col("next_line_num"))),
).select("accession", "line_num", "value")

# collect_list gives no ordering guarantee after the shuffles introduced by the
# join and groupBy, so collect (line_num, value) pairs, sort them by line_num,
# and only then concatenate the values.
# Records that share the same header text are merged into one row here.
seq_by_accession = (
    seq_df.groupBy("accession")
    .agg(sort_array(collect_list(struct("line_num", "value"))).alias("ordered_lines"))
    .select("accession", concat_ws("", col("ordered_lines.value")).alias("sequence"))
)

# Show accession numbers and their sequences
seq_by_accession.display()


In [0]:
from pyspark.sql.functions import (
    monotonically_increasing_id, regexp_extract, col, lag, lead, collect_list,
    concat_ws, current_timestamp, concat, lit, sort_array, struct,
)
from pyspark.sql.window import Window

fasta_df = spark.read.text("/Volumes/workspace/raw/input/700.fasta")

# Tag every line with a unique, increasing id so lines can be ordered later.
# NOTE(review): this assumes the generated ids follow the line order of the
# file - confirm if the input ever becomes more than one file.
fasta_df = fasta_df.withColumn("line_num", monotonically_increasing_id())

# Header lines start with ">"; everything after the marker is the accession.
accession_df = fasta_df.filter(col("value").startswith(">"))
accession_df = accession_df.withColumn("accession", regexp_extract(col("value"), ">(.+)", 1))

# Global ordering by line number. There is no partitionBy, so Spark evaluates
# this window in a single partition; only header rows go through it.
window_spec = Window.orderBy("line_num")

# For each header, look up the line number of the following header (NULL for
# the last record). Together they delimit the lines belonging to the record.
accession_line_df = accession_df.select("line_num", "accession")
accession_line_df = accession_line_df.withColumn("next_line_num", lead("line_num", 1).over(window_spec))

# Give the header's line number its own name so the join condition below is
# resolved by unique column names instead of through two DataFrames that share
# the same lineage (which can be ambiguous in a self-join).
record_bounds_df = accession_line_df.withColumnRenamed("line_num", "header_line_num")

# Attach every line, header included (">="), to the record whose range contains
# it. line_num is kept so the lines can be put back in file order.
seq_df = fasta_df.join(
    record_bounds_df,
    (col("line_num") >= col("header_line_num"))
    & (col("next_line_num").isNull() | (col("line_num") < col("next_line_num"))),
).select("accession", "line_num", "value")

# collect_list gives no ordering guarantee after the shuffles introduced by the
# join and groupBy, which could place the header or sequence lines out of
# order. Collect (line_num, value) pairs, sort by line_num, then join the
# values with newlines to rebuild the record text.
# Records that share the same header text are merged into one row here.
seq_by_accession = (
    seq_df.groupBy("accession")
    .agg(sort_array(collect_list(struct("line_num", "value"))).alias("ordered_lines"))
    .select("accession", concat_ws("\n", col("ordered_lines.value")).alias("fasta_sequence"))
)

# Add create/update timestamps, expose the accession as "id", and fix the
# column order of the output.
final_df = (
    seq_by_accession
    .withColumn("record_create_ts", current_timestamp())
    .withColumn("record_update_ts", current_timestamp())
    .withColumnRenamed("accession", "id")
    .select("record_create_ts", "record_update_ts", "id", "fasta_sequence")
)

final_df.display()


In [0]:
# final_df is built with groupBy("accession"), so its ids are unique by
# construction and counting duplicates on it can never report anything.
# Count header occurrences in accession_df instead: an id listed here appears
# on more than one ">" line in the input, and those records were merged into a
# single final_df row.
accession_df.groupBy(col("accession").alias("id")).count().filter(col("count") > 1).display()

In [0]:
# Count the rows of final_df; the number is shown as the cell's output.
final_df.count()

In [0]:
%sql
-- Total number of rows currently stored in workspace.raw.protein.
SELECT COUNT(*)
FROM workspace.raw.protein

In [0]:
target_table = "workspace.raw.protein"

# A plain append is not idempotent: re-running this cell (or the whole
# notebook) would insert every record a second time. When the target table
# already exists and has an "id" column, drop the rows whose id is already
# stored before appending. Otherwise fall back to appending everything, which
# matches the previous behaviour.
if spark.catalog.tableExists(target_table) and "id" in spark.table(target_table).columns:
    existing_ids_df = spark.table(target_table).select("id").distinct()
    records_to_append_df = (
        final_df
        .join(existing_ids_df, on="id", how="left_anti")
        .select(*final_df.columns)  # restore final_df's column order after the join
    )
else:
    records_to_append_df = final_df

# Append the remaining records to the UC table. mergeSchema lets Delta evolve
# the table schema if the DataFrame carries columns the table does not have.
records_to_append_df.write.format("delta") \
                    .mode("append") \
                    .option("mergeSchema", "true") \
                    .saveAsTable(target_table)