## Load the data to hdfs

In [5]:
!/hadoop/bin/hdfs dfs -mkdir /data
!/hadoop/bin/hdfs dfs -put /LC528232.csv /data/
!/hadoop/bin/hdfs dfs -ls /data

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/apache-tez-0.10.2-bin/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
mkdir: `/data': File exists
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/apache-tez-0.10.2-bin/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
SLF4J: Class path contains multi

In [117]:
!/hadoop/bin/hdfs dfs -ls /data

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/hadoop-3.3.1/share/hadoop/common/lib/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/apache-tez-0.10.2-bin/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Found 1 items
-rw-r--r--   1 root supergroup  532830464 2024-02-25 19:36 /data/GlobalLandTemperaturesByCity.csv


## Step 1: Initializing the PySpark Session

In [1]:
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder.appName("Genomic Data Analysis").enableHiveSupport().getOrCreate()

24/02/25 20:45:38 WARN Utils: Your hostname, localhost resolves to a loopback address: 127.0.0.1; using 172.17.0.2 instead (on interface eth0)
24/02/25 20:45:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/25 20:45:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/25 20:45:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Step 2: Loading the Environmental Data

In [116]:
# Define schema for the CSV file
from pyspark.sql.types import StructType, StringType
schema1 = StructType().add("sequence", StringType(), True)
# Load genomic data into PySpark DataFrame
genomic_data_df = spark.read.csv("hdfs://0.0.0.0:9000/data/LC528232.csv", schema=schema1)
genomic_data_df.show()

+--------------------+
|            sequence|
+--------------------+
|TGCTTATGAAAATTTTA...|
|AAACGAACTTTAAAATC...|
|AACTAATTACTGTCGTT...|
|GTGTTGCAGCCGATCAT...|
|GTCCCTGGTTTCAACGA...|
|TACGTGGCTTTGGAGAC...|
|TGGCTTAGTAGAAGTTG...|
|GATGCTCGAACTGCACC...|
|GTCGTAGTGGTGAGACA...|
|TCTTCTTCGTAAGAACG...|
|TTAGGCGACGAGCTTGG...|
|GTGTTACCCGTGAACTC...|
|TGGCCCTGATGGCTACC...|
|TTGTCCGAACAACTGGA...|
|TTGCTTGGTACACGGAA...|
|GAAATTTGACACCTTCA...|
|CAACCAAGGGTTGAAAA...|
|CACCAAATGAATGCAAC...|
|GCAGACGGGCGATTTTG...|
|ACTACTTGTGGTTACTT...|
+--------------------+
only showing top 20 rows



In [21]:
# Load genomic data into PySpark DataFrame
genomic_data_df = spark.read.csv("hdfs://0.0.0.0:9000/data/LC528232.csv", schema=schema1)

In [22]:
genomic_data_df.show()

+--------------------+
|            sequence|
+--------------------+
|TGCTTATGAAAATTTTA...|
|AAACGAACTTTAAAATC...|
|AACTAATTACTGTCGTT...|
|GTGTTGCAGCCGATCAT...|
|GTCCCTGGTTTCAACGA...|
|TACGTGGCTTTGGAGAC...|
|TGGCTTAGTAGAAGTTG...|
|GATGCTCGAACTGCACC...|
|GTCGTAGTGGTGAGACA...|
|TCTTCTTCGTAAGAACG...|
|TTAGGCGACGAGCTTGG...|
|GTGTTACCCGTGAACTC...|
|TGGCCCTGATGGCTACC...|
|TTGTCCGAACAACTGGA...|
|TTGCTTGGTACACGGAA...|
|GAAATTTGACACCTTCA...|
|CAACCAAGGGTTGAAAA...|
|CACCAAATGAATGCAAC...|
|GCAGACGGGCGATTTTG...|
|ACTACTTGTGGTTACTT...|
+--------------------+
only showing top 20 rows



## Step 3: Data Cleaning and Preprocessing

In [107]:
# Data processing and feature extraction
# Example: Extracting features from DNA sequences
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def count_gc_content(sequence):
    print(sequence, sequence.count("G"), sequence.count("C") )
    gc_count = sequence.count("G") + sequence.count("C")
    percent  = gc_count / len(sequence)*100
    print(percent)
    return int(percent)

count_gc_content_udf = udf(count_gc_content, IntegerType())

genomic_data_df =  genomic_data_df.select(count_gc_content_udf(genomic_data_df["sequence"]))

In [108]:
genomic_data_df.printSchema()

root
 |-- count_gc_content(sequence): integer (nullable = true)



In [115]:
genomic_data_df.agg({'count_gc_content(sequence)': 'avg'}).show()

+-------------------------------+
|avg(count_gc_content(sequence))|
+-------------------------------+
|             37.558411214953274|
+-------------------------------+



TGCTTATGAAAATTTTAATCTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT 8 17
35.714285714285715
AAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAAT 14 15
41.42857142857143
AACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCC 15 18
47.14285714285714
GTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTT 22 15
52.85714285714286
GTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCG 16 20
51.42857142857142
TACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTG 21 14
50.0
TGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCG 16 14
42.857142857142854
GATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACG 20 15
50.0
GTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGT 22 16
54.285714285714285
TCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGAC 18 14
45.714285714285715
TTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTG 16