In [8]:
import pandas as pd
from pyspark.sql import SparkSession
import os

### Augment data for benchmarking purposes
The provided dataset is too small for a meaningful benchmark. We resample the existing data into millions of rows so that the difference in reading and processing time when using Spark becomes clearly visible.

In [9]:
# Load the original (small) credit-card dataset into pandas and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/creditcard.csv")
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Sample the data (with replacement) up to 20M rows

In [10]:
# Data-generation step, currently disabled (commented out): it draws
# 20,000,000 rows from `df` with replacement and writes them to a CSV
# without the index column.
# NOTE(review): no random_state is passed, so re-running would produce a
# different sample each time.
# NOTE(review): the Spark cell reads creditfraud_20M.csv from an s3a:// bucket;
# presumably the file written here was uploaded there separately - confirm.
# df_20M = df.sample(20000000, replace=True)
# df_20M.to_csv("../data/creditfraud_20M.csv", index=False)

### Spark Reading CSV
Build a Spark session and use it to read a CSV file.

In [11]:
# Set the KUBECONFIG environment variable for this kernel.
# NOTE(review): presumably read by the Spark Kubernetes client when the session
# is created - confirm. The path is absolute and user-specific, so it will need
# adjusting on any other machine.
%env KUBECONFIG=/home/agung/.kube/config

env: KUBECONFIG=/home/agung/.kube/config


In [13]:
# --- Connection settings ----------------------------------------------------
# The defaults reproduce the values this notebook was originally run with;
# override them through the environment to target another cluster/object store.
K8S_MASTER = os.environ.get("SPARK_K8S_MASTER", "k8s://https://127.0.0.1:35069")
S3A_ENDPOINT = os.environ.get("S3A_ENDPOINT", "http://10.44.101.211:39000/")
CSV_20M_URI = "s3a://sparkbucket/data/creditfraud_20M.csv"

# Credentials are taken from the environment instead of being committed in the
# notebook. A missing variable raises KeyError naming the variable to set.
S3A_ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
S3A_SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

# Build (or reuse) the Spark session that runs its executors on Kubernetes.
# Hadoop/S3A options are passed as `spark.hadoop.*` properties on the builder,
# which is the documented way to set Hadoop configuration, rather than through
# the private `spark._jsc` handle after the session already exists.
# NOTE(review): the hadoop-aws version presumably has to match the Hadoop
# version shipped in the container image - confirm.
spark = (
    SparkSession.builder
    .appName("SparkK8sRead")
    .master(K8S_MASTER)
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.executor.instances", 5)
    .config("spark.kubernetes.container.image", "spark:kube-spark-hadoop-aws")
    .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.hadoop.fs.s3a.access.key", S3A_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3A_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Path-style access (endpoint/bucket/key) instead of virtual-host style.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.endpoint", S3A_ENDPOINT)
    .getOrCreate()
)

# header=True makes Spark take the column names from the first CSV line instead
# of generating _c0, _c1, ... No schema is inferred, so every column is a string.
df_spark_20m = spark.read.csv(CSV_20M_URI, header=True)
df_spark_20m.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------+-----+
|     _c0|                _c1|                _c2|                _c3|               _c4|                _c5|               _c6|                _c7|                _c8|                _c9|               _c10|              _c11|               _c12|               _c13|              _c14|               _c15|               _c16|              _c17|               _c18|               _c19|               _c20|               _c

                                                                                

In [17]:
# Preview ten rows, restricted to the five leading columns so the table
# stays narrow enough to read.
df_spark_20m.select(df_spark_20m.columns[0:5]).show(n=10)

+--------+-------------------+------------------+-------------------+------------------+
|     _c0|                _c1|               _c2|                _c3|               _c4|
+--------+-------------------+------------------+-------------------+------------------+
|    Time|                 V1|                V2|                 V3|                V4|
|108921.0|-0.0940021119790702|  0.85221037333301|  0.744657875085305|-0.139024074859472|
|  7593.0| -0.460249752945011|  1.09588431287446|   1.95487766837381|  0.51713346164733|
|113906.0| 0.0915385249907437| 0.707589688676519|  -0.28127915377564| -1.02397334535992|
| 86140.0| -0.722942813018065| 0.374673523740068|   1.88960258084084|  1.28355523629354|
| 43446.0|  -1.24450229819724| 0.574682117134877|   1.64583397567348|-0.842825968023768|
|153702.0|   1.91581246720304| -1.03033353332467|  -1.17853860385592|-0.446221227936367|
|136247.0|  -1.99123091763043| 0.487900302734793|-0.0856821365617053|  -1.9935523663715|
|109465.0| -0.3039668

In [18]:
# Dimensions of the Spark DataFrame: count() runs a Spark job to count the
# rows, while the column total is the length of the DataFrame's column list.
num_rows, num_columns = df_spark_20m.count(), len(df_spark_20m.columns)

# Report both figures, one per line.
print(
    f"Number of rows: {num_rows}",
    f"Number of columns: {num_columns}",
    sep="\n",
)



Number of rows: 20000001
Number of columns: 31


                                                                                

In [7]:
# Stop the Spark session now that the benchmark cells are done; the log line
# printed below shows the Kubernetes client being closed as part of this.
spark.stop()

24/06/25 08:15:10 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
