In [1]:
# Uncomment to install the pinned RayDP release this notebook was run with.
# %pip targets the kernel's own environment, unlike !pip.
# %pip install raydp==0.1.1

In [1]:
# Uncomment to install the pinned Kubernetes Python client.
# NOTE(review): presumably required by hyperplane.ray_common to manage worker pods — confirm.
# %pip install kubernetes==18.20 --quiet

In [1]:
import os
import ray
import raydp
import pandas as pd
import pyspark

# Report the versions of the key libraries so the run can be reproduced.
for lib_name, lib in (('ray', ray), ('pandas', pd), ('raydp', raydp), ('pyspark', pyspark)):
    print(f'{lib_name} version {lib.__version__}')

ray version 1.2.0
pandas version 1.1.4
raydp version 0.1.1
pyspark version 3.0.3


In [2]:
# Show the installed Java version (PySpark needs a JVM to run).
!java --version

openjdk 11.0.11 2021-04-20
OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.20.04)
OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.20.04, mixed mode, sharing)


### Start the Ray cluster (we are on the head node, so use the defaults)

In [None]:
from hyperplane.ray_common import (
    find_ray_workers,
    initialize_ray_cluster,
    stop_ray_cluster,
)

# Size of the Ray worker pool to request.
num_workers = 2
cpu_core_per_worker = 15
# Allocatable RAM per node type: 110 GB on 16_128 nodes, 12 GB on 16_16 nodes,
# 27 GB on 32_32 nodes.
ram_gb_per_worker = 12

# Launch the workers; the returned handle is needed later to stop the cluster.
ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)

👉 Hyperplane: selecting worker node pool
best pool spec {'pool_env_var': 'DASK_POOL_16_16', 'allocatable_cores': 15.0, 'allocatable_ram': 12.0}


2021-12-14 05:39:50,251	INFO services.py:1270 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Waiting for worker ray-worker-a61f6f56-d770-4dee-a9e0-e4b645a21368...
Waiting for worker ray-worker-a598d2c0-f6fb-4962-964f-21cfc2817443...


### change the logging level of spark


In [7]:
from pyspark import SparkContext

# Reuse the active SparkContext if there is one so this cell can be re-run:
# calling SparkContext() while a context is already running raises an error.
# NOTE(review): this context is created before raydp.init_spark is called —
# confirm that the RayDP session does not end up reusing this local context.
sc = SparkContext.getOrCreate()

# Only log errors. Uses the public API rather than reaching into the private
# sc._jvm handle to get at log4j directly.
sc.setLogLevel("ERROR")


21/09/05 20:31:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### start spark session 

In [8]:
# Start a Spark session named 'example' through RayDP:
# 2 executors, each with 4 cores and 2G of memory.
spark = raydp.init_spark(
    'example',
    num_executors=2,
    executor_cores=4,
    executor_memory='2G',
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/conda/lib/python3.8/site-packages/ray/jars/ray_dist.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/conda/lib/python3.8/site-packages/pyspark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]


2021-09-05 20:31:41 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### read tsv data from s3

In [9]:
# Optional: hand AWS credentials from the environment to the S3A connector.
# NOTE(review): left disabled, presumably because the environment already
# grants S3 access — confirm before deleting this cell.
# hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()
# hadoopConf.set("fs.s3a.access.key", os.environ.get('AWS_ACCESS_KEY_ID'))
# hadoopConf.set("fs.s3a.secret.key", os.environ.get('AWS_SECRET_ACCESS_KEY'))
# hadoopConf.set("fs.s3a.path.style.access", "true")
# hadoopConf.set("fs.s3a.connection.ssl.enabled", "true")

In [10]:
# Load the tab-separated IMDB reviews from S3; the first line holds the column names.
ds = (
    spark.read
    .option('sep', '\t')
    .option('header', True)
    .csv('s3a://d2v-tmp/demo/bach_inference/data/imdb_reviews.tsv')
)

In [11]:
# Preview the loaded rows; long review text is truncated in the printed table.
ds.show()

+-------+---------+--------------------+
|     id|sentiment|              review|
+-------+---------+--------------------+
| 5814_8|        1|With all this stu...|
| 2381_9|        1|"The Classic War ...|
| 7759_3|        0|The film starts w...|
| 3630_4|        0|It must be assume...|
| 9495_8|        1|Superbly trashy a...|
| 8196_8|        1|I dont know why p...|
| 7166_2|        0|This movie could ...|
|10633_1|        0|I watched this vi...|
|  319_1|        0|A friend of mine ...|
|8713_10|        1|<br /><br />This ...|
| 2486_3|        0|What happens when...|
|6811_10|        1|Although I genera...|
|11744_9|        1|"Mr. Harvey Light...|
| 7369_1|        0|I had a feeling t...|
|12081_1|        0|note to George Li...|
| 3561_4|        0|Stephen King adap...|
| 4489_1|        0|`The Matrix' was ...|
| 3951_2|        0|Ulli Lommel's 198...|
|3304_10|        1|This movie is one...|
|9352_10|        1|Most people, espe...|
+-------+---------+--------------------+
only showing top

### do some cleaning 

In [12]:
# Discard every row that contains a null, then report how many rows remain.
ds = ds.na.drop()
ds.count()

                                                                                

25000

In [13]:
from pyspark.sql.functions import col, udf, regexp_replace, isnull

# Strip anything that looks like an HTML tag (e.g. <br />) from the review text
# and keep the result in a new column next to the original.
html_tag_pattern = '<[^>]+>'
review_without_tags = regexp_replace(col('review'), html_tag_pattern, '')
ds = ds.withColumn("review_clean", review_without_tags)
ds.show(5)

+------+---------+--------------------+--------------------+
|    id|sentiment|              review|        review_clean|
+------+---------+--------------------+--------------------+
|5814_8|        1|With all this stu...|With all this stu...|
|2381_9|        1|"The Classic War ...|"The Classic War ...|
|7759_3|        0|The film starts w...|The film starts w...|
|3630_4|        0|It must be assume...|It must be assume...|
|9495_8|        1|Superbly trashy a...|Superbly trashy a...|
+------+---------+--------------------+--------------------+
only showing top 5 rows



### save cleaned data to parquet on s3 

In [14]:
# Best-effort write of the cleaned data to S3. A failure (for example because
# the target already exists from an earlier run) should not abort the notebook,
# but it must be visible: the following step reads this dataset back, and a
# silently skipped write would leave it reading stale or missing data.
output_path = "s3a://d2v-tmp/demo/bach_inference/data/imdb_reviews_clean.parquet"
try:
    ds.write.parquet(output_path)
except Exception as exc:  # not a bare except: KeyboardInterrupt/SystemExit still propagate
    print(f"Skipped writing {output_path}: {exc}")

### read back parquet data with pandas to do downstream tasks

In [16]:
import pandas as pd

# Pull the cleaned parquet dataset from S3 into a pandas DataFrame,
# then show its dimensions and the first two rows.
clean_parquet_uri = "s3://d2v-tmp/demo/bach_inference/data/imdb_reviews_clean.parquet"
df = pd.read_parquet(clean_parquet_uri)
print(df.shape)
df.head(n=2)

(25000, 4)


Unnamed: 0,id,sentiment,review,review_clean
0,5814_8,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin...","""The Classic War of the Worlds"" by Timothy Hin..."


In [17]:
# Stop the RayDP Spark session before deleting the Ray workers, so Spark is not
# left running against a cluster that no longer exists.
raydp.stop_spark()
stop_ray_cluster(ray_cluster)

Deleting ray-worker-a45edba0-5819-4ae3-9da1-4cc4eb9b498a
Deleting ray-worker-2b12e267-54ff-413d-876a-13902e99ee93


In [9]:
# Use this if you lost track of your workers (e.g. the cluster handle is gone).
# NOTE(review): judging by the printed output, it lists each worker's name,
# status and IP — confirm what the returned value contains.
w = find_ray_workers()

ray-worker-3de1410a-96f9-4dcf-880f-3eea1cb607e1	Running	10.0.204.5
ray-worker-fcfa1786-ebe2-4d33-a547-1c0244c2fcbe	Running	10.0.204.6
