In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .config("spark.executor.instances", 2) \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "1") \
    .config("spark.driver.cores", "1") \
    .config("spark.driver.memory", "6g") \
    .config("spark.driver.maxResultSize", "3g") \
    .config("spark.deploy.defaultCores", 3) \
    .config("spark.dynamicAllocation.enabled", True) \
    .appName("MySparkApp") \
    .getOrCreate()

In [2]:
%%time
l = [('Alice', 1)]

spark.createDataFrame(l, ['name', 'age']).collect()

CPU times: user 237 ms, sys: 69.2 ms, total: 306 ms
Wall time: 9.78 s


[Row(name='Alice', age=1)]

In [13]:
%%time
data = [("{\"key1\": \"value1\", \"key2\": \"value2\"}",)]
df = spark.createDataFrame(data, ["json_field"])

# Define the schema for the JSON struct
json_schema = "struct<key1: string, key2: string>"

# Convert JSON string to struct
df = df.withColumn("struct_field", F.from_json(df["json_field"], json_schema))

# Show the DataFrame
df.show(truncate=False)

+------------------------------------+----------------+
|json_field                          |struct_field    |
+------------------------------------+----------------+
|{"key1": "value1", "key2": "value2"}|{value1, value2}|
+------------------------------------+----------------+



In [14]:
# spark.stop()

# ----

In [2]:
import datetime

# Sample data
data = [(['953', '7392770', None, None]),
        (['917', '4037011', None, None]),
        (['916', '5411266', None, None]),
        (['927', '2400412', None, None]),
        (['495', '9504813', None, None]),
        (['499', '7243730', None, '2024-02-12'])]

# Create a DataFrame
df = spark.createDataFrame(data, ['phonelist'])

# Define a function to find the phone number with the maximum verification date
def find_max_verified_phone(phones):
    max_date = datetime.datetime.min
    max_phone = None
    for phone in phones:
        verification_date = phone['_verificationdate']
        if verification_date is not None and verification_date != 'null':
            date = datetime.datetime.strptime(verification_date, '%Y-%m-%d')
            if date > max_date:
                max_date = date
                max_phone = phone['_number']
        elif max_phone is None:
            max_phone = phone['_number']
    return max_phone

# Define a UDF
find_max_verified_phone_udf = udf(find_max_verified_phone, StringType())

# Apply the UDF to the DataFrame
df = df.withColumn('max_verified_phone', find_max_verified_phone_udf('phonelist.phone'))

# Show the result
df.show(truncate=False)

# Stop the SparkSession

ValueError: Some of types cannot be determined after inferring