In [1]:
import os

# Raw string: "\p" is not a valid escape sequence, so the plain literal only
# kept its backslash by accident and triggers a SyntaxWarning on Python 3.12+.
# NOTE(review): this absolute path is machine-specific — adjust it (or drop the
# chdir) before running the notebook on another machine.
os.chdir(r"H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Settings passed to the session builder, applied in the order listed
spark_settings = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.cores.max": "12",
    "spark.sql.shuffle.partitions": "28",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

# Start from a named builder and attach every setting to it
session_builder = SparkSession.builder.appName("OptimizedLocalSpark")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Create (or reuse) the session and keep a handle on its SparkContext
spark = session_builder.getOrCreate()
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


# Find a value in multiple columns

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or create) the Spark session used by this example
spark = SparkSession.builder.appName("FindValueInColumns").getOrCreate()

# Column names for the employee records
columns = ["ID", "Name", "Age", "Job"]

# Sample employee records: one tuple per row, fields in the same order as `columns`
data = [
    (1, "Alice", 30, "Developer"),
    (2, "Bob", 25, "Analyst"),
    (3, "Charlie", 35, "Developer"),
    (4, "David", 28, "Manager"),
    (5, "Eve", 40, "Developer"),
]

# Build the DataFrame and display it
df = spark.createDataFrame(data, schema=columns)
df.show()

# Make the DataFrame queryable from Spark SQL as "employee_table"
df.createOrReplaceTempView("employee_table")


+---+-------+---+---------+
| ID|   Name|Age|      Job|
+---+-------+---+---------+
|  1|  Alice| 30|Developer|
|  2|    Bob| 25|  Analyst|
|  3|Charlie| 35|Developer|
|  4|  David| 28|  Manager|
|  5|    Eve| 40|Developer|
+---+-------+---+---------+



In [3]:
# Keep the rows where Name, Age (compared as a string) or Job equals the target value
value_to_find = "Developer"

# One boolean column expression per inspected column
name_matches = col("Name") == value_to_find
age_matches = col("Age").cast("string") == value_to_find
job_matches = col("Job") == value_to_find

# A row is kept when at least one of the expressions is true
filtered_df = df.filter(name_matches | age_matches | job_matches)
filtered_df.show()


+---+-------+---+---------+
| ID|   Name|Age|      Job|
+---+-------+---+---------+
|  1|  Alice| 30|Developer|
|  3|Charlie| 35|Developer|
|  5|    Eve| 40|Developer|
+---+-------+---+---------+



In [4]:
# Using Spark SQL to find rows where Name, Age (as a string) or Job equals a specific value.
#
# The value is bound through the named parameter marker `:value_to_find` instead of
# being pasted into the SQL text with an f-string, so a quote (or SQL fragment)
# inside the value cannot break or alter the query.
# NOTE(review): passing plain Python values through `args` needs a recent Spark
# release (3.5+; 3.4.0 expected SQL-literal strings) — confirm the installed version.
value_to_find = "Developer"
query = """
SELECT *
FROM employee_table
WHERE Name = :value_to_find OR
      CAST(Age AS STRING) = :value_to_find OR
      Job = :value_to_find
"""

# The same named marker may be referenced several times; it is bound once here
res = spark.sql(query, args={"value_to_find": value_to_find})
res.show()


+---+-------+---+---------+
| ID|   Name|Age|      Job|
+---+-------+---+---------+
|  1|  Alice| 30|Developer|
|  3|Charlie| 35|Developer|
|  5|    Eve| 40|Developer|
+---+-------+---+---------+



In [5]:
from pyspark.sql.functions import when

# Add a "Match" column ("Yes"/"No") flagging rows where Name, Age (as a string)
# or Job equals the target value, then show only the flagged rows
value_to_find = "Developer"

# True when at least one of the inspected columns equals the target value
matches_any_column = (
    (col("Name") == value_to_find)
    | (col("Age").cast("string") == value_to_find)
    | (col("Job") == value_to_find)
)

# Translate the boolean test into the "Yes"/"No" label stored in the new column
match_flag = when(matches_any_column, "Yes").otherwise("No")
df_with_flag = df.withColumn("Match", match_flag)

df_with_flag.filter(col("Match") == "Yes").show()


+---+-------+---+---------+-----+
| ID|   Name|Age|      Job|Match|
+---+-------+---+---------+-----+
|  1|  Alice| 30|Developer|  Yes|
|  3|Charlie| 35|Developer|  Yes|
|  5|    Eve| 40|Developer|  Yes|
+---+-------+---+---------+-----+



In [6]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session
spark = SparkSession.builder.appName("FindValueInColumns").getOrCreate()

# Names of the five text columns
columns = ["col1", "col2", "col3", "col4", "col5"]

# Sample data from the image: one tuple per row, one string per column
data = [
    ("Demo scheduled", "Completed demo", "Training postponed", "Cancelled", "Demo planned"),
    ("Completed training", "Completed demo", "Demo postponed", "Free day", "Cancelled"),
]

# Build the DataFrame
df1 = spark.createDataFrame(data, schema=columns)

# Display it without truncating long cell values
df1.show(truncate=False)


+------------------+--------------+------------------+---------+------------+
|col1              |col2          |col3              |col4     |col5        |
+------------------+--------------+------------------+---------+------------+
|Demo scheduled    |Completed demo|Training postponed|Cancelled|Demo planned|
|Completed training|Completed demo|Demo postponed    |Free day |Cancelled   |
+------------------+--------------+------------------+---------+------------+



In [8]:
# Make df1 queryable from Spark SQL as "events_table"
df1.createOrReplaceTempView("events_table")

# Query selecting the rows where at least one of col1..col5 is exactly 'Cancelled'
cancelled_query = """
SELECT *
FROM events_table
WHERE col1 = 'Cancelled'
   OR col2 = 'Cancelled'
   OR col3 = 'Cancelled'
   OR col4 = 'Cancelled'
   OR col5 = 'Cancelled'
"""
result_sql = spark.sql(cancelled_query)

# Show the results without truncating long cell values
result_sql.show(truncate=False)


+------------------+--------------+------------------+---------+------------+
|col1              |col2          |col3              |col4     |col5        |
+------------------+--------------+------------------+---------+------------+
|Demo scheduled    |Completed demo|Training postponed|Cancelled|Demo planned|
|Completed training|Completed demo|Demo postponed    |Free day |Cancelled   |
+------------------+--------------+------------------+---------+------------+



In [10]:
from pyspark.sql.functions import col

# Use the PySpark DataFrame API to find rows where any of col1..col5 equals 'Cancelled'
target_value = "Cancelled"
column_names = ["col1", "col2", "col3", "col4", "col5"]

# OR the per-column equality tests together, left to right
any_column_matches = col(column_names[0]) == target_value
for column_name in column_names[1:]:
    any_column_matches = any_column_matches | (col(column_name) == target_value)

result_df = df1.filter(any_column_matches)

# Show the results without truncating long cell values
result_df.show(truncate=False)


+------------------+--------------+------------------+---------+------------+
|col1              |col2          |col3              |col4     |col5        |
+------------------+--------------+------------------+---------+------------+
|Demo scheduled    |Completed demo|Training postponed|Cancelled|Demo planned|
|Completed training|Completed demo|Demo postponed    |Free day |Cancelled   |
+------------------+--------------+------------------+---------+------------+

