In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the session for this notebook: getOrCreate() hands back an existing
# session when there is one, otherwise it starts one under this app name.
spark = (
    SparkSession.builder
    .appName("Spark_Interview_Questions")
    .getOrCreate()
)

In [9]:
# Sample student records as (id, name, phone_no) tuples; every value is a string.
# A clean record has an all-digit id and a phone number of exactly ten digits.
students = [
    ("101", "Alice", "1234567890"),     # clean
    ("102", "Bob", "0987654321"),       # clean
    ("A103", "Charlie", "12345ABCDE"),  # bad id (letter prefix); bad phone (contains letters)
    ("104", "David", "12345678"),       # bad phone (only 8 digits)
    ("105B", "Eve", "12345678901"),     # bad id (letter suffix); bad phone (11 digits)
    ("106", "Frank", "987654321a"),     # bad phone (trailing letter)
    ("107", "Grace", "1234567890"),     # clean
    ("X108", "Hank", "abcdefghij"),     # bad id (letter prefix); bad phone (letters only)
    ("109", "Ivy", "2468135790"),       # clean
    ("110", "Jack", "1112223334"),      # clean
]

In [6]:
# DDL-style schema string declaring all three columns as strings.
# NOTE(review): the createDataFrame call visible in this notebook passes
# `columns` instead, so this value appears to be unused — confirm.
df1_schema = (
    "id string, "
    "name string, "
    "phone_no string"
)

In [10]:
# Column names, in the same order as the fields of each tuple in `students`.
columns = ["id", "name", "phone_no"]

In [11]:
# Build the raw DataFrame. `columns` supplies only the names, so Spark infers
# each column's type from the tuple values.
df1 = spark.createDataFrame(data=students, schema=columns)

In [12]:
# Display the raw rows, including the deliberately malformed ones.
df1.show()

+----+-------+-----------+
|  id|   name|   phone_no|
+----+-------+-----------+
| 101|  Alice| 1234567890|
| 102|    Bob| 0987654321|
|A103|Charlie| 12345ABCDE|
| 104|  David|   12345678|
|105B|    Eve|12345678901|
| 106|  Frank| 987654321a|
| 107|  Grace| 1234567890|
|X108|   Hank| abcdefghij|
| 109|    Ivy| 2468135790|
| 110|   Jack| 1112223334|
+----+-------+-----------+



#### Data cleansing step 1: Remove rows with an invalid "id" value

In [15]:
from pyspark.sql.functions import col

In [16]:
# Keep only rows whose id consists solely of digits.
# The quantifier is `+`, not `*`: `^[0-9]*$` also matches the empty string,
# which would let a blank id through as "valid".
df2 = df1.filter(col("id").rlike("^[0-9]+$"))

In [24]:
# Inspect the rows that passed the id filter.
df2.show()

+---+-----+----------+
| id| name|  phone_no|
+---+-----+----------+
|101|Alice|1234567890|
|102|  Bob|0987654321|
|104|David|  12345678|
|106|Frank|987654321a|
|107|Grace|1234567890|
|109|  Ivy|2468135790|
|110| Jack|1112223334|
+---+-----+----------+



In [25]:
# Number of rows remaining after the id filter.
df2.count()

7

In [28]:
# Data cleansing step 2: from the id-filtered rows, keep only those whose
# phone_no is exactly ten digits (the anchors reject shorter, longer, or
# mixed-character values).
has_ten_digit_phone = col("phone_no").rlike("^[0-9]{10}$")
df2.filter(has_ten_digit_phone).show()

+---+-----+----------+
| id| name|  phone_no|
+---+-----+----------+
|101|Alice|1234567890|
|102|  Bob|0987654321|
|107|Grace|1234567890|
|109|  Ivy|2468135790|
|110| Jack|1112223334|
+---+-----+----------+



#### Alternatively, apply both filters (id and phone_no) in a single step

In [31]:
# Single-pass version: a row is kept only when both checks hold.
#   id       -> one or more digits (`+` rather than `*`, so an empty id is rejected)
#   phone_no -> exactly ten digits
df1.filter(
    col("id").rlike("^[0-9]+$")
    & col("phone_no").rlike("^[0-9]{10}$")
).show()

+---+-----+----------+
| id| name|  phone_no|
+---+-----+----------+
|101|Alice|1234567890|
|102|  Bob|0987654321|
|107|Grace|1234567890|
|109|  Ivy|2468135790|
|110| Jack|1112223334|
+---+-----+----------+

