# Find the number of emails per domain

In [1]:
import os
# Work from the project directory. A raw string keeps the Windows backslash
# literal: "\p" is not a valid escape sequence and triggers a warning on
# recent Python versions when written in a normal string.
os.chdir(r"H:\pyspark_advanced-coding_interview")
# Echo the working directory so the notebook output confirms the change.
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Spark SQL

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("EmailDomainCount")
    .getOrCreate()
)

# Two nullable columns: an integer user id and the e-mail address string.
schema = (
    StructType()
    .add("UserID", IntegerType(), True)
    .add("Email", StringType(), True)
)

# Sample e-mail addresses; each UserID is the 1-based position in this list.
emails = [
    "alice@gmail.com",
    "bob@yahoo.com",
    "charlie@gmail.com",
    "david@hotmail.com",
    "eve@gmail.com",
    "frank@yahoo.com",
    "grace@outlook.com",
    "heidi@gmail.com",
    "ivan@hotmail.com",
    "judy@outlook.com",
    "kevin@gmail.com",
    "laura@company.com",
    "mallory@company.com",
    "oscar@gmail.com",
    "peggy@yahoo.com",
]
data = list(enumerate(emails, start=1))

# Build the DataFrame and print every row without truncating long values.
df = spark.createDataFrame(data, schema=schema)
df.show(truncate=False)


+------+-------------------+
|UserID|Email              |
+------+-------------------+
|1     |alice@gmail.com    |
|2     |bob@yahoo.com      |
|3     |charlie@gmail.com  |
|4     |david@hotmail.com  |
|5     |eve@gmail.com      |
|6     |frank@yahoo.com    |
|7     |grace@outlook.com  |
|8     |heidi@gmail.com    |
|9     |ivan@hotmail.com   |
|10    |judy@outlook.com   |
|11    |kevin@gmail.com    |
|12    |laura@company.com  |
|13    |mallory@company.com|
|14    |oscar@gmail.com    |
|15    |peggy@yahoo.com    |
+------+-------------------+



In [2]:
# Expose the DataFrame to Spark SQL as a temporary view called "Users",
# replacing any view already registered under that name.
df.createOrReplaceTempView(name="Users")


In [4]:
# Method 1: extract the domain with a regular expression.
# The capture group in '@(.*)' holds everything that follows the first '@'.
method1_sql = """
SELECT 
    REGEXP_EXTRACT(Email, '@(.*)', 1) AS Domain, 
    COUNT(*) AS EmailCount
FROM Users
GROUP BY Domain
ORDER BY EmailCount DESC
"""
query1 = spark.sql(method1_sql)
query1.show(truncate=False)



# Method 2: split the address on '@' and keep element 1, i.e. the piece
# that follows the first '@'.
method2_sql = """
SELECT 
    SPLIT(Email, '@')[1] AS Domain, 
    COUNT(*) AS EmailCount
FROM Users
GROUP BY Domain
ORDER BY EmailCount DESC
"""
query2 = spark.sql(method2_sql)
query2.show(truncate=False)

# Execute Method 3 Query
# SUBSTRING_INDEX with a count of -1 returns everything to the right of the
# last '@', which is the domain part of the address.
# No trailing ';' in the SQL text: spark.sql() takes a single statement and
# some Spark versions reject a statement terminator.
query3 = spark.sql("""
SELECT
    SUBSTRING_INDEX(Email, '@', -1) AS Domain,
    COUNT(*) AS EmailCount
FROM Users
GROUP BY Domain
ORDER BY EmailCount DESC
""")
query3.show()

# Method 4: bucket addresses with a CASE expression.
# Addresses ending in one of the three listed domains keep that domain as
# their category; every other address is counted under 'Other'.
method4_sql = """
SELECT 
    CASE 
        WHEN Email LIKE '%@gmail.com' THEN 'gmail.com'
        WHEN Email LIKE '%@yahoo.com' THEN 'yahoo.com'
        WHEN Email LIKE '%@hotmail.com' THEN 'hotmail.com'
        ELSE 'Other'
    END AS DomainCategory, 
    COUNT(*) AS EmailCount
FROM Users
GROUP BY DomainCategory
ORDER BY EmailCount DESC
"""
query4 = spark.sql(method4_sql)
query4.show(truncate=False)


+-----------+----------+
|Domain     |EmailCount|
+-----------+----------+
|gmail.com  |6         |
|yahoo.com  |3         |
|hotmail.com|2         |
|outlook.com|2         |
|company.com|2         |
+-----------+----------+

+-----------+----------+
|Domain     |EmailCount|
+-----------+----------+
|gmail.com  |6         |
|yahoo.com  |3         |
|hotmail.com|2         |
|outlook.com|2         |
|company.com|2         |
+-----------+----------+

+-----------+----------+
|     Domain|EmailCount|
+-----------+----------+
|  gmail.com|         6|
|  yahoo.com|         3|
|hotmail.com|         2|
|outlook.com|         2|
|company.com|         2|
+-----------+----------+

+--------------+----------+
|DomainCategory|EmailCount|
+--------------+----------+
|gmail.com     |6         |
|Other         |4         |
|yahoo.com     |3         |
|hotmail.com   |2         |
+--------------+----------+



# PySpark

In [9]:
# Display the source DataFrame again, using show()'s default settings.
df.show()

+------+-------------------+
|UserID|              Email|
+------+-------------------+
|     1|    alice@gmail.com|
|     2|      bob@yahoo.com|
|     3|  charlie@gmail.com|
|     4|  david@hotmail.com|
|     5|      eve@gmail.com|
|     6|    frank@yahoo.com|
|     7|  grace@outlook.com|
|     8|    heidi@gmail.com|
|     9|   ivan@hotmail.com|
|    10|   judy@outlook.com|
|    11|    kevin@gmail.com|
|    12|  laura@company.com|
|    13|mallory@company.com|
|    14|    oscar@gmail.com|
|    15|    peggy@yahoo.com|
+------+-------------------+



In [5]:
from pyspark.sql.functions import regexp_extract

# Regex approach: capture everything after the first '@' as the domain,
# count the rows per domain, and list the most frequent domains first.
regex_domain = regexp_extract("Email", "@(.*)", 1)
df_domain_count = (
    df.withColumn("Domain", regex_domain)
    .groupBy("Domain")
    .count()
    .withColumnRenamed("count", "EmailCount")
    .sort("EmailCount", ascending=False)
)

df_domain_count.show(truncate=False)


+-----------+----------+
|Domain     |EmailCount|
+-----------+----------+
|gmail.com  |6         |
|yahoo.com  |3         |
|hotmail.com|2         |
|outlook.com|2         |
|company.com|2         |
+-----------+----------+



In [6]:
from pyspark.sql.functions import split

# Split approach: break the address on '@' and take element 1 as the domain,
# then count the rows per domain, most frequent first.
split_domain = split("Email", "@").getItem(1)
df_split_domain = (
    df.withColumn("Domain", split_domain)
    .groupBy("Domain")
    .count()
    .withColumnRenamed("count", "EmailCount")
    .sort("EmailCount", ascending=False)
)

df_split_domain.show(truncate=False)


+-----------+----------+
|Domain     |EmailCount|
+-----------+----------+
|gmail.com  |6         |
|yahoo.com  |3         |
|hotmail.com|2         |
|outlook.com|2         |
|company.com|2         |
+-----------+----------+



In [7]:
from pyspark.sql.functions import expr

# SQL-expression approach: the same split-and-index logic, written as a
# Spark SQL snippet and evaluated through expr().
sql_domain = expr("split(Email, '@')[1]")
df_expr_domain = (
    df.withColumn("Domain", sql_domain)
    .groupBy("Domain")
    .count()
    .withColumnRenamed("count", "EmailCount")
    .sort("EmailCount", ascending=False)
)

df_expr_domain.show(truncate=False)


+-----------+----------+
|Domain     |EmailCount|
+-----------+----------+
|gmail.com  |6         |
|yahoo.com  |3         |
|hotmail.com|2         |
|outlook.com|2         |
|company.com|2         |
+-----------+----------+



In [8]:
def _email_domain(email):
    """Return the piece of *email* after the first '@', or None.

    None is returned for a null address and for an address without an '@',
    so one bad row cannot fail the whole job with AttributeError/IndexError.
    For well-formed addresses the result equals ``email.split('@')[1]``.
    """
    if email is None:
        return None
    pieces = email.split("@")
    return pieces[1] if len(pieces) > 1 else None


# Convert DataFrame to RDD and manually extract domains:
# emit (domain, 1) per row, sum the ones per domain, then sort by the count
# in descending order (the order among equal counts is not specified).
rdd_domain_count = (
    df.rdd
    .map(lambda row: (_email_domain(row.Email), 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# Convert back to DataFrame for display
df_rdd_result = spark.createDataFrame(rdd_domain_count, ["Domain", "EmailCount"])
df_rdd_result.show(truncate=False)


+-----------+----------+
|Domain     |EmailCount|
+-----------+----------+
|gmail.com  |6         |
|yahoo.com  |3         |
|outlook.com|2         |
|hotmail.com|2         |
|company.com|2         |
+-----------+----------+

