# Identify Overlapping Date Ranges

In [1]:
import os
os.getcwd()
# Raw string: in a normal literal "\p" is an invalid escape sequence, which
# newer Python versions warn about. The resulting path value is unchanged.
os.chdir(r"H:\pyspark_advanced-coding_interview")
# Last expression of the cell, so the new working directory is displayed.
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# PySpark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType
from datetime import datetime

# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("OverlappingDateRanges")
    .getOrCreate()
)

# Schema: five nullable string columns followed by two nullable date columns
string_columns = ["FirstName", "LastName", "Title", "DepartmentName", "Phone"]
date_columns = ["StartDate", "EndDate"]
schema = StructType(
    [StructField(name, StringType(), True) for name in string_columns]
    + [StructField(name, DateType(), True) for name in date_columns]
)

# Sample rows (transcribed from the reference image); an EndDate of None
# is stored as NULL in the resulting DataFrame
data = [
    ("Guy", "Gilbert", "Production Technician - WC60", "Production", "320-555-0195", datetime(2006, 1, 6), None),
    ("Kevin", "Brown", "Marketing Assistant", "Marketing", "150-555-0189", datetime(2006, 8, 26), None),
    ("Roberto", "Tamburello", "Engineering Manager", "Engineering", "212-555-0187", datetime(2007, 6, 11), None),
    ("Rob", "Walters", "Senior Tool Designer", "Tool Design", "612-555-0149", datetime(2007, 7, 5), datetime(2009, 12, 28)),
    ("Rob", "Walters", "Senior Tool Designer", "Tool Design", "412-555-6754", datetime(2007, 7, 6), datetime(2009, 12, 27)),
    ("Thierry", "D'Hers", "Tool Designer", "Tool Design", "168-555-0183", datetime(2007, 7, 13), None),
    ("David", "Bradley", "Marketing Manager", "Marketing", "412-555-1234", datetime(2007, 7, 20), datetime(2009, 2, 11)),
    ("David", "Bradley", "Marketing Manager", "Marketing", "913-555-0172", datetime(2009, 2, 12), None),
    ("JoLynn", "Dobney", "Production Supervisor - WC60", "Production", "903-555-0143", datetime(2007, 7, 26), None),
    ("Ruth", "Ellerbrock", "Production Technician - WC10", "Production", "145-555-0130", datetime(2007, 8, 6), None),
    ("Gail", "Erickson", "Design Engineer", "Engineering", "849-555-0513", datetime(2007, 8, 6), None),
]

# Build the DataFrame from the rows and the explicit schema, then preview it
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)


+---------+----------+----------------------------+--------------+------------+----------+----------+
|FirstName|LastName  |Title                       |DepartmentName|Phone       |StartDate |EndDate   |
+---------+----------+----------------------------+--------------+------------+----------+----------+
|Guy      |Gilbert   |Production Technician - WC60|Production    |320-555-0195|2006-01-06|null      |
|Kevin    |Brown     |Marketing Assistant         |Marketing     |150-555-0189|2006-08-26|null      |
|Roberto  |Tamburello|Engineering Manager         |Engineering   |212-555-0187|2007-06-11|null      |
|Rob      |Walters   |Senior Tool Designer        |Tool Design   |612-555-0149|2007-07-05|2009-12-28|
|Rob      |Walters   |Senior Tool Designer        |Tool Design   |412-555-6754|2007-07-06|2009-12-27|
|Thierry  |D'Hers    |Tool Designer               |Tool Design   |168-555-0183|2007-07-13|null      |
|David    |Bradley   |Marketing Manager           |Marketing     |412-555-1234|200

In [3]:
# Register the DataFrame as a temporary view named "Employees" so that it can
# be referenced by name from spark.sql() queries (replaces any existing view
# with the same name).
df.createOrReplaceTempView("Employees")


In [9]:
from pyspark.sql.functions import col

# Self-join to find pairs of rows for the same person whose date ranges overlap.
#
# Two ranges overlap when each one starts before the other one ends. A NULL
# EndDate means the range has not ended yet, so it is treated as open-ended:
# a plain `StartDate < EndDate` comparison would evaluate to NULL and silently
# drop every row without an end date from the result.
a_is_open = col("a.EndDate").isNull()
b_is_open = col("b.EndDate").isNull()

same_person = (
    (col("a.FirstName") == col("b.FirstName")) &
    (col("a.LastName") == col("b.LastName"))
)
ranges_overlap = (
    (b_is_open | (col("a.StartDate") < col("b.EndDate"))) &
    (a_is_open | (col("b.StartDate") < col("a.EndDate")))
)
# Rows are told apart by phone number so that a row is not paired with itself.
different_rows = col("a.Phone") != col("b.Phone")

overlapping_df = (
    df.alias("a")
    .join(df.alias("b"), same_person & ranges_overlap & different_rows, "inner")
    .select("a.FirstName", "a.LastName", "a.StartDate", "a.EndDate", "b.StartDate", "b.EndDate")
)

# Each overlapping pair appears twice: once as (a, b) and once as (b, a).
overlapping_df.show(truncate=False)



+---------+--------+----------+----------+----------+----------+
|FirstName|LastName|StartDate |EndDate   |StartDate |EndDate   |
+---------+--------+----------+----------+----------+----------+
|Rob      |Walters |2007-07-05|2009-12-28|2007-07-06|2009-12-27|
|Rob      |Walters |2007-07-06|2009-12-27|2007-07-05|2009-12-28|
+---------+--------+----------+----------+----------+----------+



# Spark SQL

In [8]:
# SQL query to identify overlapping date ranges for the same person.
#
# Two ranges overlap when each one starts before the other one ends. A NULL
# EndDate means the range has not ended yet, so it is treated as open-ended;
# a plain `StartDate < EndDate` comparison against NULL would evaluate to NULL
# and silently exclude those rows. Rows are told apart by phone number so that
# a row is not paired with itself.
sql_query = """
SELECT a.FirstName, a.LastName, a.StartDate, a.EndDate, b.StartDate, b.EndDate
FROM Employees a
JOIN Employees b
ON a.FirstName = b.FirstName
AND a.LastName = b.LastName
AND (b.EndDate IS NULL OR a.StartDate < b.EndDate)
AND (a.EndDate IS NULL OR b.StartDate < a.EndDate)
AND a.Phone != b.Phone
"""

# Execute the query against the "Employees" temporary view and show the pairs
overlapping_sql_df = spark.sql(sql_query)
overlapping_sql_df.show(truncate=False)



+---------+--------+----------+----------+----------+----------+
|FirstName|LastName|StartDate |EndDate   |StartDate |EndDate   |
+---------+--------+----------+----------+----------+----------+
|Rob      |Walters |2007-07-05|2009-12-28|2007-07-06|2009-12-27|
|Rob      |Walters |2007-07-06|2009-12-27|2007-07-05|2009-12-28|
+---------+--------+----------+----------+----------+----------+



# Python

In [7]:
import pandas as pd

# Sample data - the same rows as the Spark DataFrame, with dates given as ISO strings
data = [
    ("Guy", "Gilbert", "Production Technician - WC60", "Production", "320-555-0195", "2006-01-06", None),
    ("Kevin", "Brown", "Marketing Assistant", "Marketing", "150-555-0189", "2006-08-26", None),
    ("Roberto", "Tamburello", "Engineering Manager", "Engineering", "212-555-0187", "2007-06-11", None),
    ("Rob", "Walters", "Senior Tool Designer", "Tool Design", "612-555-0149", "2007-07-05", "2009-12-28"),
    ("Rob", "Walters", "Senior Tool Designer", "Tool Design", "412-555-6754", "2007-07-06", "2009-12-27"),
    ("Thierry", "D'Hers", "Tool Designer", "Tool Design", "168-555-0183", "2007-07-13", None),
    ("David", "Bradley", "Marketing Manager", "Marketing", "412-555-1234", "2007-07-20", "2009-02-11"),
    ("David", "Bradley", "Marketing Manager", "Marketing", "913-555-0172", "2009-02-12", None),
    ("JoLynn", "Dobney", "Production Supervisor - WC60", "Production", "903-555-0143", "2007-07-26", None),
    ("Ruth", "Ellerbrock", "Production Technician - WC10", "Production", "145-555-0130", "2007-08-06", None),
    ("Gail", "Erickson", "Design Engineer", "Engineering", "849-555-0513", "2007-08-06", None)
]

# Create pandas DataFrame and parse the date columns (None becomes NaT)
df_pandas = pd.DataFrame(data, columns=["FirstName", "LastName", "Title", "DepartmentName", "Phone", "StartDate", "EndDate"])
df_pandas["StartDate"] = pd.to_datetime(df_pandas["StartDate"])
df_pandas["EndDate"] = pd.to_datetime(df_pandas["EndDate"])


def find_overlaps(employees):
    """Return every ordered pair of rows for the same person whose date ranges overlap.

    Two ranges overlap when each one starts before the other one ends. A
    missing EndDate (NaT) is treated as an open-ended range; comparing against
    NaT directly is always False and would hide such rows.

    Parameters
    ----------
    employees : pandas.DataFrame
        Must contain the columns FirstName, LastName, StartDate and EndDate,
        with the two date columns already parsed as datetimes.

    Returns
    -------
    pandas.DataFrame
        Columns FirstName, LastName, StartDate1, EndDate1, StartDate2,
        EndDate2. Each overlapping pair appears twice, once in each
        direction, ordered by the position of the first and then the second
        row in ``employees``.
    """
    spans = (
        employees[["FirstName", "LastName", "StartDate", "EndDate"]]
        .reset_index(drop=True)
        .assign(_row=range(len(employees)))
    )
    # Pair rows only within the same person instead of scanning every row pair.
    pairs = spans.merge(spans, on=["FirstName", "LastName"], suffixes=("1", "2"))

    first_is_open = pairs["EndDate1"].isna()
    second_is_open = pairs["EndDate2"].isna()
    overlapping = (
        (pairs["_row1"] != pairs["_row2"])  # never pair a row with itself
        & (second_is_open | (pairs["StartDate1"] < pairs["EndDate2"]))
        & (first_is_open | (pairs["StartDate2"] < pairs["EndDate1"]))
    )

    result_columns = ["FirstName", "LastName", "StartDate1", "EndDate1", "StartDate2", "EndDate2"]
    return (
        pairs.loc[overlapping]
        .sort_values(["_row1", "_row2"])
        .loc[:, result_columns]
        .reset_index(drop=True)
    )


# Identify overlapping date ranges and display them
overlap_df = find_overlaps(df_pandas)
print(overlap_df)


  FirstName LastName StartDate1   EndDate1 StartDate2   EndDate2
0       Rob  Walters 2007-07-05 2009-12-28 2007-07-06 2009-12-27
1       Rob  Walters 2007-07-06 2009-12-27 2007-07-05 2009-12-28
