# Problem Statement
https://medium.com/data-engineer-things/linkedin-pyspark-interview-question-hard-level-2daa1b7baedf

Find the users who worked at Microsoft and whose very next job was at Google.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Local Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("LinkedInUsers")
    .getOrCreate()
)

# Column names, in the same order as the fields of each sample tuple below.
linkedin_columns = ['user_id', 'company', 'position', 'start_date', 'end_date']

# Sample employment history: one tuple per job held by a user.
# Dates are yyyy-MM-dd strings; end_date is None where no end is recorded.
linkedin_data = [
    (1, 'Microsoft', 'developer', '2020-04-13', '2021-11-01'),
    (1, 'Google', 'developer', '2021-11-01', None),
    (2, 'Google', 'manager', '2021-01-01', '2021-01-11'),
    (2, 'Microsoft', 'manager', '2021-01-11', None),
    (3, 'Microsoft', 'analyst', '2019-03-15', '2020-07-24'),
    (3, 'Amazon', 'analyst', '2020-08-01', '2020-11-01'),
    (3, 'Google', 'senior analyst', '2020-11-01', '2021-03-04'),
    (4, 'Google', 'junior developer', '2018-06-01', '2021-11-01'),
    (4, 'Google', 'senior developer', '2021-11-01', None),
    (5, 'Microsoft', 'manager', '2017-09-26', None),
    (6, 'Google', 'CEO', '2015-10-02', None),
]

# Materialise the sample rows as a DataFrame with the named columns.
linkedin_df = spark.createDataFrame(linkedin_data, linkedin_columns)
# linkedin_df.show()

In [None]:
# Build one row per user whose `jobs` column is that user's job history as an
# array of {company, start_date} structs, ordered by start_date.
#
# Sorting the DataFrame before groupBy does not fix the element order produced
# by collect_list: the aggregation shuffles rows, so the collected order is not
# guaranteed. Instead, collect structs with start_date as the leading field and
# sort the array itself (sort_array compares structs field by field), then
# rebuild each element as {company, start_date} so the `jobs` schema read by
# the later cells is unchanged.
# NOTE: start_date values in the sample data are yyyy-MM-dd strings, so their
# string order matches chronological order.
linked_in_users = (
    linkedin_df
    .groupBy('user_id')
    .agg(
        F.sort_array(
            F.collect_list(F.struct('start_date', 'company'))
        ).alias('jobs_by_start')
    )
    .select(
        'user_id',
        F.expr(
            "transform(jobs_by_start, "
            "j -> named_struct('company', j.company, 'start_date', j.start_date))"
        ).alias('jobs'),
    )
)

# linked_in_users.show(truncate=False)

In [None]:
# Narrow to users whose job history mentions both companies, in any order.
# 'jobs.company' projects the company field out of every struct in `jobs`.
worked_at_microsoft = F.array_contains('jobs.company', 'Microsoft')
worked_at_google = F.array_contains('jobs.company', 'Google')

linked_in_users_ms_google = linked_in_users.filter(
    worked_at_microsoft & worked_at_google
)

# linked_in_users_ms_google.show(truncate=False)

In [7]:
# Keep users who have a Microsoft job immediately followed by a Google job.
# Assumes `jobs` is ordered by start_date.
#
# array_position only reports the FIRST occurrence of a value, so comparing
# first-Microsoft + 1 with first-Google misses users who held either employer
# more than once (e.g. Google -> Microsoft -> Google, or
# Microsoft -> Amazon -> Microsoft -> Google). Instead, pair every job with the
# job that follows it and accept the user if any pair is Microsoft -> Google.
linked_in_users_ms_next_google = linked_in_users_ms_google \
    .filter(
        F.expr("""
            exists(
                zip_with(
                    jobs.company,
                    slice(jobs.company, 2, size(jobs)),
                    (cur, nxt) -> coalesce(cur = 'Microsoft' AND nxt = 'Google', false)
                ),
                is_ms_then_google -> is_ms_then_google
            )
        """)
    )
# zip_with pads the shorter (shifted) array with NULL, so the last job is paired
# with NULL; coalesce turns that NULL comparison into false.

linked_in_users_ms_next_google.show(truncate=False)

+-------+-----------------------------------------------+
|user_id|jobs                                           |
+-------+-----------------------------------------------+
|1      |[{Microsoft, 2020-04-13}, {Google, 2021-11-01}]|
+-------+-----------------------------------------------+

