## Condition Priority

In [6]:
import pandas as pd

# Sample input: one entry per person, each carrying a list of work experiences.
# Every experience has list-valued "data_sources" and "home_addresses" fields.
dummy_data = [
    {
        "name": "John Doe",
        "work_experiences": [
            {
                "company": "ABC Corporation",
                "position": "Software Engineer",
                "office_address": "123 Main Street, New York",
                "date": "2018-2020",
                "data_sources": ["LinkedIn"],
                "home_addresses": ["789 Elm Road, Chicago"]
            },
            {
                "company": "XYZ Inc.",
                "position": "Senior Developer",
                "office_address": "456 Oak Avenue, San Francisco",
                "date": "2020-2022",
                "data_sources": ["HR Records"],
                "home_addresses": ["987 Cedar Street, Los Angeles"]
            }
        ]
    },
    {
        "name": "Jane Smith",
        "work_experiences": [
            {
                "company": "PQR Ltd.",
                "position": "Marketing Manager",
                "office_address": "789 Elm Road, Chicago",
                "date": "2016-2019",
                "data_sources": ["Company Website"],
                "home_addresses": ["123 Main Street, New York"]
            },
            {
                "company": "LMN Enterprises",
                "position": "Sales Representative",
                "office_address": "321 Maple Lane, Boston",
                "date": "2019-2021",
                "data_sources": ["HR Records"],
                "home_addresses": ["456 Oak Avenue, San Francisco"]
            },
            {
                "company": "DEF Company",
                "position": "Business Analyst",
                "office_address": "987 Cedar Street, Los Angeles",
                "date": "2021-2023",
                "data_sources": ["Referral"],
                "home_addresses": ["456 Oak Avenue, San Francisco"]
            }
        ]
    },
    {
        "name": "Michael Johnson",
        "work_experiences": [
            {
                "company": "GHI Corporation",
                "position": "Project Manager",
                "office_address": "654 Pine Avenue, Seattle",
                "date": "2017-2022",
                "data_sources": ["DOSM"],
                "home_addresses": ["789 Elm Road, Chicago"]
            }
        ]
    }
]

# Flatten the nested structure into one record per (person, work experience).
# The records are collected first and the DataFrame is built once:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x.
records = [
    {
        "Name": person["name"],
        "Company": experience["company"],
        "Position": experience["position"],
        "Office Address": experience["office_address"],
        "Date": experience["date"],
        # List-valued fields are collapsed into a single comma-separated string.
        "Home Address": ", ".join(experience["home_addresses"]),
        "Data Sources": ", ".join(experience["data_sources"]),
    }
    for person in dummy_data
    for experience in person["work_experiences"]
]

# Passing `columns` fixes the column order and keeps the header even when
# there are no records at all.
df = pd.DataFrame(
    records,
    columns=["Name", "Company", "Position", "Office Address", "Date", "Home Address", "Data Sources"],
)

# Display the DataFrame
df


Unnamed: 0,Name,Company,Position,Office Address,Date,Home Address,Data Sources
0,John Doe,ABC Corporation,Software Engineer,"123 Main Street, New York",2018-2020,"789 Elm Road, Chicago",LinkedIn
1,John Doe,XYZ Inc.,Senior Developer,"456 Oak Avenue, San Francisco",2020-2022,"987 Cedar Street, Los Angeles",HR Records
2,Jane Smith,PQR Ltd.,Marketing Manager,"789 Elm Road, Chicago",2016-2019,"123 Main Street, New York",Company Website
3,Jane Smith,LMN Enterprises,Sales Representative,"321 Maple Lane, Boston",2019-2021,"456 Oak Avenue, San Francisco",HR Records
4,Jane Smith,DEF Company,Business Analyst,"987 Cedar Street, Los Angeles",2021-2023,"456 Oak Avenue, San Francisco",Referral
5,Michael Johnson,GHI Corporation,Project Manager,"654 Pine Avenue, Seattle",2017-2022,"789 Elm Road, Chicago",DOSM


## Filter by Most Recent Date

In [7]:
# Split the "YYYY-YYYY" range into start and end year columns.
# n=1 guarantees exactly two resulting columns, so the two-column assignment
# cannot fail on a value that happens to contain more than one hyphen.
# NOTE: this adds 'Start Date' / 'End Date' to `df` itself.
df[['Start Date', 'End Date']] = df['Date'].str.split('-', n=1, expand=True)

# Parse the year strings explicitly as years instead of relying on format inference.
df['Start Date'] = pd.to_datetime(df['Start Date'], format='%Y')
df['End Date'] = pd.to_datetime(df['End Date'], format='%Y')

# For each person keep the row with the latest end date.
# idxmax returns one index label per Name; .loc pulls those rows out of df.
latest_rows = df.groupby('Name')['End Date'].idxmax()

# Drop the helper columns by producing a new frame rather than mutating a
# selection of `df` in place (in-place edits on a derived frame can raise
# SettingWithCopyWarning and make the cell harder to re-run).
latest_data = df.loc[latest_rows].drop(columns=['Start Date', 'End Date'])

# Display the filtered data
latest_data

Unnamed: 0,Name,Company,Position,Office Address,Date,Home Address,Data Sources
4,Jane Smith,DEF Company,Business Analyst,"987 Cedar Street, Los Angeles",2021-2023,"456 Oak Avenue, San Francisco",Referral
1,John Doe,XYZ Inc.,Senior Developer,"456 Oak Avenue, San Francisco",2020-2022,"987 Cedar Street, Los Angeles",HR Records
5,Michael Johnson,GHI Corporation,Project Manager,"654 Pine Avenue, Seattle",2017-2022,"789 Elm Road, Chicago",DOSM


## Filter by Data Source Priority (a priority must be assigned to each data source)<br>
1 LinkedIn<br>
2 HR Records<br>
3 Company Website<br>
4 Employee Database<br>
5 Referral<br>

In [8]:

# Ranking of data sources, most trusted first.
priority = ["LinkedIn", "HR Records", "Company Website", "Employee Database", "Referral"]

# Re-encode the column as an ordered categorical so that sorting follows the
# ranking above instead of alphabetical order. Labels that are not listed in
# `priority` are turned into missing values by this conversion.
df["Data Sources"] = pd.Categorical(df["Data Sources"], categories=priority, ordered=True)

# Order all rows from highest-priority source to lowest.
df = df.sort_values(by="Data Sources")

# One row per person: after sorting, the first row seen for a Name is the one
# from the highest-priority source. Then renumber the rows and remove the
# date-related columns that are not part of the result.
valid_data = (
    df.drop_duplicates(subset="Name", keep="first")
    .reset_index(drop=True)
    .drop(columns=['Start Date','End Date','Date'])
)

# Show the result
valid_data

Unnamed: 0,Name,Company,Position,Office Address,Home Address,Data Sources
0,John Doe,ABC Corporation,Software Engineer,"123 Main Street, New York","789 Elm Road, Chicago",LinkedIn
1,Jane Smith,LMN Enterprises,Sales Representative,"321 Maple Lane, Boston","456 Oak Avenue, San Francisco",HR Records
2,Michael Johnson,GHI Corporation,Project Manager,"654 Pine Avenue, Seattle","789 Elm Road, Chicago",


## Handle Remaining Data Sources (not in the priority list)

In [11]:
import pandas as pd
import numpy as np

# Sample input: one entry per person, each carrying a list of work experiences.
# Every experience has list-valued "data_sources" and "home_addresses" fields.
dummy_data = [
    {
        "name": "John Doe",
        "work_experiences": [
            {
                "company": "ABC Corporation",
                "position": "Software Engineer",
                "office_address": "123 Main Street, New York",
                "date": "2018-2020",
                "data_sources": ["LinkedIn"],
                "home_addresses": ["789 Elm Road, Chicago"]
            },
            {
                "company": "XYZ Inc.",
                "position": "Senior Developer",
                "office_address": "456 Oak Avenue, San Francisco",
                "date": "2020-2022",
                "data_sources": ["HR Records"],
                "home_addresses": ["987 Cedar Street, Los Angeles"]
            }
        ]
    },
    {
        "name": "Jane Smith",
        "work_experiences": [
            {
                "company": "PQR Ltd.",
                "position": "Marketing Manager",
                "office_address": "789 Elm Road, Chicago",
                "date": "2016-2019",
                "data_sources": ["Company Website"],
                "home_addresses": ["123 Main Street, New York"]
            },
            {
                "company": "LMN Enterprises",
                "position": "Sales Representative",
                "office_address": "321 Maple Lane, Boston",
                "date": "2019-2021",
                "data_sources": ["HR Records"],
                "home_addresses": ["456 Oak Avenue, San Francisco"]
            },
            {
                "company": "DEF Company",
                "position": "Business Analyst",
                "office_address": "987 Cedar Street, Los Angeles",
                "date": "2021-2023",
                "data_sources": ["Referral"],
                "home_addresses": ["456 Oak Avenue, San Francisco"]
            }
        ]
    },
    {
        "name": "Michael Johnson",
        "work_experiences": [
            {
                "company": "GHI Corporation",
                "position": "Project Manager",
                "office_address": "654 Pine Avenue, Seattle",
                "date": "2017-2022",
                "data_sources": ["DOSM"],
                "home_addresses": ["789 Elm Road, Chicago"]
            }
        ]
    }
]

# Flatten the nested structure into one record per (person, work experience).
# Records are gathered in a list and the DataFrame is created in a single
# call: DataFrame.append re-copied the frame on every iteration and has been
# removed from pandas 2.x.
records = [
    {
        "Name": person["name"],
        "Company": experience["company"],
        "Position": experience["position"],
        "Office Address": experience["office_address"],
        "Date": experience["date"],
        # List-valued fields are collapsed into a single comma-separated string.
        "Home Address": ", ".join(experience["home_addresses"]),
        "Data Sources": ", ".join(experience["data_sources"]),
    }
    for person in dummy_data
    for experience in person["work_experiences"]
]

# Passing `columns` fixes the column order and keeps the header even when
# there are no records at all.
df = pd.DataFrame(
    records,
    columns=["Name", "Company", "Position", "Office Address", "Date", "Home Address", "Data Sources"],
)

# Distinct, non-missing source labels that actually occur in the data.
unique_sources = df["Data Sources"].dropna().unique()

# Known sources, most trusted first.
priority = ["LinkedIn", "HR Records", "Company Website", "Employee Database", "Referral"]

# Labels found in the data but absent from the ranking (np.setdiff1d returns
# them sorted and de-duplicated).
remaining_sources = np.setdiff1d(unique_sources, priority)

# Unranked labels are appended after the known ones, i.e. they get the lowest
# priorities instead of being turned into missing values by the conversion below.
new_priority = list(priority) + list(remaining_sources)

# Ordered categorical: sorting now follows `new_priority`, not alphabetical order.
df["Data Sources"] = pd.Categorical(df["Data Sources"], categories=new_priority, ordered=True)

# Rows without any source are assigned the LAST (lowest-priority) category.
# Using the first entry here would silently rank rows with no source at all
# as if they came from the most trusted one.
df["Data Sources"] = df["Data Sources"].fillna(new_priority[-1])

# Sort from highest to lowest priority. A stable algorithm keeps the original
# relative order of rows that share the same source, which makes the
# "keep first" de-duplication below deterministic.
df = df.sort_values("Data Sources", kind="mergesort")

# One row per person: the first row for each Name after sorting is the one
# from the highest-priority source. Renumber the rows and drop the Date column,
# which is not part of the result.
valid_data = (
    df.drop_duplicates(subset="Name", keep="first")
    .reset_index(drop=True)
    .drop(columns=['Date'])
)

# Show the full sorted table followed by the per-person selection.
display(df,valid_data)


Unnamed: 0,Name,Company,Position,Office Address,Date,Home Address,Data Sources
0,John Doe,ABC Corporation,Software Engineer,"123 Main Street, New York",2018-2020,"789 Elm Road, Chicago",LinkedIn
1,John Doe,XYZ Inc.,Senior Developer,"456 Oak Avenue, San Francisco",2020-2022,"987 Cedar Street, Los Angeles",HR Records
3,Jane Smith,LMN Enterprises,Sales Representative,"321 Maple Lane, Boston",2019-2021,"456 Oak Avenue, San Francisco",HR Records
2,Jane Smith,PQR Ltd.,Marketing Manager,"789 Elm Road, Chicago",2016-2019,"123 Main Street, New York",Company Website
4,Jane Smith,DEF Company,Business Analyst,"987 Cedar Street, Los Angeles",2021-2023,"456 Oak Avenue, San Francisco",Referral
5,Michael Johnson,GHI Corporation,Project Manager,"654 Pine Avenue, Seattle",2017-2022,"789 Elm Road, Chicago",DOSM


Unnamed: 0,Name,Company,Position,Office Address,Home Address,Data Sources
0,John Doe,ABC Corporation,Software Engineer,"123 Main Street, New York","789 Elm Road, Chicago",LinkedIn
1,Jane Smith,LMN Enterprises,Sales Representative,"321 Maple Lane, Boston","456 Oak Avenue, San Francisco",HR Records
2,Michael Johnson,GHI Corporation,Project Manager,"654 Pine Avenue, Seattle","789 Elm Road, Chicago",DOSM
