<a href="https://colab.research.google.com/github/animesh-11/AI_ML/blob/main/EDA_Coding_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from datetime import datetime

# NOTE: This global df_sales is from a previous problem and is not used in the solution below.
# The solution uses the filename passed to the aggregate_monthly_sales function.

def aggregate_monthly_sales(input_tuple, filename='https://d3ejq4mxgimsmf.cloudfront.net/Product_sales_data-9f83ae7a11c340d1884ae214aadbacac.csv'):
    """
    Total the sales of one product category over a date range.

    Sales are summed per day when the start and end dates fall in the same
    calendar month, and per month otherwise.

    Args:
        input_tuple (tuple): (product_category, start_date, end_date); both
            dates are parsed with pd.to_datetime and the range is inclusive.
        filename: Path, URL or file-like object handed to pd.read_csv. The data
            must provide 'sale_date', 'product_category' and 'sales_amount'.

    Returns:
        list: (date_string, total_sales) tuples ordered by date. Daily rows use
            'YYYY-MM-DD'; monthly rows use the first day of the month
            ('YYYY-MM-01'). Totals are rounded to 2 decimal places.
    """
    product_category, start_date_str, end_date_str = input_tuple

    df = pd.read_csv(filename)
    df['sale_date'] = pd.to_datetime(df['sale_date'])

    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    # Keep only the requested category inside the inclusive date range.
    in_scope = (
        (df['product_category'] == product_category)
        & (df['sale_date'] >= start_date)
        & (df['sale_date'] <= end_date)
    )
    filtered_df = df.loc[in_scope]

    if start_date.to_period('M') == end_date.to_period('M'):
        # Range stays inside one calendar month: one bucket per day.
        bucket = filtered_df['sale_date'].dt.strftime('%Y-%m-%d')
    else:
        # Range crosses a month boundary: one bucket per month, labelled with
        # the first day of that month.
        bucket = filtered_df['sale_date'].dt.strftime('%Y-%m-01')

    # Work on the aggregated Series directly; the previous monthly branch
    # selected a 'total_sales' column that was never created and raised KeyError.
    totals = filtered_df.groupby(bucket)['sales_amount'].sum().round(2)

    return list(zip(totals.index.tolist(), totals.tolist()))

# Read one "category, start_date, end_date" line and trim the blanks around each field.
input_data = input()
product_category, start_date, end_date = [field.strip() for field in input_data.split(',')]

# Run the aggregation for the parsed query and show the resulting list of tuples.
query = (product_category, start_date, end_date)
output = aggregate_monthly_sales(query)
print(output)

In [None]:
def identify_invalid_email(record):
    """
    Replace the email ID of a candidate record with 'invalid' when it breaks the rules.

    An email ID is valid when:
      1. it is a non-empty string,
      2. it contains both an '@' symbol and a '.' symbol, and
      3. a '.' appears after the first '@' with at least one character in
         between them.

    Args:
        record (dict): A dictionary with keys 'Candidate Name' and 'Email ID'.
                       A missing or non-string 'Email ID' is treated as invalid.

    Returns:
        dict: The same dictionary object, updated in place when the email ID
              is invalid and left unmodified otherwise.
    """
    if not _is_valid_email(record.get('Email ID')):
        record['Email ID'] = 'invalid'
    return record


def _is_valid_email(email_id):
    """Return True when email_id satisfies the three validation rules."""
    # Rule 1: must be a non-empty string.
    if not isinstance(email_id, str) or not email_id:
        return False

    # Rule 2 (first half): an '@' must be present.
    at_index = email_id.find('@')
    if at_index == -1:
        return False

    # Rules 2 and 3: look for a '.' starting two positions past the '@', which
    # leaves room for at least one character between them. Searching from the
    # '@' onward (rather than taking the first '.' in the whole string) keeps
    # addresses with a dot in the local part, e.g. 'john.doe@example.com', valid.
    return email_id.find('.', at_index + 2) != -1

Let's test the function with the example cases provided and additional scenarios.

In [None]:
# Each entry: (test case number, candidate record, expected record after validation).
email_test_cases = [
    # Testcase 1
    (1,
     {'Candidate Name': 'Nikhil', 'Email ID': 'nikhil98singhdomain.com'},
     {'Candidate Name': 'Nikhil', 'Email ID': 'invalid'}),
    # Testcase 2
    (2,
     {'Candidate Name': 'Dravid', 'Email ID': 'dravidlee@domaincom'},
     {'Candidate Name': 'Dravid', 'Email ID': 'invalid'}),
    # Additional Test Case 3: Valid Email
    (3,
     {'Candidate Name': 'John Doe', 'Email ID': 'john.doe@example.com'},
     {'Candidate Name': 'John Doe', 'Email ID': 'john.doe@example.com'}),
    # Additional Test Case 4: Email with '.' before '@'
    (4,
     {'Candidate Name': 'Alice', 'Email ID': 'alice.example@com'},
     {'Candidate Name': 'Alice', 'Email ID': 'invalid'}),
    # Additional Test Case 5: Email with '.' immediately after '@'
    (5,
     {'Candidate Name': 'Bob', 'Email ID': 'bob@.com'},
     {'Candidate Name': 'Bob', 'Email ID': 'invalid'}),
    # Additional Test Case 6: Empty Email ID
    (6,
     {'Candidate Name': 'Charlie', 'Email ID': ''},
     {'Candidate Name': 'Charlie', 'Email ID': 'invalid'}),
    # Additional Test Case 7: Missing both '@' and '.'
    (7,
     {'Candidate Name': 'David', 'Email ID': 'davidexamplecom'},
     {'Candidate Name': 'David', 'Email ID': 'invalid'}),
]

for case_number, input_record, expected_output in email_test_cases:
    # Echo the input BEFORE the call: identify_invalid_email updates the record
    # in place, so printing afterwards would show the already-modified email ID.
    print(f"Testcase {case_number} Input: {input_record}")
    result = identify_invalid_email(input_record)
    print(f"Testcase {case_number} Output: {result}")
    print(f"Testcase {case_number} Expected: {expected_output}")
    print(f"Testcase {case_number} {'PASS' if result == expected_output else 'FAIL'}\n")

You are working as an analyst in an aviation company. You have records of historical passenger counts for the flights from 1949 to 1960. Your task is to compute the mean number of passengers per year for this duration. You will be using a dataset that contains the two key pieces of information which are stored in their respective columns.



Here is the data description

'Month': This is a column with str entries that represent the year and the month for the record. For instance, the first entry in this column is the string '1949-01'.
'Passengers': This is a column with int entries that represent the number of passengers catered to in that specific month and year. For instance, the first entry in this column is the integer 112.
The number of rows in the data is 144.


Compute the mean number of passengers per year and store the results year-wise in a Pandas Series mean_passengers_by_year.



Note: The test cases for this question will check specific values in the computed series. The inputs for the test cases are the index labels for the computed series, and the outputs are the expected values of your series in those locations.



Input format

The year (for example, 1955)


Output format

The expected cell value at the target location as specified in the input label


Constraints

The index of the computed series is the year


Testcases



Testcase 1



Input

1949



Expected Output

127



Testcase 2



Input

1950



Expected Output

140

In [None]:
import pandas as pd

# Fetch the passenger records (columns 'Month' and 'Passengers').
df = pd.read_csv('https://d3ejq4mxgimsmf.cloudfront.net/AirPassengers-1eaa575774ad408691e22a1edfcab2ec.csv')

# Parse the 'Month' values as timestamps and keep the calendar year of each record.
month_timestamps = pd.to_datetime(df['Month'])
df['Year'] = month_timestamps.dt.year

# Average the passenger counts within every year; the resulting index is the year.
mean_passengers_by_year = df['Passengers'].groupby(df['Year']).mean()

# Show the whole series for a visual check.
print(mean_passengers_by_year)

# Compare against the two sample test cases from the problem statement.
actual_1949 = mean_passengers_by_year.loc[1949]
actual_1950 = mean_passengers_by_year.loc[1950]
print(f"\nTestcase 1 (1949) Expected: 127, Actual: {actual_1949:.0f}")
print(f"Testcase 2 (1950) Expected: 140, Actual: {actual_1950:.0f}")

As a reporter for a renowned chess magazine, your task is to identify the top grandmasters in each federation, typically defined as the country in which the grandmasters compete the most. Specifically, you need to determine which grandmasters in each federation have a rating greater than or equal to the mean rating of that federation.



The dataset is from March 2025 and contains 10 columns and 1303 rows. However, for this analysis, you need to only consider the following columns:

'Fed': This is a column with str entries that represent the federation which the grandmaster is a part of. For instance, 'IND' refers to the All India Chess Federation.
'Rating': This is a column with int entries that represent the Elo rating of the grandmaster. The Elo rating system measures the relative strength of a player in some games, such as chess, compared to other players. For instance, Magnus Carlsen, the highest-rated player in this dataset, has an Elo rating of 2833.
'Name': This is a column with str entries which represent the name of the grandmaster. For instance, 'Carlsen, Magnus'.
All the three columns 'Fed', 'Name', 'Rating' contain valid and complete data


Create a Pandas Series, players_above_mean, where the index represents the federation and the values are lists of grandmasters' names whose ratings are greater than or equal to the average Elo rating of that federation.



Note: The test cases for this question will check specific values in the computed series. The inputs for the test cases are the index labels (federations) for the computed series, and the outputs are the expected values (lists of grandmasters' names) of your series in those locations.



Input format

The federation


Output format

The expected cell value at the target location as specified in the input label
The dataset is already sorted in descending order of Elo rating; the output lists must preserve this order


Constraints

N/A


Sample Testcases



Testcase 1



Input

NOR



Expected Output

['Carlsen, Magnus', 'Christiansen, Johan-Sebastian', 'Tari, Aryan', 'Hammer, Jon Ludvig', 'Agdestein, Simen', 'Urkedal, Frode Olav Olsen', 'Amar, Elham']



Testcase 2



Input

IND



Expected Output

['Gukesh D', 'Erigaisi Arjun', 'Praggnanandhaa R', 'Anand, Viswanathan', 'Aravindh, Chithambaram VR.', 'Vidit, Santosh Gujrathi', 'Harikrishna, Pentala', 'Nihal Sarin', 'Sadhwani, Raunak', 'Karthikeyan, Murali', 'Mendonca, Leon Luke', 'Puranik, Abhimanyu', 'Narayanan S L', 'Aryan Chopra', 'Pranav, V', 'Pranesh M', 'Gupta, Abhijeet', 'Ganguly, Surya Shekhar', 'Ghosh, Diptayan', 'Vaibhav, Suri', 'Bharath Subramaniyam H', 'Iniyan, Pa', 'Karthik Venkataraman', 'Gopal G.N.', 'Adhiban, B.', 'Aditya Mittal', 'Sethuraman, S.P.', 'Sasikiran, Krishnan', 'Sankalp Gupta', 'Pranav Anand', 'Vignesh N R', 'Harsha Bharathakoti', 'Raja Rithvik R', 'Koneru, Humpy']

In [None]:
import pandas as pd

# Fetch the grandmaster table (uses the 'Fed', 'Name' and 'Rating' columns).
df = pd.read_csv('https://d3ejq4mxgimsmf.cloudfront.net/active_grandmasters_March25-b0e7a64fd25c48ab8f380debb40e8a0a.csv')

# Average rating of every federation, indexed by the federation code.
mean_ratings_by_fed = df['Rating'].groupby(df['Fed']).mean()

def get_players_above_mean(federation_df, mean_rating):
    """Return the names, in row order, of players whose 'Rating' is >= mean_rating."""
    meets_mean = federation_df['Rating'] >= mean_rating
    return federation_df.loc[meets_mean, 'Name'].tolist()

# For every federation, collect the players rated at or above that federation's mean.
players_above_mean_dict = {
    fed: get_players_above_mean(df[df['Fed'] == fed], mean_rating)
    for fed, mean_rating in mean_ratings_by_fed.items()
}

# Federation code -> list of player names.
players_above_mean = pd.Series(players_above_mean_dict)

# Show the whole series for a visual check.
print(players_above_mean)

# Sample testcase 1: Norway.
expected_nor = [
    'Carlsen, Magnus', 'Christiansen, Johan-Sebastian', 'Tari, Aryan',
    'Hammer, Jon Ludvig', 'Agdestein, Simen', 'Urkedal, Frode Olav Olsen',
    'Amar, Elham',
]
actual_nor = players_above_mean.loc['NOR']
nor_verdict = 'PASS' if actual_nor == expected_nor else 'FAIL'
print(f"\nTestcase 1 (NOR) Expected: {expected_nor}")
print(f"Testcase 1 (NOR) Actual: {actual_nor}")
print(f"Testcase 1 {nor_verdict}")

# Sample testcase 2: India.
expected_ind = [
    'Gukesh D', 'Erigaisi Arjun', 'Praggnanandhaa R', 'Anand, Viswanathan',
    'Aravindh, Chithambaram VR.', 'Vidit, Santosh Gujrathi', 'Harikrishna, Pentala',
    'Nihal Sarin', 'Sadhwani, Raunak', 'Karthikeyan, Murali', 'Mendonca, Leon Luke',
    'Puranik, Abhimanyu', 'Narayanan S L', 'Aryan Chopra', 'Pranav, V', 'Pranesh M',
    'Gupta, Abhijeet', 'Ganguly, Surya Shekhar', 'Ghosh, Diptayan', 'Vaibhav, Suri',
    'Bharath Subramaniyam H', 'Iniyan, Pa', 'Karthik Venkataraman', 'Gopal G.N.',
    'Adhiban, B.', 'Aditya Mittal', 'Sethuraman, S.P.', 'Sasikiran, Krishnan',
    'Sankalp Gupta', 'Pranav Anand', 'Vignesh N R', 'Harsha Bharathakoti',
    'Raja Rithvik R', 'Koneru, Humpy',
]
actual_ind = players_above_mean.loc['IND']
ind_verdict = 'PASS' if actual_ind == expected_ind else 'FAIL'
print(f"\nTestcase 2 (IND) Expected: {expected_ind}")
print(f"Testcase 2 (IND) Actual: {actual_ind}")
print(f"Testcase 2 {ind_verdict}")

You are a Data Analyst at a retail company that tracks sales across various product categories. Your task is to analyze sales trends for a specific category over a given time period to support decisions on promotions and inventory planning.



You are provided with a dataset containing

- sale_date: Date of the sale (YYYY-MM-DD), ranging from 2024-01-01 to 2024-12-31 in string format

- product_category: Category of the product sold, one of Electronics, Furniture, or Clothing in string format

- sales_amount: Value of the sale (float value between 100.00 and 1000.00)



Task

- Implement a function aggregate_monthly_sales(input_tuple) that:

- Accepts a tuple input: (product_category, start_date, end_date)

- Filters the dataset based on the given product category and date range

- Aggregates the sales data

  - Daily, if the date range is within a single calendar month

  - Monthly, if the date range spans more than one month

- Returns a list of tuples [(YYYY-MM-DD, total_sales), ...] for daily or monthly aggregation



Input Format

- A tuple (product_category, start_date, end_date)



Output Format

- A list of tuples [(YYYY-MM-DD, total_sales_amount), ...]



Constraints

- Input must be a tuple in the format: (product_category, start_date, end_date)

- Dates must be in YYYY-MM-DD format

- Start date cannot be after end date

- Every row in the dataset contains valid, complete data across all columns

- There is no missing data in the dataset



Examples



Testcase 1



Input

Electronics, 2024-01-01, 2024-01-31



Expected Output

[('2024-01-05', 735.21), ('2024-01-14', 827.28), ('2024-01-21', 705.59)]



Testcase 2



Input

Clothing, 2024-05-01, 2024-05-31



Expected Output

[('2024-05-05', 692.03), ('2024-05-20', 223.46), ('2024-05-24', 733.61), ('2024-05-31', 769.24)]

In [None]:
import pandas as pd
from datetime import datetime

def aggregate_monthly_sales(input_tuple, filename='https://d3ejq4mxgimsmf.cloudfront.net/Product_sales_data-9f83ae7a11c340d1884ae214aadbacac.csv'):
    """
    Aggregate sales for a product category over an inclusive date range.

    Aggregation is daily when the range lies within a single calendar month
    and monthly when it spans more than one month.

    Args:
        input_tuple (tuple): (product_category, start_date, end_date); the two
            dates are parsed with pd.to_datetime.
        filename: CSV source passed to pd.read_csv (path, URL or file-like
            object) with 'sale_date', 'product_category' and 'sales_amount'
            columns.

    Returns:
        list: (date_string, total_sales) tuples ordered by date. Daily entries
            are labelled 'YYYY-MM-DD' and monthly entries 'YYYY-MM-01'; totals
            are rounded to 2 decimal places.
    """
    # Unpack the query before touching the data.
    product_category, start_date_str, end_date_str = input_tuple

    # Load the sales table and parse its date column.
    df = pd.read_csv(filename)
    df['sale_date'] = pd.to_datetime(df['sale_date'])

    # Parse the range boundaries the same way as the column they are compared to.
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    # Rows of the requested category whose date lies inside the inclusive range.
    in_scope = (
        (df['product_category'] == product_category)
        & (df['sale_date'] >= start_date)
        & (df['sale_date'] <= end_date)
    )
    filtered_df = df.loc[in_scope]

    if start_date.to_period('M') == end_date.to_period('M'):
        # Single calendar month: group by the full date.
        bucket = filtered_df['sale_date'].dt.strftime('%Y-%m-%d')
    else:
        # Multiple months: group by month, labelled with its first day so the
        # output format matches the daily one.
        bucket = filtered_df['sale_date'].dt.strftime('%Y-%m-01')

    # Sum and round on the grouped Series itself. The earlier monthly branch
    # indexed a non-existent 'total_sales' column and failed with KeyError.
    totals = filtered_df.groupby(bucket)['sales_amount'].sum().round(2)

    return list(zip(totals.index.tolist(), totals.tolist()))

# Usage example. In a command-line environment the query line would normally
# come from input(); in a notebook it can also be defined by hand or via widgets.
# A hardcoded query is used here for demonstration; edit it to try other scenarios.
input_data_example = 'Electronics, 2024-01-01, 2024-01-31'

# Split the comma-separated query and trim the blanks around each field.
product_category, start_date, end_date = [
    field.strip() for field in input_data_example.split(',')
]

# Aggregate for the parsed query and print the returned list of tuples.
query = (product_category, start_date, end_date)
output = aggregate_monthly_sales(query)
print(output)

You are a Data Analyst at a retail company that tracks sales across various product categories. Your task is to analyze sales trends for a specific category over a given time period to support decisions on promotions and inventory planning.



You are provided with a dataset containing

- sale_date: Date of the sale (YYYY-MM-DD), ranging from 2024-01-01 to 2024-12-31 in string format

- product_category: Category of the product sold, one of Electronics, Furniture, or Clothing in string format

- sales_amount: Value of the sale (float value between 100.00 and 1000.00)



Task

- Implement a function aggregate_monthly_sales(input_tuple) that:

- Accepts a tuple input: (product_category, start_date, end_date)

- Filters the dataset based on the given product category and date range

- Aggregates the sales data

  - Daily, if the date range is within a single calendar month

  - Monthly, if the date range spans more than one month

- Returns a list of tuples [(YYYY-MM-DD, total_sales), ...] for daily or monthly aggregation



Input Format

- A tuple (product_category, start_date, end_date)



Output Format

- A list of tuples [(YYYY-MM-DD, total_sales_amount), ...]



Constraints

- Input must be a tuple in the format: (product_category, start_date, end_date)

- Dates must be in YYYY-MM-DD format

- Start date cannot be after end date

- Every row in the dataset contains valid, complete data across all columns

- There is no missing data in the dataset



Examples



Testcase 1



Input

Electronics, 2024-01-01, 2024-01-31



Expected Output

[('2024-01-05', 735.21), ('2024-01-14', 827.28), ('2024-01-21', 705.59)]



Testcase 2



Input

Clothing, 2024-05-01, 2024-05-31



Expected Output

[('2024-05-05', 692.03), ('2024-05-20', 223.46), ('2024-05-24', 733.61), ('2024-05-31', 769.24)]

In [None]:
import pandas as pd
from datetime import datetime

def aggregate_monthly_sales(input_tuple, filename='https://d3ejq4mxgimsmf.cloudfront.net/Product_sales_data-9f83ae7a11c340d1884ae214aadbacac.csv'):
    """
    Sum the sales of a product category per day or per month.

    The date range is inclusive. When both boundaries fall in the same
    calendar month the totals are daily; otherwise they are monthly.

    Args:
        input_tuple (tuple): (product_category, start_date, end_date), with
            dates parsed by pd.to_datetime.
        filename: Anything pd.read_csv accepts (path, URL, file-like object).
            The data must contain 'sale_date', 'product_category' and
            'sales_amount' columns.

    Returns:
        list: (date_string, total_sales) tuples ordered by date, where the date
            is 'YYYY-MM-DD' for daily totals and 'YYYY-MM-01' for monthly
            totals, and total_sales is rounded to 2 decimal places.
    """
    product_category, start_date_str, end_date_str = input_tuple

    # Read the CSV file and convert sale_date to datetime
    df = pd.read_csv(filename)
    df['sale_date'] = pd.to_datetime(df['sale_date'])

    # Convert input dates to datetime
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    # Filter by product category and (inclusive) date range
    in_scope = (
        (df['product_category'] == product_category)
        & (df['sale_date'] >= start_date)
        & (df['sale_date'] <= end_date)
    )
    filtered_df = df.loc[in_scope]

    # Pick the grouping key: the day within a single month, else the month
    same_month = start_date.to_period('M') == end_date.to_period('M')
    date_format = '%Y-%m-%d' if same_month else '%Y-%m-01'
    bucket = filtered_df['sale_date'].dt.strftime(date_format)

    # Aggregate on the Series directly; the former monthly branch referenced a
    # 'total_sales' column that did not exist yet and raised KeyError.
    totals = filtered_df.groupby(bucket)['sales_amount'].sum().round(2)

    # Convert to list of (date, total) tuples
    return list(zip(totals.index.tolist(), totals.tolist()))

# Running this cell prompts for one "category, start_date, end_date" line.
input_data = input()

# Split on commas and trim the blanks around each field.
product_category, start_date, end_date = [field.strip() for field in input_data.split(',')]

# Aggregate for the parsed query and print the resulting list of tuples.
query = (product_category, start_date, end_date)
output = aggregate_monthly_sales(query)
print(output)

You are working as an energy analyst in a smart city initiative. You are provided with hourly electricity consumption data for 10 households (observations) over a 24-hour period. Each household's usage is recorded as 24 hourly readings (features from 'Hour_0' to 'Hour_23'), representing electricity consumed in kilowatt-hours (kWh) in those hours.



Implement a Python function analyze_household_peak_usage(household_id) to analyse the data for a specific household and identify:

Which part of the day had the highest average energy consumption
Which specific hour(s) in that segment had consumption above the segment's average


The 24-hour day is segmented as follows:

Column indices 0 to 5 of the data (12 AM to 5:59 AM) -> late night or early morning
Column indices 6 to 11 of the data (6 AM to 11:59 AM) -> morning
Column indices 12 to 17 of the data (12 PM to 5:59 PM) -> afternoon
Column indices 18 to 23 of the data (6 PM to 11:59 PM) -> evening or night


Dataset Description

'Household_ID': Unique identifier for a household (str)
'Hour_0', 'Hour_1', ..., 'Hour_23': Hourly electricity consumption values in kWh (int)


Input Format

The household ID (str)


Output Format

A dictionary with the following keys and values
A key 'Peak Segment' with one of the following values suitably populated: 'Late Night/Early Morning', 'Morning', 'Afternoon', 'Evening/Night'
A key 'High Usage Hours' whose value is a list of hours (int) extracted from the string column names of the hours, for instance, 'Hour_13' would be 13


Constraints

Input household ID must be in the range of 'Household_1' to 'Household_10' (both inclusive)
There are no corrupt or missing values or duplicate observations in the data
All values other than the household ID are numeric
Household IDs are strings
Dataset columns are consistently named from 'Hour_0' to 'Hour_23'
No reordering or rearrangement of rows or columns is done


Testcases



Testcase 1



Input

Household_3



Expected Output

{'Peak Segment': 'Evening/Night', 'High Usage Hours': [18, 19, 20, 22]}



Testcase 2



Input

Household_9



Expected Output

{'Peak Segment': 'Late Night/Early Morning', 'High Usage Hours': [0, 2, 4, 5]}

In [None]:
import pandas as pd

def analyze_household_peak_usage(household_id, filename='https://d3ejq4mxgimsmf.cloudfront.net/hourly_energy_usage-9eb4bd3748964f8da8d95b54df732b1d.csv'):
    """
    Analyzes hourly electricity consumption data for a specific household to identify
    the peak usage segment and high usage hours within that segment.

    Args:
        household_id (str): The unique identifier for the household (e.g., 'Household_3').
        filename: CSV source passed to pd.read_csv (path, URL or file-like object).
            It must provide a 'Household_ID' column and the columns 'Hour_0' to
            'Hour_23'. Defaults to the hosted dataset.

    Returns:
        dict: A dictionary with 'Peak Segment' and 'High Usage Hours' keys.
            'Peak Segment' is the name of the segment with the highest average
            consumption (the earliest segment wins a tie). 'High Usage Hours'
            lists, in ascending order, the hours of that segment whose
            consumption is strictly above the segment's average.

    Raises:
        IndexError: If no row matches household_id. The exception type matches
            what the positional lookup raised before this explicit check.
    """
    df = pd.read_csv(filename)

    # Filter data for the specific household and use its first matching row.
    matches = df[df['Household_ID'] == household_id]
    if matches.empty:
        raise IndexError(f"No data found for household {household_id!r}")
    household_data = matches.iloc[0]

    # Segment name -> hours it covers. Insertion order matters: max() below
    # returns the first segment among equal averages.
    segments = {
        'Late Night/Early Morning': range(0, 6),
        'Morning': range(6, 12),
        'Afternoon': range(12, 18),
        'Evening/Night': range(18, 24),
    }

    # The row mixes the string ID with the readings, so each slice is cast to
    # float to make sure the averages and comparisons are numeric.
    segment_readings = {
        name: household_data[[f'Hour_{hour}' for hour in hours]].astype(float)
        for name, hours in segments.items()
    }
    segment_averages = {
        name: readings.mean() for name, readings in segment_readings.items()
    }

    # Identify the peak segment (highest average consumption).
    peak_segment_name = max(segment_averages, key=segment_averages.get)
    peak_segment_average = segment_averages[peak_segment_name]

    # Hours of the peak segment whose reading exceeds the segment average;
    # the segment hours are already ascending, so the result is sorted.
    high_usage_hours = [
        hour
        for hour, reading in zip(segments[peak_segment_name],
                                 segment_readings[peak_segment_name])
        if reading > peak_segment_average
    ]

    return {
        'Peak Segment': peak_segment_name,
        'High Usage Hours': high_usage_hours,
    }

# Sample testcase 1: Household_3
input_household_1 = 'Household_3'
expected_output_1 = {'Peak Segment': 'Evening/Night', 'High Usage Hours': [18, 19, 20, 22]}
result_1 = analyze_household_peak_usage(input_household_1)
verdict_1 = 'PASS' if result_1 == expected_output_1 else 'FAIL'
print(
    f"Testcase 1 Input: {input_household_1}",
    f"Testcase 1 Output: {result_1}",
    f"Testcase 1 Expected: {expected_output_1}",
    f"Testcase 1 {verdict_1}\n",
    sep='\n',
)

# Sample testcase 2: Household_9
input_household_2 = 'Household_9'
expected_output_2 = {'Peak Segment': 'Late Night/Early Morning', 'High Usage Hours': [0, 2, 4, 5]}
result_2 = analyze_household_peak_usage(input_household_2)
verdict_2 = 'PASS' if result_2 == expected_output_2 else 'FAIL'
print(
    f"Testcase 2 Input: {input_household_2}",
    f"Testcase 2 Output: {result_2}",
    f"Testcase 2 Expected: {expected_output_2}",
    f"Testcase 2 {verdict_2}\n",
    sep='\n',
)