In [2]:
# Part 1: Handle Missing Values & Duplicates

    # Step-by-Step Guidelines:
# 1. Load the Data: First, ensure you have pandas installed and import it.
# 2. Handling Missing Values
#     1. Identify Missing Values:
#     2. Fill Missing Values:
# 3. Handling Duplicates
#     1. Identify Duplicates:
#     2. Remove Duplicates:
# 4. Combined Practice on a New Dataset
#     1. New Sample Data:
#     2. Handling Missing Values:
#     3. Remove Duplicates:
        
import pandas as pd
import numpy as np

# --- Part 1: Handle Missing Values & Duplicates ---
# This script demonstrates how to handle missing values and duplicate rows
# in a pandas DataFrame following the provided step-by-step guidelines.

# --- Step-by-Step Guidelines: ---

# 1. Load the Data: First, ensure you have pandas installed and import it.
# (Already done at the top of the script)

# Create a sample DataFrame for initial demonstration
print("--- Step 1: Load the Data (Creating Sample Data) ---")
data_initial = {
    'ID': [1, 2, 3, 4, 5, 1, 6, 7, 8, 9], # Duplicate ID
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice', 'Frank', 'Grace', 'Heidi', 'Ivan'],
    'Score': [85, 92, np.nan, 78, 95, 85, 88, 91, np.nan, 80], # Missing Scores
    'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'New York', 'Berlin', 'London', 'Paris', 'Tokyo'],
    'JoinDate': ['2023-01-10', '2023-01-11', '2023-01-11', '2023-01-12', '2023-01-12', '2023-01-10', '2023-01-13', '2023-01-14', '2023-01-14', '2023-01-15']
}
df_initial = pd.DataFrame(data_initial)

print("Initial Sample DataFrame:")
print(df_initial)
print("\n")

# 2. Handling Missing Values

# 2.1. Identify Missing Values:
print("--- Step 2.1: Identify Missing Values ---")
print("Count of missing values per column:")
print(df_initial.isnull().sum())
print("\n")

# Display rows with missing values
print("Rows with any missing value:")
print(df_initial[df_initial.isnull().any(axis=1)])
print("\n")

# 2.2. Fill Missing Values:
print("--- Step 2.2: Fill Missing Values ---")

# We will fill missing 'Score' values with the mean of the existing scores
mean_score = df_initial['Score'].mean()
df_initial['Score'].fillna(mean_score, inplace=True)

print(f"Filled missing 'Score' values with the mean ({mean_score:.2f}).")
print("DataFrame after filling missing values:")
print(df_initial)
print("\n")

# Verify missing values are gone
print("Count of missing values per column after filling:")
print(df_initial.isnull().sum())
print("\n")


# 3. Handling Duplicates

# 3.1. Identify Duplicates:
print("--- Step 3.1: Identify Duplicates ---")

# Identify exact duplicate rows
print("Boolean Series indicating duplicate rows (keeping first occurrence as False):")
print(df_initial.duplicated())
print("\n")

# Show the duplicate rows (keeping all occurrences)
print("Identified duplicate rows (showing all occurrences):")
print(df_initial[df_initial.duplicated(keep=False)])
print("\n")

# 3.2. Remove Duplicates:
print("--- Step 3.2: Remove Duplicates ---")

# Remove duplicate rows (keeps the first occurrence by default)
initial_row_count = len(df_initial)
df_initial.drop_duplicates(inplace=True)
rows_after_removal = len(df_initial)
duplicates_removed = initial_row_count - rows_after_removal

print(f"Removed {duplicates_removed} duplicate rows.")
print("DataFrame after removing duplicate rows:")
print(df_initial)
print("\n")


# 4. Combined Practice on a New Dataset

# 4.1. New Sample Data:
print("--- Step 4.1: New Sample Data ---")
data_new = {
    'ProductID': [10, 20, 30, 40, 50, 10, 60, 70, 80, 90, 20], # Duplicates
    'ProductName': ['A', 'B', 'C', 'D', 'E', 'A', 'F', 'G', 'H', 'I', 'B'],
    'Price': [10.5, np.nan, 5.0, 20.0, 15.0, 10.5, 25.0, 12.0, np.nan, 30.0, np.nan], # Missing Prices
    'Stock': [100, 50, 200, np.nan, 150, 100, 80, 120, 90, 110, 50], # Missing Stock
    'Supplier': ['X', 'Y', 'Z', 'W', 'V', 'X', 'U', 'Y', 'Z', 'W', 'Y']
}
df_new = pd.DataFrame(data_new)

print("New Sample DataFrame:")
print(df_new)
print("\n")

# 4.2. Handling Missing Values:
print("--- Step 4.2: Handling Missing Values in New Dataset ---")

print("Missing values before filling:")
print(df_new.isnull().sum())
print("\n")

# Fill missing 'Price' with median
median_price = df_new['Price'].median()
df_new['Price'].fillna(median_price, inplace=True)
print(f"Filled missing 'Price' with median ({median_price:.2f}).")

# Fill missing 'Stock' with a constant value (e.g., 0)
df_new['Stock'].fillna(0, inplace=True)
print("Filled missing 'Stock' with constant value (0).")

print("DataFrame after filling missing values:")
print(df_new)
print("\n")

print("Missing values after filling:")
print(df_new.isnull().sum())
print("\n")

# 4.3. Remove Duplicates:
print("--- Step 4.3: Remove Duplicates from New Dataset ---")

print("Identified duplicate rows before removal:")
print(df_new[df_new.duplicated(keep=False)])
print("\n")

initial_row_count_new = len(df_new)
df_new.drop_duplicates(inplace=True)
rows_after_removal_new = len(df_new)
duplicates_removed_new = initial_row_count_new - rows_after_removal_new

print(f"Removed {duplicates_removed_new} duplicate rows.")
print("DataFrame after removing duplicate rows:")
print(df_new)
print("\n")

# --- Conclusion ---
# The script demonstrated identifying and handling missing values
# and duplicate rows in pandas DataFrames, following the provided steps.
# It also showed a combined practice on a new dataset.
     
        
        
        
        
        
        
        

--- Step 1: Load the Data (Creating Sample Data) ---
Initial Sample DataFrame:
   ID     Name  Score      City    JoinDate
0   1    Alice   85.0  New York  2023-01-10
1   2      Bob   92.0    London  2023-01-11
2   3  Charlie    NaN     Paris  2023-01-11
3   4    David   78.0     Tokyo  2023-01-12
4   5      Eve   95.0    Sydney  2023-01-12
5   1    Alice   85.0  New York  2023-01-10
6   6    Frank   88.0    Berlin  2023-01-13
7   7    Grace   91.0    London  2023-01-14
8   8    Heidi    NaN     Paris  2023-01-14
9   9     Ivan   80.0     Tokyo  2023-01-15


--- Step 2.1: Identify Missing Values ---
Count of missing values per column:
ID          0
Name        0
Score       2
City        0
JoinDate    0
dtype: int64


Rows with any missing value:
   ID     Name  Score   City    JoinDate
2   3  Charlie    NaN  Paris  2023-01-11
8   8    Heidi    NaN  Paris  2023-01-14


--- Step 2.2: Fill Missing Values ---
Filled missing 'Score' values with the mean (86.75).
DataFrame after filling mis

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_initial['Score'].fillna(mean_score, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_new['Price'].fillna(median_price, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

In [4]:
# Part 2: Apply Standardization & Formatting Rules

#     Step-by-Step Guidelines:
# 1. Standardize Text Data
#     1. Convert All Names to Lowercase:
# 2. Format Numerical Data
#     1. Round Age Column to the Nearest Integer:
# 3. Combined Practice on Another Dataset
#     1. New Sample Data:
#     2. Standardize Product Names:
#     3. Format Prices to Two Decimal Places:
import pandas as pd
import numpy as np

# --- Part 2: Apply Standardization & Formatting Rules ---
# This script demonstrates how to standardize text data and format numerical data
# in a pandas DataFrame following the provided step-by-step guidelines.

# --- Step-by-Step Guidelines: ---

# 1. Standardize Text Data

# 1.1. Convert All Names to Lowercase:
print("--- Step 1.1: Convert All Names to Lowercase ---")

# Create a sample DataFrame with a Name column
data_names = {
    'PersonID': [1, 2, 3, 4, 5],
    'Name': ['Alice Smith', 'BOB JOHNSON', 'Charlie Brown', 'David Lee', 'Eve Wang'],
    'Score': [85, 92, 78, 95, 88]
}
df_names = pd.DataFrame(data_names)

print("Original DataFrame (Names):")
print(df_names)
print("\n")

# Convert the 'Name' column to lowercase
df_names['Name'] = df_names['Name'].str.lower()

print("DataFrame after converting 'Name' to lowercase:")
print(df_names)
print("\n")

# 2. Format Numerical Data

# 2.1. Round Age Column to the Nearest Integer:
print("--- Step 2.1: Round Age Column to the Nearest Integer ---")

# Create a sample DataFrame with an Age column (potentially with decimals)
data_age = {
    'UserID': [101, 102, 103, 104, 105],
    'Age': [25.3, 31.9, 45.1, 29.7, 34.5],
    'Height_cm': [165.5, 178.2, 159.8, 170.0, 168.1]
}
df_age = pd.DataFrame(data_age)

print("Original DataFrame (Age):")
print(df_age)
print("\n")

# Round the 'Age' column to the nearest integer
# Use .round(0) and then .astype(int) to convert to integer type
df_age['Age'] = df_age['Age'].round(0).astype(int)

print("DataFrame after rounding 'Age' to nearest integer:")
print(df_age)
print("\n")

# 3. Combined Practice on Another Dataset

# 3.1. New Sample Data:
print("--- Step 3.1: New Sample Data ---")

# Create a new sample DataFrame with Product Names and Prices
data_products = {
    'ProductID': [1001, 1002, 1003, 1004, 1005],
    'ProductName': ['Laptop Pro', 'Gaming Mouse', 'USB Keyboard', '4K Monitor', 'Webcam HD'], # Mixed casing
    'Price': [1250.756, 75.3, 29.999, 349.5, 55.0], # Varying decimal places
    'Stock': [10, 50, 15, 8, 25]
}
df_products = pd.DataFrame(data_products)

print("New Sample DataFrame (Products):")
print(df_products)
print("\n")

# 3.2. Standardize Product Names:
print("--- Step 3.2: Standardize Product Names ---")

# Convert the 'ProductName' column to lowercase
df_products['ProductName'] = df_products['ProductName'].str.lower()

print("DataFrame after standardizing 'ProductName' to lowercase:")
print(df_products)
print("\n")

# 3.3. Format Prices to Two Decimal Places:
print("--- Step 3.3: Format Prices to Two Decimal Places ---")

# Round the 'Price' column to two decimal places
# Use .round(2)
df_products['Price'] = df_products['Price'].round(2)

print("DataFrame after formatting 'Price' to two decimal places:")
print(df_products)
print("\n")

# --- Conclusion ---
# The script demonstrated standardizing text data by converting to lowercase
# and formatting numerical data by rounding to the nearest integer and
# to a specific number of decimal places, following the provided steps.
        
        
        
        

--- Step 1.1: Convert All Names to Lowercase ---
Original DataFrame (Names):
   PersonID           Name  Score
0         1    Alice Smith     85
1         2    BOB JOHNSON     92
2         3  Charlie Brown     78
3         4      David Lee     95
4         5       Eve Wang     88


DataFrame after converting 'Name' to lowercase:
   PersonID           Name  Score
0         1    alice smith     85
1         2    bob johnson     92
2         3  charlie brown     78
3         4      david lee     95
4         5       eve wang     88


--- Step 2.1: Round Age Column to the Nearest Integer ---
Original DataFrame (Age):
   UserID   Age  Height_cm
0     101  25.3      165.5
1     102  31.9      178.2
2     103  45.1      159.8
3     104  29.7      170.0
4     105  34.5      168.1


DataFrame after rounding 'Age' to nearest integer:
   UserID  Age  Height_cm
0     101   25      165.5
1     102   32      178.2
2     103   45      159.8
3     104   30      170.0
4     105   34      168.1


--- St