In [1]:
# Part 2: Remove Duplicates & Fix Data Types

# Task 1: Remove Duplicates
#     1. Load Extended Data:
#     2. Remove Duplicates:
        


# Task 2: Fix Data Types
#     1. Fix Incorrect Data Types:
        


# Task 3: Convert Data Type for Analysis
#     1. Convert Date Strings to DateTime:
import pandas as pd
import numpy as np

# --- Part 2: Remove Duplicates & Fix Data Types ---
# This script demonstrates how to remove duplicate rows and fix incorrect data types
# in a pandas DataFrame.

# --- Task 1: Remove Duplicates ---
# Builds a small order table that contains exact repeated rows, shows which
# rows are repeated, and then drops the repeats while reporting how many
# rows were removed.

# 1. Load Extended Data:
print("--- Task 1: Remove Duplicates ---")
print("--- 1. Load Extended Data (Creating Sample Data) ---")

# Sample order data. Rows 5 and 10 are exact copies of rows 0 and 2.
data_extended = {
    "OrderID": [101, 102, 103, 104, 105, 101, 106, 107, 108, 109, 103],
    "CustomerID": [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 3],
    "Product": ["A", "B", "C", "D", "E", "A", "F", "G", "H", "I", "C"],
    "Quantity": [1, 2, 1, 3, 1, 1, 2, 1, 3, 1, 1],
    "OrderDate": [
        "2023-01-10",
        "2023-01-11",
        "2023-01-11",
        "2023-01-12",
        "2023-01-12",
        "2023-01-10",
        "2023-01-13",
        "2023-01-14",
        "2023-01-14",
        "2023-01-15",
        "2023-01-11",
    ],
}
df_duplicates = pd.DataFrame(data_extended)

# A trailing "\n" argument plus sep="\n" reproduces the blank lines that a
# separate print("\n") call would emit.
print("Original DataFrame with duplicates:", df_duplicates, "\n", sep="\n")

# keep=False flags every member of a duplicate group, not just the later
# copies, so both the first occurrence and its repeats are displayed.
repeated_mask = df_duplicates.duplicated(keep=False)
print(
    "Identified duplicate rows (showing all occurrences):",
    df_duplicates.loc[repeated_mask],
    "\n",
    sep="\n",
)


# 2. Remove Duplicates:
print("--- 2. Remove Duplicates ---")

# Rows are compared across all columns; the first occurrence of each group
# is the one that survives.
df_no_duplicates = df_duplicates.drop_duplicates(keep="first")

initial_rows = df_duplicates.shape[0]
rows_after_removal = df_no_duplicates.shape[0]
duplicates_removed = initial_rows - rows_after_removal

print(f"Removed {duplicates_removed} duplicate rows.")
print("DataFrame after removing duplicate rows:", df_no_duplicates, "\n", sep="\n")


# --- Task 2: Fix Incorrect Data Types ---
# Builds a small inventory table whose columns were stored with the wrong
# types (numbers and booleans as text, whole-number ratings as floats) and
# converts each column to the type it should have.
print("--- Task 2: Fix Incorrect Data Types ---")

# Create a sample DataFrame with incorrect data types
data_types = {
    'ItemID': [1, 2, 3, 4, 5],
    'Price_Str': ['100.50', '75', '25.99', '300.00', '50.00'], # Price as string
    'Stock_Str': ['10', '50', '15', '8', '25'], # Stock as string
    'IsAvailable_Str': ['True', 'False', 'True', 'True', 'False'], # Boolean as string
    'Rating_Float': [4.0, 5.0, 3.0, 4.0, 5.0] # Rating as float, should be int
}
df_types = pd.DataFrame(data_types)

print("Original DataFrame with incorrect data types:")
print(df_types)
print("\n")

print("Original data types:")
print(df_types.dtypes)
print("\n")

# 1. Fix Incorrect Data Types:
print("--- 1. Fix Incorrect Data Types ---")

# Convert 'Price_Str' and 'Stock_Str' to numeric types.
# errors='coerce' turns any value that cannot be parsed into NaN instead of
# raising.
df_types['Price_Str'] = pd.to_numeric(df_types['Price_Str'], errors='coerce')
df_types['Stock_Str'] = pd.to_numeric(df_types['Stock_Str'], errors='coerce')

# Convert 'IsAvailable_Str' to boolean type.
# astype(bool) must NOT be used here: it tests string truthiness, so every
# non-empty string -- including 'False' -- would become True. Instead the
# text is normalised (whitespace/case) and mapped explicitly. The nullable
# 'boolean' dtype keeps unrecognised text as <NA>, mirroring the
# errors='coerce' behaviour used for the numeric columns above.
bool_lookup = {'true': True, 'false': False}
df_types['IsAvailable_Str'] = (
    df_types['IsAvailable_Str']
    .str.strip()
    .str.lower()
    .map(bool_lookup)
    .astype('boolean')
)

# Convert 'Rating_Float' to integer type.
# astype(int) truncates toward zero; the sample ratings are whole numbers,
# so no information is lost. Use .round() first if fractional values are
# possible.
df_types['Rating_Float'] = df_types['Rating_Float'].astype(int)


print("DataFrame after fixing incorrect data types:")
print(df_types)
print("\n")

print("Data types after fixing:")
print(df_types.dtypes)
print("\n")


# --- Task 3: Convert Data Type for Analysis ---
# Builds a small event log whose timestamps are stored as text and converts
# that column to a real datetime type.
print("--- Task 3: Convert Data Type for Analysis ---")

# Sample event log; 'Timestamp_Str' holds date/time values as plain strings.
data_dates = {
    "EventID": [1, 2, 3, 4, 5],
    "EventName": ["Start", "Process", "Analyze", "Report", "Archive"],
    "Timestamp_Str": [
        "2023-01-10 10:00:00",
        "2023-01-10 10:30:00",
        "2023-01-10 11:00:00",
        "2023-01-10 11:45:00",
        "2023-01-10 12:00:00",
    ],
    "Value": [100, 150, 120, 180, 90],
}
df_dates = pd.DataFrame(data_dates)

# A trailing "\n" argument plus sep="\n" reproduces the blank lines that a
# separate print("\n") call would emit.
print("Original DataFrame with date strings:", df_dates, "\n", sep="\n")
print("Original data types:", df_dates.dtypes, "\n", sep="\n")

# 1. Convert Date Strings to DateTime:
print("--- 1. Convert Date Strings to DateTime ---")

# errors="coerce" replaces any string that cannot be parsed with NaT
# (Not a Time) rather than raising.
parsed_timestamps = pd.to_datetime(df_dates["Timestamp_Str"], errors="coerce")
df_dates["Timestamp_Str"] = parsed_timestamps

print("DataFrame after converting date strings to DateTime:", df_dates, "\n", sep="\n")
print("Data types after conversion:", df_dates.dtypes, "\n", sep="\n")


# --- Conclusion ---
# The script demonstrated removing duplicate rows, fixing incorrect data types
# (string to numeric, string to boolean, float to int), and converting date
# strings to datetime objects using pandas functions like drop_duplicates(),
# astype(), pd.to_numeric(), and pd.to_datetime().





--- Task 1: Remove Duplicates ---
--- 1. Load Extended Data (Creating Sample Data) ---
Original DataFrame with duplicates:
    OrderID  CustomerID Product  Quantity   OrderDate
0       101           1       A         1  2023-01-10
1       102           2       B         2  2023-01-11
2       103           3       C         1  2023-01-11
3       104           4       D         3  2023-01-12
4       105           5       E         1  2023-01-12
5       101           1       A         1  2023-01-10
6       106           6       F         2  2023-01-13
7       107           7       G         1  2023-01-14
8       108           8       H         3  2023-01-14
9       109           9       I         1  2023-01-15
10      103           3       C         1  2023-01-11


Identified duplicate rows (showing all occurrences):
    OrderID  CustomerID Product  Quantity   OrderDate
0       101           1       A         1  2023-01-10
2       103           3       C         1  2023-01-11
5       101 