In [1]:
# Task 1: Load a CSV Dataset
# Description: Load a CSV file into a Pandas DataFrame and print the first five rows to understand the structure of the dataset.

import pandas as pd

# Read the dataset from disk; the path is relative to the notebook's
# working directory, so the file must exist there.
df = pd.read_csv('your_dataset.csv')

# Preview the leading rows (head() returns five rows by default)
preview = df.head()
print(preview)




FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

In [None]:
# Task 2: Check for Missing Values
# Description: Identify and list the columns with missing values and the number of missing values in each.

import pandas as pd

# Re-read the dataset so this cell can run on its own
df = pd.read_csv('your_dataset.csv')

# Count nulls per column, then keep only the columns that actually
# contain at least one missing entry.
missing_values = df.isna().sum().loc[lambda counts: counts > 0]

# Show column name -> number of missing values
print(missing_values)


In [None]:
# Task 3: Visualize Missing Data
# Description: Use a heatmap to visualize the missing values in the dataset.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Re-read the dataset so this cell can run on its own
df = pd.read_csv('your_dataset.csv')

# Draw the null mask on an explicit 10x6 figure/axes pair: each cell of the
# heatmap is one (row, column) entry, coloured by whether it is missing.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    df.isnull(),
    cbar=False,
    cmap='viridis',
    yticklabels=False,
    ax=ax,
)

# Label and render the figure
ax.set_title('Missing Data Heatmap')
plt.show()




In [None]:
# Task 4: Remove Columns with Many Missing Values
# Description: Drop columns that have more than 50% missing values.

import pandas as pd

# Re-read the dataset so this cell can run on its own
df = pd.read_csv('your_dataset.csv')

# `thresh` is the minimum number of NON-null values a column needs in order
# to be kept, so half the row count keeps every column that is at most
# 50% missing and drops the rest.
threshold = len(df) * 0.5

# Build a new frame without the sparse columns; `df` itself is left intact
df_cleaned = df.dropna(axis='columns', thresh=threshold)

# Show the result
print(df_cleaned)



In [None]:
# Task 5: Identify Duplicate Rows
# Description: Check for and display any duplicate rows in the dataset.

import pandas as pd

# Re-read the dataset so this cell can run on its own
df = pd.read_csv('your_dataset.csv')

# duplicated() flags every repeat of a row except its first occurrence;
# use that boolean mask to pull out the repeated rows.
is_repeat = df.duplicated()
duplicates = df.loc[is_repeat]

# Show the repeated rows (an empty frame means there are none)
print(duplicates)



In [None]:
# Task 6: Remove Duplicate Rows
# Description: Remove duplicate rows from the dataset and verify that they have been removed.
import pandas as pd

# Re-read the dataset so this cell can run on its own
df = pd.read_csv('your_dataset.csv')

# Keep the first occurrence of every row and discard later repeats;
# the original `df` is not modified.
df_cleaned = df.drop_duplicates()

print("Duplicate rows removed. Here's the cleaned DataFrame:")
print(df_cleaned)

# Sanity check: re-scan the cleaned frame for any repeats that survived
duplicates_remaining = df_cleaned.loc[df_cleaned.duplicated()]
if not duplicates_remaining.empty:
    print("Some duplicates remain:")
    print(duplicates_remaining)
else:
    print("No duplicate rows remain.")




In [None]:
# Task 7: Check Data Inconsistencies
# Description: Identify inconsistencies in categorical columns, such as differing text cases or trailing spaces.
import pandas as pd

# Load the CSV file into a DataFrame (if not already loaded)
df = pd.read_csv('your_dataset.csv')

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns


def _normalize_text(value):
    """Return `value` stripped and lower-cased if it is a string, else unchanged.

    Going element by element (instead of through the `.str` accessor) keeps
    non-string entries of a mixed object column intact; `.str` methods would
    replace them with NaN.
    """
    return value.strip().lower() if isinstance(value, str) else value


# Report each categorical column before and after normalization
for col in categorical_columns:
    # Capture the raw values BEFORE any cleaning, so that leading/trailing
    # whitespace variants (e.g. 'apple ' vs 'apple') are visible in the report.
    unique_values = df[col].unique()
    print(f"Inconsistencies in column '{col}':")
    print(f"Unique values (before normalization): {unique_values}")

    # Normalize: strip surrounding whitespace and convert to lowercase
    df[col] = df[col].map(_normalize_text)

    # Check unique values after normalization
    normalized_unique_values = df[col].unique()
    print(f"Unique values (after normalization): {normalized_unique_values}")
    print()




In [None]:
# Task 8: Get Summary of Data Quality
# Description: Generate a summary of data quality including total records, number of duplicate rows, and columns with missing values.


import pandas as pd

# Re-read the dataset so this cell can run on its own
df = pd.read_csv('your_dataset.csv')

# Row count of the raw dataset
total_records = df.shape[0]

# Rows that repeat an earlier row (first occurrences are not counted)
duplicate_rows = df.duplicated().sum()

# Per-column null counts, narrowed to columns with at least one gap
missing_values = df.isna().sum()
columns_with_missing = missing_values.loc[missing_values.gt(0)]

# Emit the summary: headline figures first, then the per-column breakdown
print(
    "Data Quality Summary:",
    f"Total records: {total_records}",
    f"Number of duplicate rows: {duplicate_rows}",
    "Columns with missing values:",
    sep="\n",
)
print(columns_with_missing)


In [None]:
# Task 9: Generate a Data Quality Report
# Description: Create a comprehensive data quality report that includes not only missing values but also basic statistics for numerical columns and the distribution of categorical columns.


import pandas as pd

# Re-read the dataset so this cell can run on its own
df = pd.read_csv('your_dataset.csv')

# --- Gather the ingredients of the report ---

# Size of the dataset and per-column null counts (only columns with gaps)
total_records = len(df)
missing_values = df.isna().sum()
columns_with_missing = missing_values.loc[missing_values > 0]

# Summary statistics as produced by DataFrame.describe()
numerical_stats = df.describe()

# Value frequencies for every object-typed column
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_distribution = {
    name: df[name].value_counts() for name in categorical_columns
}

# --- Assemble everything into a single dictionary ---
report = {
    'Total Records': total_records,
    'Columns with Missing Values': columns_with_missing.to_dict(),
    'Numerical Statistics': numerical_stats,
    'Categorical Column Distributions': categorical_distribution,
}

# --- Render the report section by section ---
print("Data Quality Report:\n")
print(f"Total Records: {report['Total Records']}\n")

print("Columns with Missing Values:")
for column_name, n_missing in report['Columns with Missing Values'].items():
    print(f"{column_name}: {n_missing} missing values")
print()

print("Basic Statistics for Numerical Columns:")
print(report['Numerical Statistics'], end="\n\n")

print("Distribution of Categorical Columns:")
for column_name, counts in report['Categorical Column Distributions'].items():
    print(f"\n{column_name} Distribution:")
    print(counts)


In [None]:
# Task 10: Advanced Data Imputation
# Description: Perform advanced data imputation by replacing missing values in numerical columns with the mean and categorical columns with the mode.


import pandas as pd

# Load the CSV file into a DataFrame (if not already loaded)
df = pd.read_csv('your_dataset.csv')

# 1. Impute missing values in numerical columns with the mean.
# The filled column is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary selection, which
# triggers a FutureWarning on recent pandas and leaves `df` unchanged when
# Copy-on-Write is active.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    df[col] = df[col].fillna(df[col].mean())

# 2. Impute missing values in categorical columns with the mode (most frequent value)
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    modes = df[col].mode()
    # A column that is entirely missing has no mode; leave it as-is rather
    # than failing on an empty result.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Display the DataFrame after imputation
print("Data after advanced imputation:")
print(df.head())

# Optional: Verify that no missing values remain
# (columns that were entirely missing will still report non-zero counts)
missing_after_imputation = df.isnull().sum()
print("\nMissing values after imputation:")
print(missing_after_imputation)

