In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Census.csv")

# Check for missing data
missing_data = df.isnull().sum()

# Print the missing data count for each column
print("Missing data counts:")
print(missing_data)


Missing data counts:
age               0
workclass         0
final-weight      0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loos      0
hour-per-week     0
native-country    0
income            0
dtype: int64


In [2]:
# List of categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Check for "?" in categorical columns
question_mark_present = df[categorical_columns].apply(lambda x: x.str.contains('\?')).any()

# Print columns with "?" present
print("Columns with '?' present:")
print(question_mark_present[question_mark_present].index.tolist())


Columns with '?' present:
['workclass', 'occupation', 'native-country']


In [3]:
# List of categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Check for "?" in categorical columns and count occurrences
question_mark_count = df[categorical_columns].apply(lambda x: x.str.contains('\?')).sum()

# Print columns with "?" present and their corresponding counts
print("Columns with '?' present and their counts:")
for column, count in question_mark_count.items():
    if count > 0:
        print(f"{column}: {count}")


Columns with '?' present and their counts:
workclass: 1836
occupation: 1843
native-country: 583


In [4]:
# List of categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Replace "?" with NaN in categorical columns
df[categorical_columns] = df[categorical_columns].replace('?', pd.NA)

# Count null values in categorical columns
null_count = df[categorical_columns].isnull().sum()

# Print the count of null values
print("Count of null values in categorical columns:")
print(null_count)


Count of null values in categorical columns:
workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
native-country    0
income            0
dtype: int64


In [5]:
# List of categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Replace "?" with NaN in categorical columns
df[categorical_columns] = df[categorical_columns].replace(' ?', pd.NA)

# Count null values in categorical columns
null_count = df[categorical_columns].isnull().sum()

# Print the count of null values
print("Count of null values in categorical columns:")
print(null_count)

Count of null values in categorical columns:
workclass         1836
education            0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
native-country     583
income               0
dtype: int64


In [6]:
# Identify columns with missing data
missing_columns = df.columns[df.isnull().any()]

# Get the data types of columns with missing data
missing_column_types = df[missing_columns].dtypes

# Print the types of columns with missing data
print("Types of columns with missing data:")
print(missing_column_types)


Types of columns with missing data:
workclass         object
occupation        object
native-country    object
dtype: object


In [7]:
# Identify categorical columns with missing data
categorical_columns_with_missing = df.select_dtypes(include=['object']).columns[df.select_dtypes(include=['object']).isnull().any()]

# Dictionary to store the count of replacements per column
replacement_counts = {}

# Replace missing categorical values with mode and count replacements
for column in categorical_columns_with_missing:
    mode_value = df[column].mode()[0]  # Calculate mode
    replacement_count = df[column].isnull().sum()  # Count missing values before replacement
    replaced_value = mode_value
    df[column].fillna(mode_value, inplace=True)  # Replace missing values with mode
    replacement_counts[column] = replacement_count  # Store the count of replacements
    print(f"Column: {column}, Mode value to be replaced: {replaced_value}")

# Print the number of replacements per column
print("\nNumber of replacements per column:")
for column, count in replacement_counts.items():
    print(f"{column}: {count}")

# Verify the changes
print("\nDataset after mode imputation:")
print(df.head())


Column: workclass, Mode value to be replaced:  Private
Column: occupation, Mode value to be replaced:  Prof-specialty
Column: native-country, Mode value to be replaced:  United-States

Number of replacements per column:
workclass: 1836
occupation: 1843
native-country: 583

Dataset after mode imputation:
   age          workclass  final-weight   education  education-num  \
0   39          State-gov         77516   Bachelors             13   
1   50   Self-emp-not-inc         83311   Bachelors             13   
2   38            Private        215646     HS-grad              9   
3   53            Private        234721        11th              7   
4   28            Private        338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cle

In [8]:
# Define a function to detect outliers using the IQR method
def detect_outliers_iqr(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (column < lower_bound) | (column > upper_bound)
    return outliers

# Iterate over each column in the DataFrame
for column in df.columns:
    if df[column].dtype != 'object':  # Exclude non-numeric columns
        outliers = detect_outliers_iqr(df[column])
        if outliers.any():  # Check if there are any outliers in the column
            print(f"Column '{column}' has outliers:")
            print(df[outliers][column])
            print()
        else:
            print(f"No outliers found in column '{column}'.\n")
    else:
        print(f"Skipping column '{column}' as it is non-numeric.\n")


Column 'age' has outliers:
74       79
222      90
430      80
918      81
1040     90
         ..
32277    90
32367    90
32459    85
32494    82
32525    81
Name: age, Length: 143, dtype: int64

Skipping column 'workclass' as it is non-numeric.

Column 'final-weight' has outliers:
37       544091
40       507875
80       446839
110      432376
157      494223
          ...  
32306    427686
32329    435842
32470    485710
32496    436163
32511    514716
Name: final-weight, Length: 992, dtype: int64

Skipping column 'education' as it is non-numeric.

Column 'education-num' has outliers:
15       4
56       3
61       4
79       4
160      2
        ..
32431    4
32432    1
32448    4
32479    4
32517    4
Name: education-num, Length: 1198, dtype: int64

Skipping column 'marital-status' as it is non-numeric.

Skipping column 'occupation' as it is non-numeric.

Skipping column 'relationship' as it is non-numeric.

Skipping column 'race' as it is non-numeric.

Skipping column 'sex' as it

In [9]:
# Define a function to detect outliers using the IQR method
def detect_outliers_iqr(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (column < lower_bound) | (column > upper_bound)
    return outliers

# Iterate over each column in the DataFrame
for column in df.columns:
    if df[column].dtype != 'object':  # Exclude non-numeric columns
        outliers = detect_outliers_iqr(df[column])
        num_outliers = outliers.sum()
        if num_outliers > 0:  # Check if there are any outliers in the column
            print(f"Column '{column}' has {num_outliers} outliers:")
            print(df[outliers][column])
            print()
        else:
            print(f"No outliers found in column '{column}'.\n")
    else:
        print(f"Skipping column '{column}' as it is non-numeric.\n")


Column 'age' has 143 outliers:
74       79
222      90
430      80
918      81
1040     90
         ..
32277    90
32367    90
32459    85
32494    82
32525    81
Name: age, Length: 143, dtype: int64

Skipping column 'workclass' as it is non-numeric.

Column 'final-weight' has 992 outliers:
37       544091
40       507875
80       446839
110      432376
157      494223
          ...  
32306    427686
32329    435842
32470    485710
32496    436163
32511    514716
Name: final-weight, Length: 992, dtype: int64

Skipping column 'education' as it is non-numeric.

Column 'education-num' has 1198 outliers:
15       4
56       3
61       4
79       4
160      2
        ..
32431    4
32432    1
32448    4
32479    4
32517    4
Name: education-num, Length: 1198, dtype: int64

Skipping column 'marital-status' as it is non-numeric.

Skipping column 'occupation' as it is non-numeric.

Skipping column 'relationship' as it is non-numeric.

Skipping column 'race' as it is non-numeric.

Skipping colum