In [43]:
import pandas as pd

ussc_df = pd.read_csv('../data/cleaned_data_phase2.csv')

# Step 1: Confirm Data Integrity
print("Dataset shape:", ussc_df.shape)
print("Sample rows:")
# print(ussc_df.head())

# Step 2: Remove Duplicates
before_dups = ussc_df.shape[0]
ussc_df.drop_duplicates(inplace=True)
after_dups = ussc_df.shape[0]
# print(f"Dropped {before_dups - after_dups} duplicate rows.")

# Step 3: Check Data Types
print("\nData Types:")
print(ussc_df.dtypes)

# Step 4: Validate Unique Values in Key Categorical Features
categorical_features = ['NEWRACE', 'MONSEX', 'EDUCATN', 'CITIZEN', 'ZONE', "SENTMON", "DISPOSIT"]
for feature in categorical_features:
    print(f"\nUnique values in {feature}:")
    print(ussc_df[feature].unique())

# Step 5: Check for Missing Data
missing_counts = ussc_df.isna().sum().sort_values(ascending=False)
# print("\nMissing Values per Feature:")
# print(missing_counts)

# Step 6: Save the Cleaned Dataset After Sub-Phase 3.1
# This ensures we can revisit this intermediate state if needed
output_file = "cleaned_data_after_subphase_3_1.csv"
ussc_df.to_csv(output_file, index=False)
print(f"\nIntermediate dataset saved to: {output_file}")

Dataset shape: (76538, 16)
Sample rows:

Data Types:
AGE         float64
NEWRACE      object
MONSEX       object
EDUCATN      object
DISTRICT     object
CIRCDIST     object
CRIMHIST     object
SENTYR      float64
CITIZEN      object
CITWHERE     object
NUMDEPEN     object
CRIMLIV      object
SENTMON      object
ZONE         object
DISPOSIT     object
SENTTOT     float64
dtype: object

Unique values in NEWRACE:
['Hispanic' 'White' 'Black' 'Other' 'American Indian or Alaskan Native'
 'Asian or Pacific Islander' nan]

Unique values in MONSEX:
['Male' 'Female' nan]

Unique values in EDUCATN:
['Six years of school completed' 'High school graduate'
 'Some trade or vocational school' 'Nine years of school completed'
 'Some college' 'College graduate' 'Eleven years of school completed'
 'G.E.D. (general education diploma)' nan 'One year of school completed'
 'Trade or vocational degree' 'Middle school / junior high'
 'Ten years of school completed' 'Four years of school completed'
 'Some high 

In [44]:
# Load the intermediate dataset from Sub-Phase 3.1
file_path = "cleaned_data_after_subphase_3_1.csv"
ussc_df = pd.read_csv(file_path)

# Step 1: Check for Missing Values Again
missing_counts = ussc_df.isna().sum()
print("\nMissing Values per Feature:")
print(missing_counts)

# Step 2: Preprocessing Specific Features
# Convert "No Dependents" in NUMDEPEN to 0 and ensure numeric type
ussc_df['NUMDEPEN'] = ussc_df['NUMDEPEN'].replace("No dependents", 0).astype(float)

# Convert CRIMHIST to binary values: 1 for "Yes, there is a criminal history", 0 otherwise
ussc_df['CRIMHIST'] = ussc_df['CRIMHIST'].apply(lambda x: 1 if x == "Yes, there is a criminal history" else 0)

# Remove extreme outliers in SENTOT
ussc_df['SENTTOT'] = ussc_df['SENTTOT'].apply(lambda x: x if x < 2000 else 2000)
print(ussc_df["SENTTOT"].describe())

print("\nData Types:")
print(ussc_df.dtypes)

# Step 3: Handling Missing Values
# Define strategies for missing data
# Numerical Features: Fill with median
numerical_features = ['AGE', 'NUMDEPEN', 'CRIMHIST', 'SENTYR', "SENTTOT"]
for feature in numerical_features:
    median_value = ussc_df[feature].median()
    ussc_df.fillna({feature: median_value}, inplace=True)
    print(f"Filled missing values in {feature} with median: {median_value}")

# Categorical Features: Fill with "Unknown"
categorical_features = ['NEWRACE', 'MONSEX', 'EDUCATN', 'CITIZEN', 'CITWHERE', 'ZONE', 'DISTRICT', 'CIRCDIST', 'CRIMLIV', 'SENTMON']
for feature in categorical_features:
    ussc_df.fillna({feature : "Unknown"}, inplace=True)
    # print(f"Filled missing values in {feature} with 'Unknown'")

# Step 3: Validate No Missing Values Remain
missing_counts_after = ussc_df.isna().sum()
print("\nMissing Values After Handling:")
print(missing_counts_after)
print(ussc_df["AGE"].isna().sum())

# Step 4: Save the Dataset After Handling Missing Values
output_file = "cleaned_data_after_subphase_3_2.csv"
ussc_df.to_csv(output_file, index=False)
print(f"\nDataset after handling missing values saved to: {output_file}")


Missing Values per Feature:
AGE           44
NEWRACE      794
MONSEX       104
EDUCATN     8296
DISTRICT       0
CIRCDIST       0
CRIMHIST    1968
SENTYR         0
CITIZEN      415
CITWHERE    1352
NUMDEPEN    8317
CRIMLIV     4994
SENTMON        0
ZONE        1145
DISPOSIT       0
SENTTOT     6336
dtype: int64
count    76314.000000
mean       207.880397
std        543.185412
min          0.030000
25%          6.000000
50%         24.000000
75%         80.000000
max       2000.000000
Name: SENTTOT, dtype: float64

Data Types:
AGE         float64
NEWRACE      object
MONSEX       object
EDUCATN      object
DISTRICT     object
CIRCDIST     object
CRIMHIST      int64
SENTYR      float64
CITIZEN      object
CITWHERE     object
NUMDEPEN    float64
CRIMLIV      object
SENTMON      object
ZONE         object
DISPOSIT     object
SENTTOT     float64
dtype: object
Filled missing values in AGE with median: 35.0
Filled missing values in NUMDEPEN with median: 1.0
Filled missing values in CRIMHIST w

In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --------------------------------------------------
# 1. Load and Inspect the Dataset
# --------------------------------------------------
file_path = "cleaned_data_after_subphase_3_2.csv"
ussc_df = pd.read_csv(file_path)

print("Initial checks on AGE:")
print("Missing in AGE:", ussc_df["AGE"].isna().sum())
print("AGE describe:\n", ussc_df['AGE'].describe())

# --------------------------------------------------
# 2. Bin AGE Before Scaling
# --------------------------------------------------
# Define age bins based on actual age values
age_bins = [0, 25, 35, 45, np.inf]
age_labels = ['<25', '25-35', '35-45', '45+']

# Create 'AGE_BIN' from the raw (unscaled) AGE
ussc_df['AGE_BIN'] = pd.cut(
    ussc_df['AGE'],
    bins=age_bins,
    labels=age_labels,
    right=False,       # intervals like [0,25), [25,35), etc.
    include_lowest=True
)
print("Created 'AGE_BIN' feature. Unique bins:\n", ussc_df["AGE_BIN"].value_counts(dropna=False))

# --------------------------------------------------
# 3. Standardize Other Numeric Features
#    Exclude 'AGE' from scaling if you want to preserve the raw age in the dataset
# --------------------------------------------------
# If you still want to scale AGE for some analysis, store the raw in another column:
# ussc_df['AGE_RAW'] = ussc_df['AGE']
# Then scale 'AGE'
numerical_features = ['AGE', 'NUMDEPEN', 'CRIMHIST', 'SENTYR', "SENTTOT"]
scaler = StandardScaler()
ussc_df[numerical_features] = scaler.fit_transform(ussc_df[numerical_features])
print("Standardized numerical features (including AGE).")

# --------------------------------------------------
# 4. Create Interaction Features
# --------------------------------------------------
# Example: Race + Citizenship
ussc_df['RACE_CITIZEN'] = ussc_df['NEWRACE'] + "_" + ussc_df['CITIZEN']
print("Created 'RACE_CITIZEN' feature.")

print("\nDescribe disposit")
print(ussc_df.dtypes)
print(ussc_df["DISPOSIT"].value_counts(dropna=False))

ussc_df.to_csv("../data/cleaned_data_phase3_unencoded_DISPOSIT.csv", index=False)
ussc_df.drop(columns=['DISPOSIT', "SENTTOT"], inplace=True)

# --------------------------------------------------
# 5. Encode Categorical Features
# --------------------------------------------------
categorical_features = [
    'NEWRACE', 'MONSEX', 'EDUCATN',
    'CITIZEN', 'CITWHERE', 'ZONE', 'DISTRICT',
    'CIRCDIST', 'CRIMLIV', 'AGE_BIN', 'RACE_CITIZEN', 'SENTMON'
]

encoded_df = pd.get_dummies(ussc_df, columns=categorical_features, drop_first=True)
print("\nApplied one-hot encoding to categorical features.")

# --------------------------------------------------
# 6. Final Verification
# --------------------------------------------------
missing_counts_final = encoded_df.isna().sum()
print("\nMissing Values After Handling:")
print(missing_counts_final[missing_counts_final > 0])

print("\nCheck AGE_BIN distribution (no NaNs expected):")
print(ussc_df["AGE_BIN"].value_counts(dropna=False))

# # --------------------------------------------------
# # 7. Save the Transformed Dataset
# # --------------------------------------------------
# output_file = "../data/cleaned_data_phase3.csv"
# encoded_df.to_csv(output_file, index=False)
# print(f"\nDataset after feature transformation saved to: {output_file}")


Initial checks on AGE:
Missing in AGE: 0
AGE describe:
 count    76314.000000
mean        36.394633
std         10.935215
min         16.000000
25%         28.000000
50%         35.000000
75%         43.000000
max         86.000000
Name: AGE, dtype: float64
Created 'AGE_BIN' feature. Unique bins:
 AGE_BIN
25-35    27526
35-45    22707
45+      16320
<25       9761
Name: count, dtype: int64
Standardized numerical features (including AGE).
Created 'RACE_CITIZEN' feature.

Describe disposit
AGE              float64
NEWRACE           object
MONSEX            object
EDUCATN           object
DISTRICT          object
CIRCDIST          object
CRIMHIST         float64
SENTYR           float64
CITIZEN           object
CITWHERE          object
NUMDEPEN         float64
CRIMLIV           object
SENTMON           object
ZONE              object
DISPOSIT          object
SENTTOT          float64
AGE_BIN         category
RACE_CITIZEN      object
dtype: object
DISPOSIT
Guilty plea                       