In [900]:
# imports

import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [901]:
# load data file located in compass-analysis folder

data_file_path = "../compas-analysis/compas-scores-two-years.csv"
df = pd.read_csv(data_file_path)

print(f'Shape fo dataset: {df.shape}')

Shape fo dataset: (7214, 53)


In [902]:
# Features to isolate
features_to_isolate = [
    'age',                   # Current age
    'priors_count',          # Total prior offenses
    'juv_fel_count',         # Juvenile felony offenses
    'juv_misd_count',        # Juvenile misdemeanors
    'juv_other_count',       # Other juvenile offenses
    # 'violent_recid',         # Indicator of violent recidivism - removed as all data is null
    'days_b_screening_arrest', # Days between arrest and screening
    'is_recid',              # General recidivism indicator (optional for context)
    'c_charge_desc'          # Charge description (optional for text analysis)
]

# Isolate the features into a new dataset
df_isolated = df[features_to_isolate]

print(f'Shape of dataset: {df_isolated.shape}')

# Save
df_isolated.to_csv("isolated_features.csv", index=False)

Shape of dataset: (7214, 8)


## Missing Values Handling

In [903]:
# Check for missing values
print("Missing Values:\n", df_isolated.isnull().sum())

# Drop rows with missing values (if critical features have missing data)
df_isolated = df_isolated.dropna()

# fill with median
# df_isolated.fillna(df_isolated.median(), inplace=True)

# Verify dataset after cleaning
print("Dataset after Cleaning:\n", df_isolated.info())

print(f'Shape of dataset: {df_isolated.shape}')


Missing Values:
 age                          0
priors_count                 0
juv_fel_count                0
juv_misd_count               0
juv_other_count              0
days_b_screening_arrest    307
is_recid                     0
c_charge_desc               29
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 6900 entries, 0 to 7213
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      6900 non-null   int64  
 1   priors_count             6900 non-null   int64  
 2   juv_fel_count            6900 non-null   int64  
 3   juv_misd_count           6900 non-null   int64  
 4   juv_other_count          6900 non-null   int64  
 5   days_b_screening_arrest  6900 non-null   float64
 6   is_recid                 6900 non-null   int64  
 7   c_charge_desc            6900 non-null   object 
dtypes: float64(1), int64(6), object(1)
memory usage: 485.2+ KB
Dataset after Cle

## Take care of categorical features

In [904]:
categorical_columns = df_isolated.select_dtypes(include=['object']).columns.tolist()

print("Categorical features in this dataset:\n", categorical_columns)

for categorical_column in categorical_columns:
    print(f'For the category: {categorical_column}')
    print(df_isolated[categorical_column].unique())



Categorical features in this dataset:
 ['c_charge_desc']
For the category: c_charge_desc
['Aggravated Assault w/Firearm' 'Felony Battery w/Prior Convict'
 'Possession of Cocaine' 'Battery' 'Possession Burglary Tools'
 'arrest case no charge' 'Insurance Fraud' 'Poss 3,4 MDMA (Ecstasy)'
 'Poss3,4 Methylenedioxymethcath' 'Felony Driving While Lic Suspd'
 'Grand Theft in the 3rd Degree' 'Driving While License Revoked'
 'Possession Of Heroin' 'Battery on Law Enforc Officer'
 'Possession Of Methamphetamine' 'Introduce Contraband Into Jail'
 'Lewd/Lasc Battery Pers 12+/<16' 'Susp Drivers Lic 1st Offense'
 'Carrying Concealed Firearm' 'Pos Cannabis W/Intent Sel/Del'
 'Tampering With Physical Evidence' 'Att Tamper w/Physical Evidence'
 'Agg Fleeing and Eluding' 'Operating W/O Valid License'
 'Poss Wep Conv Felon' 'Possess Cannabis/20 Grams Or Less'
 'Unlaw Use False Name/Identity' 'Viol Injunct Domestic Violence'
 'Defrauding Innkeeper $300/More' 'Uttering a Forged Instrument'
 'DUI Level 0.15 

In [905]:
# Define 10 categories for crime types
crime_types = {
    'charge-theft': ['Burglary', 'Theft', 'Robbery', 'Burgl', 'Burg', 'Carjacking', 'Stolen'],
    'charge-intoxication': ['Possession', 'Possess', 'Poss', 'Cannabis', 'Alcoholic', 'Control Subst', 'Bev In Pub', 'Methylenediox', 'Cocaine'],
    'charge-violent': ['Homicide', 'Manslaughter', 'Battery', 'Assault', 'Abuse', 'Stalking', 'Aggress', 'Mischief', 'Cruelty', 'Resist', 'Disorderly'],
    'charge-domestic': ['Domestic Violence', 'Neglect Child'],
    'charge-fraud': ['Fraud', 'Forge', 'Tamper', 'Eluding', 'False Name', 'Worthless Check', 'Bribery', 'Corrupt', 'Crim Use of Personal ID Info', 'False Ownership Info'],
    'charge-arson': ['Arson'],
    'charge-driving': ['DUI', 'Driving', 'Drivers', 'D.U.I'],
    'charge-weapons': ['Weapons', 'Firearm', 'Explosives', 'Weapon'],
    'charge-morality': ['Indecent Exposure', 'Loitering', 'Prostitution', 'Voyeurism', 'Pornography'],
}



for crime_category, crime_substrings in crime_types.items():

    # substrings joined with OR operator
    pattern = '|'.join(crime_substrings)

    # create the col for the crime category
    df_isolated[crime_category] = df_isolated['c_charge_desc'].str.contains(pattern, case=False, na=False)
    
    # copy the description in copy desc colum
    df_isolated.loc[df_isolated[crime_category], 'c_charge_desc_copy'] = df_isolated['c_charge_desc']
    
    # erase the original description
    df_isolated['c_charge_desc'] = df_isolated['c_charge_desc'].where(~df_isolated[crime_category], other=None)


others_category = 'charge-others'
df_isolated[others_category] = ~(df_isolated['c_charge_desc'].isnull())
df_isolated.loc[df_isolated[others_category], 'c_charge_desc_copy'] = df_isolated['c_charge_desc']
df_isolated['c_charge_desc'] = df_isolated['c_charge_desc'].where(~df_isolated[others_category], other=None)


for crime_cat_col, _ in crime_types.items():
    print(f'Number for category {crime_cat_col} is {df_isolated[crime_cat_col].sum()}')

print(f'Number for category {others_category} is {df_isolated[others_category].sum()}')




Number for category charge-theft is 1064
Number for category charge-intoxication is 1270
Number for category charge-violent is 1979
Number for category charge-domestic is 76
Number for category charge-fraud is 257
Number for category charge-arson is 2
Number for category charge-driving is 709
Number for category charge-weapons is 44
Number for category charge-morality is 35
Number for category charge-others is 1464


In [906]:
# Define expanded crime categories to cover 99% of data
crime_types = {
    # Theft-related crimes
    'Burglary': 'charge-theft',
    'Theft': 'charge-theft',
    'Robbery': 'charge-theft',
    'Shoplifting': 'charge-theft',
    'Burg': 'charge-theft',
    'Larceny': 'charge-theft',
    'Stolen': 'charge-theft',
    'Auto': 'charge-theft',
    'Vehicle': 'charge-theft',

    # Drug-related crimes
    'Possession': 'charge-drugs',
    'Possess': 'charge-drugs',
    'Poss': 'charge-drugs',
    'Cannabis': 'charge-drugs',
    'Marijuana': 'charge-drugs',
    'Meth': 'charge-drugs',
    'Cocaine': 'charge-drugs',
    'Heroin': 'charge-drugs',
    'Alcoholic': 'charge-drugs',
    'Control Subst': 'charge-drugs',
    'Drug': 'charge-drugs',
    'Narcotics': 'charge-drugs',

    # Violent crimes
    'Battery': 'charge-violent',
    'Batt': 'charge-violent',
    'Assault': 'charge-violent',
    'Abuse': 'charge-violent',
    'Aggress': 'charge-violent',
    'Stalking': 'charge-violent',
    'Fight': 'charge-violent',
    'Threat': 'charge-violent',
    'Harm': 'charge-violent',
    
    # Domestic-related crimes
    'Domestic Violence': 'charge-domestic',
    'Child Abuse': 'charge-domestic',
    'Spouse': 'charge-domestic',

    # Financial crimes
    'Fraud': 'charge-financial',
    'Forge': 'charge-financial',
    'Tamper': 'charge-financial',
    'Eluding': 'charge-financial',
    'Unlicensed': 'charge-financial',
    'False Name': 'charge-financial',
    'Identity': 'charge-financial',
    'Credit': 'charge-financial',
    'Embezzle': 'charge-financial',

    # Property-related crimes
    'Arson': 'charge-property',
    'Vandalism': 'charge-property',
    'Trespass': 'charge-property',
    'Damage': 'charge-property',
    'Burgl': 'charge-property',
    'Looting': 'charge-property',

    # Homicide-related crimes
    'Homicide': 'charge-homicide',
    'Manslaughter': 'charge-homicide',
    'Murder': 'charge-homicide',

    # Traffic-related crimes
    'DUI': 'charge-traffic',
    'D.U.I.': 'charge-traffic',
    'Driving': 'charge-traffic',
    'Drivg': 'charge-traffic',
    'Drivers': 'charge-traffic',
    'Reckless': 'charge-traffic',
    'Speeding': 'charge-traffic',
    'Suspended': 'charge-traffic',
    'License': 'charge-traffic',

    # Weapons-related crimes
    'Weapons': 'charge-weapons',
    'Firearm': 'charge-weapons',
    'Explosives': 'charge-weapons',
    'Gun': 'charge-weapons',
    'Knife': 'charge-weapons',

    # Morality-related crimes
    'Indecent Exposure': 'charge-morality',
    'Loitering': 'charge-morality',
    'Prostitution': 'charge-morality',
    'Voyeur': 'charge-morality',
    'Lewd': 'charge-morality',
    'Obscene': 'charge-morality',
    'Pornography': 'charge-morality',

    # Other crimes
    'Viol': 'charge-other',
    'Arrest Case No Charge': 'charge-other',
    'Resisting': 'charge-other',
    'Disorderly': 'charge-other',
    'Obstruction': 'charge-other',
    'Escape': 'charge-other',
    'Contempt': 'charge-other',
    'Violation': 'charge-other',
    'Harassment': 'charge-other',
    'Misc': 'charge-other'
}

# Apply the categorization logic
for crime_substring, crime_category in crime_types.items():
    df_isolated[crime_category] = df_isolated['c_charge_desc'].str.contains(crime_substring, case=False, na=False)
    df_isolated['c_charge_desc_copy'] = df_isolated[df_isolated['c_charge_desc'].str.contains(crime_substring, case=False, na=False)]['c_charge_desc']
    df_isolated['c_charge_desc'] = df_isolated['c_charge_desc'].where(~df_isolated[crime_category], other=None)




# Display remaining uncategorized entries, if any
uncategorized = df_isolated[df_isolated['c_charge_desc'].isnull()]
print(uncategorized.shape)


df_categorized = df_isolated[df_isolated['c_charge_desc'].isnull() == False]

print(df_categorized.shape)

print(df_categorized)


(6900, 25)
(0, 25)
Empty DataFrame
Columns: [age, priors_count, juv_fel_count, juv_misd_count, juv_other_count, days_b_screening_arrest, is_recid, c_charge_desc, charge-theft, c_charge_desc_copy, charge-intoxication, charge-violent, charge-domestic, charge-fraud, charge-arson, charge-driving, charge-weapons, charge-morality, charge-others, charge-drugs, charge-financial, charge-property, charge-homicide, charge-traffic, charge-other]
Index: []
