In [3]:
# imports

import pandas as pd

In [5]:
# load data file located in compass-analysis folder

data_file_path = "../compas-analysis/compas-scores-two-years.csv"

df = pd.read_csv(data_file_path)

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Row and Column Count
print("Dataset Shape:", df.shape)

# 2. Column Names
print("Column Names:", df.columns.tolist())

# 3. Data Types
print("Data Types:\n", df.dtypes)

# 4. Basic Summary Statistics
print("Summary Statistics:\n", df.describe())

# 5. Missing Values
print("Missing Values:\n", df.isnull().sum())

# 6. Unique Values
print("Unique Values Per Column:\n", df.nunique())

# 7. Value Counts for Categorical Fields
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    print(f"Value Counts for {col}:\n", df[col].value_counts(), "\n")


Dataset Shape: (7214, 53)
Column Names: ['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number', 'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date', 'v_type_of_assessment', 'v_decile_score', 'v_score_text', 'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1', 'start', 'end', 'event', 'two_year_recid']
Data Types:
 id                           int64
name                        object
first                       object
last       

In [9]:
# 8. Class Distribution of the Label
print("Class Distribution (two_year_recid):\n", df['two_year_recid'].value_counts(normalize=True))

# 9. Correlation Matrix
correlation_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

# 10. Data Type Conversions (Example for Date Fields)
df['compas_screening_date'] = pd.to_datetime(df['compas_screening_date'], errors='coerce')

# 11. Duplicate Rows
duplicates = df[df.duplicated()]
print("Duplicate Rows:", duplicates.shape[0])

# 12. Cross-Tabulation for Race and Recidivism
race_recid_ct = pd.crosstab(df['race'], df['two_year_recid'], normalize='index')
print("Cross Tabulation (Race vs Recidivism):\n", race_recid_ct)

# 13. Outliers (Box Plot for priors_count)
plt.figure(figsize=(8, 6))
sns.boxplot(x=df['priors_count'])
plt.title("Boxplot of priors_count")
plt.show()

# 14. Date-Based Analysis
df['year'] = df['compas_screening_date'].dt.year
recidivism_by_year = df.groupby('year')['two_year_recid'].mean()
plt.figure(figsize=(8, 6))
recidivism_by_year.plot(kind='bar')
plt.title("Recidivism Rate by Year")
plt.ylabel("Recidivism Rate")
plt.xlabel("Year")
plt.show()


Class Distribution (two_year_recid):
 two_year_recid
0    0.549348
1    0.450652
Name: proportion, dtype: float64


ValueError: could not convert string to float: 'miguel hernandez'

In [None]:
# 15. Distribution Analysis for Numerical Features
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols].hist(figsize=(12, 10), bins=20)
plt.suptitle("Distributions of Numerical Features")
plt.show()

# 16. Feature Importance (Using Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Prepare dataset
X = df[['age', 'priors_count', 'juv_fel_count', 'juv_misd_count', 'c_charge_degree']]
X = pd.get_dummies(X, drop_first=True)  # Encode categorical variables
y = df['two_year_recid']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Feature Importance
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False).plot(kind='bar', figsize=(10, 6))
plt.title("Feature Importance")
plt.show()

# 17. Bias Analysis: Error Rates Across Groups
from sklearn.metrics import confusion_matrix

y_pred = rf_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Group-Specific Analysis
for group in df['race'].unique():
    group_data = df[df['race'] == group]
    X_group = pd.get_dummies(group_data[['age', 'priors_count', 'juv_fel_count', 'juv_misd_count', 'c_charge_degree']], drop_first=True)
    y_group = group_data['two_year_recid']
    y_pred_group = rf_model.predict(X_group)
    group_conf_matrix = confusion_matrix(y_group, y_pred_group)
    print(f"Confusion Matrix for {group}:\n", group_conf_matrix)

# 18. ROC Curves
from sklearn.metrics import roc_curve, auc

y_prob = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

# 19. Fairness Metrics
# Example: Calculate disparate impact ratio for race
positive_rate_by_race = df.groupby('race')['two_year_recid'].mean()
disparate_impact_ratio = positive_rate_by_race / positive_rate_by_race.max()
print("Disparate Impact Ratio:\n", disparate_impact_ratio)

# 20. Data Transformation for Modeling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Scaled Data:\n", pd.DataFrame(X_scaled, columns=X.columns).head())
