In [112]:
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns

In [113]:
RESULTS_FOLDER = "../results"
DATA_FOLDER = "../data"
TEMP_FOLDER = "../tmp"

## Load the training dataset

Load the train dataset in a pandas dataframe

In [114]:
df_train_path = os.path.join(DATA_FOLDER, 'train_dataset.csv')
df_train = pd.read_csv(df_train_path)

## Create additional features

We create the following additional features:

- `history_of_violence` - sum of all violence-related crimes in the past
- `socioeconomic_stability` - 1 / (1 + `priors_count`). If no priors count this will be equal to 1 (good stability), otherwise it will start getting smaller with each increase of priors


In [115]:
df_train["history_of_violence"] = (
    df_train["juv_fel_count"] +
    df_train["juv_misd_count"] +
    df_train["juv_other_count"] +
    df_train["priors_count"]
)

# Socioeconomic stability proxy
df_train["socioeconomic_stability"] = (1 / (1 + df_train["priors_count"])) 

## Prepare data for model training

- Select features to be used for training
    - `age`
    - `priors_count`
    - `history_of_violence`
    - `days_b_screening_arrest`
    - `socioeconomic_stability`
    - `c_charge_degree_F`
    - `c_charge_degree_M`
- Scale all features, mean 0 and std dev 1


- Select the label for training
    - `two_year_recid` * 10 to put the scale between 0 and 10



In [116]:

X_train = df_train[[ 
    "age", "priors_count", "history_of_violence", 
    "socioeconomic_stability", "c_charge_degree_F", "c_charge_degree_M"
]]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

y_train = df_train["two_year_recid"] * 10


## Train linear regression

- limit range of predicted values between 0 and 10

In [117]:

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_train_scaled)

# y_pred_normalized =  10 * (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min())
# y_pred_normalized = 10 / (1 + np.exp(-y_pred))
y_pred =  np.clip( np.ceil(y_pred), 0, 10)
# y_pred =  np.clip( np.floor(y_pred), 0, 10)


## Print the coefficients for linear regression

print intercept and all other coefficients

In [118]:

print("Intercept (w_0):", model.intercept_)

coefficients = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficient": model.coef_
})

print(coefficients)


Intercept (w_0): 4.498353838156299
                   Feature  Coefficient
0                      age    -1.099823
1             priors_count     0.210068
2      history_of_violence     0.803596
3  socioeconomic_stability    -0.752347
4        c_charge_degree_F     0.097469
5        c_charge_degree_M    -0.097469


In [119]:
def compare_with_two_year_recid(pred_type, y_pred):
    '''
    categorize the predicted scores into low, medium, and high risk groups
    '''
    
    def categorize_score(score):
        if score <= 4:
            return "Low"
        elif 5 <= score <= 7:
            return "Medium"
        else:
            return "High"

    # Add the predictions to the dataframe by mapping the categorize_score function to the predictions
    # prediction values will be low, medium, or high
    df_train[f"Predicted_{pred_type}_Risk_Group"] = pd.Categorical(
        pd.Series(y_pred).map(categorize_score),
        categories=["Low", "Medium", "High"],
        ordered=True
    )

    # groups based on predicted risk group and actual recidivism
    # size() returns the number of rows in each group
    # unstack() pivots the table so that the predicted risk group is the index and the two_year_recid is the column
    predicted_grouped = df_train.groupby(
        [f"Predicted_{pred_type}_Risk_Group", "two_year_recid"]).size().unstack(fill_value=0)

    # save to csv
    predicted_file_path = os.path.join(RESULTS_FOLDER, f"predicted_vs_recid_{pred_type}.csv")
    predicted_grouped.to_csv(predicted_file_path)

    # as above but add race to the grouping
    race_comparison = df_train.groupby(
        ["race", f"Predicted_{pred_type}_Risk_Group", "two_year_recid"]).size().unstack(fill_value=0)

    # save to csv
    race_comparison_file_path = os.path.join(RESULTS_FOLDER, f"predicted_risk_by_race_{pred_type}_summary.csv")
    race_comparison.to_csv(race_comparison_file_path)


In [120]:
compare_with_two_year_recid("LinearRegression", y_pred)

  predicted_grouped = df_train.groupby([f"Predicted_{pred_type}_Risk_Group", "two_year_recid"]).size().unstack(fill_value=0)
  race_comparison = df_train.groupby(["race", f"Predicted_{pred_type}_Risk_Group", "two_year_recid"]).size().unstack(fill_value=0)
