In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the three input tables, in order: post-2000 policies, pre-2001 policies,
# and the difference-in-differences results that are joined on policy_id below.
data_from_2001, data_up_to_2000, did_results = map(
    pd.read_csv,
    ['data_from_2001.csv', 'data_up_to_2000.csv', 'did_results.csv'],
)

In [3]:
# Discard rows that have no policy_id in either policy table.
data_from_2001_cleaned, data_up_to_2000_cleaned = (
    frame.dropna(subset=['policy_id'])
    for frame in (data_from_2001, data_up_to_2000)
)

In [4]:
# Inner join: keep only the pre-2001 policies that have a DiD result row.
data_up_to_2000_merged = data_up_to_2000_cleaned.merge(
    did_results,
    how='inner',
    on='policy_id',
)

In [5]:
# Columns retained for training: identifiers, the three policy-direction flags
# and the two DiD targets. 'full_policy' is left out of the selection.
columns_to_keep = [
    'policy_id', 'location', 'year',
    'expand', 'restrict', 'neutral',
    'DiD_rate_for_women', 'DiD_num_providers',
]
data_up_to_2000_merged = data_up_to_2000_merged[columns_to_keep]

In [6]:
# Columns retained for the post-2000 policies: identifiers and the three
# policy-direction flags. 'full_policy' is left out of the selection.
columns_to_keep = ['policy_id', 'location', 'year',
                   'expand', 'restrict', 'neutral']
# .copy() makes this an independent frame. Later cells add prediction columns to
# it, and assigning into a bare column slice raises pandas'
# SettingWithCopyWarning (pandas < 3.0).
data_from_2001_cleaned = data_from_2001_cleaned[columns_to_keep].copy()

## Data Preprocessing

In [119]:
# Tukey-fence outlier filter on the abortion-rate DiD estimate: keep rows whose
# value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
rate_did = data_up_to_2000_merged['DiD_rate_for_women']

Q1, Q3 = rate_did.quantile(0.25), rate_did.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

data_up_to_2000_filtered = data_up_to_2000_merged[
    rate_did.between(lower_bound, upper_bound)
]

In [120]:
# Second outlier pass: drop rows whose provider-count DiD estimate falls outside
# the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The fences are computed on the
# unfiltered merged frame, so re-running this cell gives the same result.
Q1 = data_up_to_2000_merged['DiD_num_providers'].quantile(0.25)
Q3 = data_up_to_2000_merged['DiD_num_providers'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter the frame produced by the DiD_rate_for_women pass, not
# data_up_to_2000_merged. Starting again from the merged frame would overwrite
# data_up_to_2000_filtered and silently discard that first filter.
data_up_to_2000_filtered = data_up_to_2000_filtered[
    data_up_to_2000_filtered['DiD_num_providers'].between(lower_bound, upper_bound)
]

In [121]:
# Display the filtered frame to inspect which rows survived the outlier filter.
data_up_to_2000_filtered

Unnamed: 0,policy_id,location,year,expand,restrict,neutral,DiD_rate_for_women,DiD_num_providers
0,2.0,Federal,1973,1.0,0.0,0.0,25.089655,657.372414
1,4.0,Federal,1976,1.0,0.0,1.0,6.680769,183.561538
2,14.0,Federal,1990,0.0,0.0,1.0,-2.598039,397.716667
3,16.0,Federal,1994,0.0,0.0,0.0,-3.852381,270.808333
4,17.0,Federal,1994,0.0,1.0,1.0,-3.852381,270.808333
6,3.0,Kentucky,1974,0.0,0.0,1.0,10.1625,8.0
7,7.0,Kentucky,1980,0.0,0.0,1.0,-1.414286,-1.142857
8,9.0,Kentucky,1983,0.0,0.0,0.0,-1.65,-1.333333
9,12.0,Kentucky,1986,0.0,0.0,0.0,-1.98,-1.6
10,19.0,Kentucky,1998,0.0,0.0,0.0,-3.683333,-3.333333


## Training

### Abortion Rates

In [7]:
# Features are the three policy-direction flags; the target is the DiD estimate
# of the abortion rate for women.
# NOTE(review): this trains on data_up_to_2000_merged, not on the outlier-filtered
# data_up_to_2000_filtered built in the preprocessing cells. Confirm that is intended.
feature_columns = ['expand', 'restrict', 'neutral']
X = data_up_to_2000_merged[feature_columns]
y = data_up_to_2000_merged['DiD_rate_for_women']

#### Linear Regression

In [8]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility), then
# fit an ordinary least-squares baseline on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

In [9]:
# Hold-out metrics for the linear baseline. Predictions are computed once and
# shared by every metric.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

MAE: 15.305302377123628
MSE: 360.75072950384833
RMSE: 18.99343911733334
R-squared: -3.0235184295422384


#### Ridge Model

In [10]:
from sklearn.linear_model import Ridge

# L2-regularised linear model (alpha=1.0) fitted on the same training split.
# fit() returns the estimator itself, so it can be bound directly.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_model

In [11]:
# Hold-out metrics for the Ridge model. Predictions are computed once and shared
# by every metric.
y_pred = ridge_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

MAE: 13.23560385308572
MSE: 252.2619248798522
RMSE: 15.882755582072406
R-squared: -1.813523080665198


In [12]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full X / y. scikit-learn reports MSE negated
# (higher is better), so the absolute value recovers the MSE of each fold.
scores = cross_val_score(
    ridge_model, X, y,
    scoring='neg_mean_squared_error',
    cv=5,
)
avg_cv_mse = np.abs(scores).mean()
print(f"Average cross-validated MSE: {avg_cv_mse}")


Average cross-validated MSE: 239.33416325000866


#### Random Forest Model

In [13]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree random forest with a fixed seed, fitted on the same training split.
# fit() returns the estimator itself, so it can be bound directly.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_model

In [14]:
# Hold-out metrics for the random forest. Predictions are computed once and
# shared by every metric.
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

MAE: 14.848873652182121
MSE: 329.6668950558051
RMSE: 18.156731397908743
R-squared: -2.6768347764431875


#### Final Model
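Choosing the Ridge model: it has the lowest test MAE (13.24) and RMSE (15.88) of the three candidates. Note that every model's test R-squared is negative, i.e. none of them beats predicting the mean on the held-out split.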

In [15]:
# Feature matrix for the post-2000 policies: the same three flags used in training.
feature_columns = ['expand', 'restrict', 'neutral']
X_new = data_from_2001_cleaned[feature_columns]

In [16]:
# Score the post-2000 policies with the Ridge model fitted on the abortion-rate
# target (the model currently bound to ridge_model).
predictions = ridge_model.predict(X_new)

# Store the predictions as a new column next to the policy rows (in-place
# mutation of data_from_2001_cleaned).
data_from_2001_cleaned['predicted_DiD_rate_for_women'] = predictions

In [17]:
# Display the post-2000 policies together with the predicted column.
data_from_2001_cleaned

Unnamed: 0,policy_id,location,year,expand,restrict,neutral,predicted_DiD_rate_for_women
0,21.0,Federal,2001,0.0,0.0,1.0,3.226814
1,22.0,Federal,2003,0.0,1.0,0.0,-4.908232
2,23.0,Federal,2006,0.0,0.0,1.0,3.226814
3,24.0,Federal,2007,0.0,0.0,0.0,-1.860251
4,25.0,Ohio,2013,0.0,0.0,0.0,-1.860251
5,26.0,New York,2016,0.0,0.0,0.0,-1.860251
6,27.0,Kentucky,2016,0.0,1.0,1.0,0.178833
7,28.0,New York,2019,0.0,1.0,1.0,0.178833
8,29.0,Kentucky,2019,0.0,1.0,0.0,-4.908232
9,30.0,Kentucky,2019,0.0,1.0,0.0,-4.908232


### Number of Providers

In [18]:
# Same three policy-direction features; the target is now the DiD estimate of
# the number of providers. X and y are rebound, replacing the abortion-rate pair.
# NOTE(review): this trains on data_up_to_2000_merged, not on the outlier-filtered
# data_up_to_2000_filtered built in the preprocessing cells. Confirm that is intended.
feature_columns = ['expand', 'restrict', 'neutral']
X = data_up_to_2000_merged[feature_columns]
y = data_up_to_2000_merged['DiD_num_providers']

#### Linear Regression

In [19]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility), then
# fit an ordinary least-squares baseline for the provider-count target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

In [20]:
# Hold-out metrics for the linear baseline on the provider-count target.
# Predictions are computed once and shared by every metric.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

MAE: 260.18538617493243
MSE: 149215.97424226618
RMSE: 386.2848356359154
R-squared: -1.5719957908427973


#### Ridge Model

In [21]:
from sklearn.linear_model import Ridge

# L2-regularised linear model (alpha=1.0) for the provider-count target.
# This rebinds ridge_model, replacing the abortion-rate model fitted earlier.
# Re-running the earlier prediction cell after this one would therefore use
# this provider model.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_model

In [22]:
# Hold-out metrics for the Ridge model on the provider-count target.
# Predictions are computed once and shared by every metric.
y_pred = ridge_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

MAE: 234.66776785893708
MSE: 100338.95568637768
RMSE: 316.7632486359137
R-squared: -0.7295157103214849


In [23]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full X / y. scikit-learn reports MSE negated
# (higher is better), so the absolute value recovers the MSE of each fold.
scores = cross_val_score(
    ridge_model, X, y,
    scoring='neg_mean_squared_error',
    cv=5,
)
avg_cv_mse = np.abs(scores).mean()
print(f"Average cross-validated MSE: {avg_cv_mse}")

Average cross-validated MSE: 110570.738548061


#### Random Forest Model

In [24]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree random forest with a fixed seed for the provider-count target.
# fit() returns the estimator itself, so it can be bound directly.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_model

In [25]:
# Hold-out metrics for the random forest on the provider-count target.
# Predictions are computed once and shared by every metric.
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

MAE: 262.9942006258196
MSE: 144367.7319461059
RMSE: 379.95753966213897
R-squared: -1.488427936650024


#### Final Model

In [26]:
# Feature matrix for the post-2000 policies: the same three flags used in training.
feature_columns = ['expand', 'restrict', 'neutral']
X_new = data_from_2001_cleaned[feature_columns]

In [27]:
# Score the post-2000 policies with the Ridge model fitted on the provider-count
# target (ridge_model was rebound to that model in the Ridge cell of this section).
predictions = ridge_model.predict(X_new)

# Store the predictions as a new column (in-place mutation of data_from_2001_cleaned).
# NOTE(review): the column name is spelled 'provideres'; it is left unchanged here
# because it ends up in the exported CSV. Confirm before renaming.
data_from_2001_cleaned['predicted_DiD_num_provideres'] = predictions

In [30]:
# Write the post-2000 policies, including the prediction columns added above, to disk.
# NOTE(review): to_csv writes the row index by default, so re-reading this file
# yields an extra "Unnamed: 0" column unless index_col=0 is passed. Confirm
# whether index=False is wanted.
data_from_2001_cleaned.to_csv('predictions_output.csv')

In [144]:
# Load the combined policy / outcome table and display it.
# The path is relative to the working directory, like the other CSV inputs in this
# notebook. The previous absolute '/content/...' path only resolves inside Google
# Colab, where the default working directory is /content, so the relative path
# finds the same file there.
combined_data = pd.read_csv('combined_data.csv')
combined_data

Unnamed: 0.1,Unnamed: 0,location,year,full_policy,expand,restrict,neutral,policy_id,rate_for_women,ratio_for_women,num_providers
0,0,Federal,1973,Roe v. Wade is a landmark decision by the Supr...,1.0,0.0,0.0,744600,16.3,19.3,492.8
1,1,Federal,1974,,,,,898600,19.3,22.0,492.8
2,2,Federal,1975,,,,,1034200,21.7,24.9,492.8
3,3,Federal,1976,Planned Parenthood v. Danforth is a Supreme Co...,1.0,0.0,1.0,1179300,24.2,26.5,492.8
4,4,Federal,1977,,,,,1316700,26.4,28.6,492.8
5,5,Federal,1978,,,,,1409600,27.7,29.2,492.8
6,6,Federal,1979,,,,,1497700,28.8,29.6,492.8
7,7,Federal,1980,,,,,1553900,29.3,30.0,492.8
8,8,Federal,1981,,,,,1577300,29.3,30.1,492.8
9,9,Federal,1982,,,,,1573900,28.8,30.0,492.8
