In [2]:
from interpret.glassbox import ExplainableBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

from interpret import show

# --- Load the merged dataset -------------------------------------------------
file_path = 'Aggregated_Cleaned_Merged_Dataset_No_FIPS.csv'
data = pd.read_csv(file_path)

# --- Separate the regression target from the predictors ----------------------
target_column = "Percent (%) of Labor Force Unemployed in State/Area"
y = data[target_column]
# "State" is removed together with the target, so it is not used as a feature.
X = data.drop(columns=[target_column, "State"])

# --- 80/20 train/test split with a fixed seed --------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Fit the Explainable Boosting Machine and score it on the held-out rows --
ebm = ExplainableBoostingRegressor(random_state=42)
ebm.fit(X_train, y_train)

y_pred = ebm.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

# --- Render the model's global explanation -----------------------------------
ebm_global = ebm.explain_global()
show(ebm_global)


Mean Squared Error: 0.14981422631464508


In [3]:

from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting baseline. The split uses the same arguments and seed as the
# EBM cell, so both models are evaluated on the same partition of X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

# Held-out mean squared error; left as the last expression so the cell shows it.
y_pred_gbr = gbr.predict(X_test)
mse_gbr = mean_squared_error(y_true=y_test, y_pred=y_pred_gbr)

mse_gbr


0.41443742559472113

In [4]:
from scipy.stats import shapiro

unemployment_col = "Percent (%) of Labor Force Unemployed in State/Area"

# Shapiro-Wilk normality test on each state's unemployment values.
# shapiro() returns (statistic, p-value); only the p-value is kept, keyed by state.
normality_results = {
    state: shapiro(data.loc[data['State'] == state, unemployment_col])[1]
    for state in data['State'].unique()
}

normality_results


{'Alabama': 0.0059900789915628,
 'Alaska': 0.008536102304853502,
 'Arizona': 0.002711920837731934,
 'Arkansas': 0.42062828292048293,
 'California': 0.03716397485907289,
 'Colorado': 0.39119440013169077,
 'Connecticut': 0.2716900218461192,
 'Delaware': 0.0010594664753839139,
 'District of Columbia': 0.6357236540639195,
 'Florida': 0.4805907300270907,
 'Georgia': 0.0017429975127090552,
 'Hawaii': 0.0018994120861776747,
 'Idaho': 0.23604807559105018,
 'Illinois': 0.02740223176316245,
 'Indiana': 0.0039036144380776303,
 'Iowa': 0.0035946283903068588,
 'Kansas': 0.16518114366816528,
 'Kentucky': 0.0014181639426711604,
 'Louisiana': 0.0009345149410750983,
 'Maine': 0.051375839401800134,
 'Maryland': 0.03157617771489984,
 'Massachusetts': 0.06929794940270297,
 'Michigan': 0.04837062309727997,
 'Minnesota': 0.050498560447050804,
 'Mississippi': 0.0060487865542154405,
 'Missouri': 0.06257110010285552,
 'Montana': 0.6397638201813071,
 'Nebraska': 0.0026345554630792427,
 'Nevada': 1.8148680620639

In [5]:
from scipy.stats import kruskal

unemployment_col = "Percent (%) of Labor Force Unemployed in State/Area"
state_labels = data['State']

# One unemployment sample per distinct state, selected with a single .loc lookup.
grouped_data = [
    data.loc[state_labels == state, unemployment_col]
    for state in state_labels.unique()
]

# Kruskal-Wallis H-test across all per-state samples; only the p-value is shown.
stat, p_value_kruskal = kruskal(*grouped_data)

p_value_kruskal


1.0426483240316596e-114

In [3]:
import statsmodels.api as sm

# Interaction term: product of the state minimum wage and the average CPI.
# It is stored as a new column on `data` itself.
data['interaction'] = data['State.Minimum.Wage'].mul(data['CPI.Average'])

# Design matrix: the two main effects, their interaction, and an intercept column.
predictor_columns = ['State.Minimum.Wage', 'CPI.Average', 'interaction']
X = sm.add_constant(data[predictor_columns])

# Response: the unemployment percentage.
y = data['Percent (%) of Labor Force Unemployed in State/Area']

# Ordinary least squares fit; print the full regression summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())


                                             OLS Regression Results                                            
Dep. Variable:     Percent (%) of Labor Force Unemployed in State/Area   R-squared:                       0.056
Model:                                                             OLS   Adj. R-squared:                  0.054
Method:                                                  Least Squares   F-statistic:                     45.04
Date:                                                 Thu, 24 Oct 2024   Prob (F-statistic):           2.83e-28
Time:                                                         12:39:05   Log-Likelihood:                -4880.9
No. Observations:                                                 2295   AIC:                             9770.
Df Residuals:                                                     2291   BIC:                             9793.
Df Model:                                                            3                                  

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factors for the two predictors and their interaction.
#
# Take an explicit copy of the two columns. Without .copy(), adding the
# 'interaction' column writes to a slice of `data` and pandas emits a
# SettingWithCopyWarning.
X_vif = data[['State.Minimum.Wage', 'CPI.Average']].copy()
X_vif['interaction'] = X_vif['State.Minimum.Wage'] * X_vif['CPI.Average']

# NOTE(review): this design matrix has no constant column, whereas the OLS fit
# used sm.add_constant. Confirm whether the VIFs should be computed with an
# intercept included.
vif_design = X_vif.values  # convert once instead of once per feature

vif_data = pd.DataFrame({
    'Feature': X_vif.columns,
    'VIF': [
        variance_inflation_factor(vif_design, i)
        for i in range(vif_design.shape[1])
    ],
})

vif_data




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Feature,VIF
0,State.Minimum.Wage,32.392522
1,CPI.Average,6.47511
2,interaction,22.994033
