# Predicting Change in Heart-Related Deaths

In [80]:
# importing necessary packages
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn import linear_model
import matplotlib.pyplot as plt

In [81]:
def coef_graph():
    # plotting columns and their coef impact on predictions
    coefs = pd.DataFrame(
    regr.coef_,
    #  columns=['Coefficients'], index=X_train.columns
    columns=['Coefficients'], index=X.columns
    )

    # sort coefficients highest to lowest
    coefs.sort_values(by=['Coefficients'], inplace=True)

    # function to add value labels
    def addlabels(x,y):
        for i in range(len(x)):
            plt.text(i,y[i],y[i])

    # creating the graph
    coefs.plot(kind='barh', figsize=(9, 7))
    plt.title('Linear Regression Model')
    plt.axvline(x=0, color='.5')
    plt.subplots_adjust(left=.3)

In [67]:
# loading in data cleaned in data-preparation.ipynb
svi_mortality_2014 = pd.read_csv('./data/svi-mortality-county/2014-svi-mortality.csv', index_col=0)
svi_mortality_2020 = pd.read_csv('./data/svi-mortality-county/2020-svi-mortality.csv', index_col=0)

# rename columns to match
svi_mortality_2020.rename(columns={'E_POV150': 'E_POV'}, inplace=True)

# drop columns with no equivalent
svi_mortality_2014.drop('E_PCI', axis=1, inplace=True)
svi_mortality_2020.drop(['E_HBURD', 'E_NOINT', 'E_AFAM', 'E_HISP', 'E_ASIAN', 'E_AIAN', 'E_NHPI', 'E_TWOMORE', 'E_OTHERRACE'], axis=1, inplace=True)

# create new df with calculated change in estimated values
diff_2020_2014 = svi_mortality_2020 - svi_mortality_2014

# drop all na rows
diff_2020_2014.dropna(inplace=True)

In [69]:
# Splitting dataset for predictions
X = diff_2020_2014.drop('Deaths', axis=1)
y = diff_2020_2014.Deaths

In [70]:
# Scaling X learning data
scaler = preprocessing.MinMaxScaler()
names = X.columns
d = scaler.fit_transform(X)
X = pd.DataFrame(d, columns=names)

In [82]:
# Splitting X and y into training datasets
kf = KFold(n_splits=10, shuffle=True)
# kf = KFold(n_splits=5,random_state=42,shuffle=True)
for train_index,val_index in kf.split(X):
    X_train,X_test = X.iloc[train_index],X.iloc[val_index],
    y_train,y_test = y.iloc[train_index],y.iloc[val_index]

    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    # testing model for general accuracy
    print(f'\nModel score on testing data: {regr.score(X_train, y_train)}')
    print(f'Model score on training data: {regr.score(X_test, y_test)}')

    # coef_graph()


Model score on testing data: 0.8114482914254135
Model score on training data: 0.4091949766890898

Model score on testing data: 0.8236553763187812
Model score on training data: 0.25628156940812485

Model score on testing data: 0.8049021209242971
Model score on training data: 0.6454692372757816

Model score on testing data: 0.8020235181614166
Model score on training data: 0.6857455211434513

Model score on testing data: 0.8092229512316195
Model score on training data: 0.31051708042962944

Model score on testing data: 0.8147876636448248
Model score on training data: 0.5415985944699424

Model score on testing data: 0.8072129589890793
Model score on training data: 0.6093217622006994

Model score on testing data: 0.7411114162668266
Model score on training data: 0.8787920363800901

Model score on testing data: 0.8031438463249735
Model score on training data: 0.6182496390285059

Model score on testing data: 0.8041412244594606
Model score on training data: 0.6018077763335488


In [None]:
# https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/