# Building regressions

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing # puts the features in standard format
from sklearn import pipeline # 


## Import dataframe

In [2]:
df = pd.read_csv('csv_files/final_with_logs.csv')

In [3]:
df.head()

Unnamed: 0,County,Obesity,Min Wage,Life expectancy 2010,Percent Smokers,Jan & July Public Assist. Participation,Jan & July Non-Public Assist. Participation,Jan & July TOTAL Assist. Participation,Unemployment_rate_2010,Alaska,...,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Min Wages Dummies,log_PA,log_NPA,log_TA
0,Autauga County,30.5,7.25,75.74,25.8,1706,12399,14105,8.9,0,...,0,0,0,0,0,0,0,7.441907,9.425371,9.554285
1,Baldwin County,26.6,7.25,77.8,23.0,3236,34173,37409,10.0,0,...,0,0,0,0,0,0,0,8.082093,10.439191,10.529667
2,Baldwin County,32.0,7.25,77.8,23.0,3236,34173,37409,10.0,0,...,0,0,0,0,0,0,0,8.082093,10.439191,10.529667
3,Barbour County,37.3,7.25,75.34,26.8,2137,9847,11984,12.3,0,...,0,0,0,0,0,0,0,7.667158,9.194922,9.391328
4,Barbour County,31.9,7.25,75.34,26.8,2137,9847,11984,12.3,0,...,0,0,0,1,0,0,0,7.667158,9.194922,9.391328


### Split Data into features and Target 

In [16]:
# Putting continuous variables together by grabbing log-ed values putting them on the first columns
cols = df.columns.tolist()
cols
first = cols[:-3] 
second = cols[-3:]
log_first = second + first
log_first
df = df[log_first]
df

Unnamed: 0,log_PA,log_NPA,log_TA,County,Obesity,Min Wage,Life expectancy 2010,Percent Smokers,Jan & July Public Assist. Participation,Jan & July Non-Public Assist. Participation,...,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Min Wages Dummies
0,7.441907,9.425371,9.554285,Autauga County,30.5,7.25,75.74,25.8,1706,12399,...,0,0,0,0,0,0,0,0,0,0
1,8.082093,10.439191,10.529667,Baldwin County,26.6,7.25,77.80,23.0,3236,34173,...,0,0,0,0,0,0,0,0,0,0
2,8.082093,10.439191,10.529667,Baldwin County,32.0,7.25,77.80,23.0,3236,34173,...,0,0,0,0,0,0,0,0,0,0
3,7.667158,9.194922,9.391328,Barbour County,37.3,7.25,75.34,26.8,2137,9847,...,0,0,0,0,0,0,0,0,0,0
4,7.667158,9.194922,9.391328,Barbour County,31.9,7.25,75.34,26.8,2137,9847,...,0,0,0,0,0,0,1,0,0,0
5,7.085901,8.772300,8.942199,Bibb County,34.3,7.25,74.13,30.1,1195,6453,...,0,0,0,0,0,0,0,0,0,0
6,7.085901,8.772300,8.942199,Bibb County,30.3,7.25,74.13,30.1,1195,6453,...,0,0,0,0,0,0,0,0,0,0
7,7.623642,9.576025,9.708749,Blount County,30.4,7.25,76.41,27.9,2046,14415,...,0,0,0,0,0,0,0,0,0,0
8,7.623642,9.576025,9.708749,Blount County,31.9,7.25,76.41,27.9,2046,14415,...,1,0,0,0,0,0,0,0,0,0
9,7.080868,8.398410,8.635687,Bullock County,42.1,7.25,73.61,28.0,1189,4440,...,0,0,0,0,0,0,0,0,0,0


In [26]:
target = df['Life expectancy 2010'] # making target = to life expectancy column

features = df.drop(columns=['Life expectancy 2010', 'County']) # taking target out of dataframe

#features = df_no_target.iloc[:] # setting features = to all columns left after taking target out

features.head()

Unnamed: 0,log_PA,log_NPA,log_TA,Obesity,Min Wage,Percent Smokers,Jan & July Public Assist. Participation,Jan & July Non-Public Assist. Participation,Jan & July TOTAL Assist. Participation,Unemployment_rate_2010,...,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Min Wages Dummies
0,7.441907,9.425371,9.554285,30.5,7.25,25.8,1706,12399,14105,8.9,...,0,0,0,0,0,0,0,0,0,0
1,8.082093,10.439191,10.529667,26.6,7.25,23.0,3236,34173,37409,10.0,...,0,0,0,0,0,0,0,0,0,0
2,8.082093,10.439191,10.529667,32.0,7.25,23.0,3236,34173,37409,10.0,...,0,0,0,0,0,0,0,0,0,0
3,7.667158,9.194922,9.391328,37.3,7.25,26.8,2137,9847,11984,12.3,...,0,0,0,0,0,0,0,0,0,0
4,7.667158,9.194922,9.391328,31.9,7.25,26.8,2137,9847,11984,12.3,...,0,0,0,0,0,0,1,0,0,0


In [27]:
# Assigning the non-dummy variables to a new dataframe to scale the variables correctly 
# THe dummy variables are either 0 or 1, which would affect our scale
features_no_dummy = features.iloc[:, 0:10]
features_no_dummy.head()

Unnamed: 0,log_PA,log_NPA,log_TA,Obesity,Min Wage,Percent Smokers,Jan & July Public Assist. Participation,Jan & July Non-Public Assist. Participation,Jan & July TOTAL Assist. Participation,Unemployment_rate_2010
0,7.441907,9.425371,9.554285,30.5,7.25,25.8,1706,12399,14105,8.9
1,8.082093,10.439191,10.529667,26.6,7.25,23.0,3236,34173,37409,10.0
2,8.082093,10.439191,10.529667,32.0,7.25,23.0,3236,34173,37409,10.0
3,7.667158,9.194922,9.391328,37.3,7.25,26.8,2137,9847,11984,12.3
4,7.667158,9.194922,9.391328,31.9,7.25,26.8,2137,9847,11984,12.3


In [28]:
# first step to split the data into test and train data
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=32,test_size=0.2)

In [29]:
# Makes the features standardized
#by removing the mean and scaling to unit variance
#instantiating a scaler
scaler = preprocessing.StandardScaler()

### Scale Train and Test Data

In [33]:
# Gets mean and std for later scaling
# passing in our training data set with out dummies
X_train_no_dummy = X_train.iloc[:, 0:10]
X_train_no_dummy.head()

scaler.fit(X_train_no_dummy) 

StandardScaler(copy=True, with_mean=True, with_std=True)

In [34]:
# Gets mean and std for later scaling for test data
# passing in our test data set with out dummies
X_test_no_dummy = X_test.iloc[:, 0:10]
X_test_no_dummy.head()

scaler.fit(X_test_no_dummy) 

StandardScaler(copy=True, with_mean=True, with_std=True)

In [52]:
# Actually creating a new dataframe only containing the scaled x training values (ie x training values w/o dummies)
X_train_scaled_no_dummy = pd.DataFrame(scaler.transform(X_train_no_dummy), columns=X_train_no_dummy.columns, index=X_train_no_dummy.index)

# Just getting the dummies from the x training data
X_train_only_dums = X_train.iloc[:,10:]

# Joining the non-dummy x training data w/the dummy x training data since we have now scaled everything that needs to be scaled in x training data
X_train_final = X_train_scaled_no_dummy.join(X_train_only_dums, how='outer').dropna()

# This is what it looks like
X_train_final.head()






Unnamed: 0,log_PA,log_NPA,log_TA,Obesity,Min Wage,Percent Smokers,Jan & July Public Assist. Participation,Jan & July Non-Public Assist. Participation,Jan & July TOTAL Assist. Participation,Unemployment_rate_2010,...,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Min Wages Dummies
493,-1.082522,-0.92123,-1.083551,0.249947,-0.235347,0.476491,-0.169367,-0.302808,-0.282195,-0.737519,...,0,0,0,0,0,0,0,0,0,0
1435,0.276322,0.1144,0.087552,0.047589,-0.235347,-0.012464,-0.127044,-0.196159,-0.188641,0.049203,...,0,0,0,0,0,0,0,0,0,0
1885,-0.277298,-0.486183,-0.56507,0.882314,-0.235347,1.50587,-0.155601,-0.275777,-0.257459,-0.154762,...,0,0,0,0,0,0,0,0,0,0
1855,-1.044594,0.23494,0.119833,1.944692,-0.235347,0.75957,-0.169051,-0.170127,-0.18343,0.049203,...,0,0,0,0,0,0,0,0,0,0
2202,0.051493,-0.311611,-0.344645,-2.254229,-0.235347,-1.093313,-0.141887,-0.259204,-0.240518,-1.262001,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Actually creating a new dataframe only containing the scaled x training values (ie x training values w/o dummies)
X_test_scaled_no_dummy = pd.DataFrame(scaler.transform(X_test_no_dummy), columns=X_test_no_dummy.columns, index=X_test_no_dummy.index)

# Just getting the dummies from the x training data
X_test_only_dums = X_test.iloc[:,10:]

# Joining the non-dummy x training data w/the dummy x training data since we have now scaled everything that needs to be scaled in x training data
X_test_final = X_test_scaled_no_dummy.join(X_test_only_dums, how='outer').dropna()

# This is what it looks like
X_test_final

Unnamed: 0,log_PA,log_NPA,log_TA,Obesity,Min Wage,Percent Smokers,Jan & July Public Assist. Participation,Jan & July Non-Public Assist. Participation,Jan & July TOTAL Assist. Participation,Unemployment_rate_2010,...,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Min Wages Dummies
1304,-0.133089,-0.118535,-0.193539,0.376420,-0.235347,0.244880,-0.150504,-0.235326,-0.225664,-0.912346,...,0,0,0,0,0,0,0,0,0,0
2841,-0.948672,0.400732,0.297340,-0.205358,-0.235347,-0.707295,-0.168158,-0.126107,-0.150397,-0.329589,...,0,1,0,0,0,0,0,0,0,0
1221,-0.177616,-0.152628,-0.232086,-0.559484,-0.235347,0.939711,-0.152212,-0.240037,-0.229743,2.147129,...,0,0,0,0,0,0,0,0,0,0
1840,-0.978044,-0.284755,-0.424986,0.806430,-0.235347,0.888242,-0.168447,-0.256265,-0.247277,-0.504416,...,0,0,0,0,0,0,0,0,0,0
1034,-0.446499,-1.376522,-1.376957,0.022294,-0.235347,0.862508,-0.160198,-0.317763,-0.290228,1.564372,...,0,0,0,0,0,0,0,0,0,0
660,-1.315262,-0.595936,-0.762348,0.882314,-0.235347,1.119853,-0.170917,-0.284260,-0.268925,-0.679243,...,0,1,0,0,0,0,0,0,0,0
98,0.140704,-0.346619,-0.355594,0.932904,-0.235347,0.630898,-0.136659,-0.262868,-0.241482,0.923339,...,0,0,0,0,0,0,0,0,0,0
2247,-0.357337,-0.424091,-0.517498,-0.230653,-0.235347,-0.733030,-0.157939,-0.270352,-0.254213,-1.903034,...,0,0,0,0,0,0,0,0,0,0
318,1.606049,1.452671,1.563233,1.160556,-0.235347,-0.424216,0.267703,0.606873,0.541405,0.049203,...,0,0,0,0,0,0,0,0,0,0
1524,0.091700,0.044819,-0.006612,-0.685957,-0.235347,0.836773,-0.139628,-0.209258,-0.202619,0.369720,...,0,0,0,0,0,0,0,0,0,0
