In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline

In [7]:
data = pd.read_csv('forecast_HIV_infections/data/x_train.csv', index_col=0)
data

Unnamed: 0,county_code,COUNTY,STATEABBREVIATION,YEAR,AMAT_fac,HIVdiagnoses,HIVprevalence,MH_fac,Med_AMAT_fac,Med_MH_fac,...,nonmedpain,ADULTMEN,MSM12MTH,MSM5YEAR,%msm12month,%msm5yr,unemployment_rate,poverty_rate,household_income,percent_uninsured
0,13049,Charlton County,GA,2015,0.0,0.0,213.3,1.0,0.0,1.0,...,4.77,5979,95,147,1.588894,2.458605,16.8,18.0,3552,23.0
1,42065,Jefferson County,PA,2015,0.0,0.0,31.8,1.0,0.0,1.0,...,3.47,17297,127,196,0.734231,1.133144,7.6,14.9,18583,11.1
2,18117,Orange County,IN,2015,0.0,0.0,104.0,1.0,0.0,1.0,...,4.91,7311,40,62,0.547121,0.848037,8.9,18.4,7618,14.5
3,21203,Rockcastle County,KY,2015,0.0,0.0,41.8,1.0,0.0,1.0,...,4.11,6419,68,106,1.059355,1.651348,10.5,25.0,6634,13.0
4,48373,Polk County,TX,2015,0.0,8.0,421.6,1.0,0.0,1.0,...,4.17,19485,319,494,1.637157,2.535284,11.7,18.0,17195,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2350,31003,Antelope County,NE,2015,0.0,0.0,0.0,0.0,0.0,0.0,...,3.47,2509,14,21,0.557991,0.836987,1.9,12.6,2824,10.0
2351,26043,Dickinson County,MI,2015,0.0,0.0,0.0,3.0,0.0,1.0,...,3.96,10172,95,147,0.933936,1.445144,8.5,13.4,11263,9.4
2352,8021,Conejos County,CO,2015,0.0,0.0,0.0,2.0,0.0,2.0,...,4.58,2984,16,25,0.536193,0.837802,9.2,18.6,3023,17.6
2353,1011,Bullock County,AL,2015,0.0,8.0,691.4,0.0,0.0,0.0,...,5.12,4597,73,113,1.587992,2.458125,17.4,20.5,3746,17.6


In [22]:
X = data.drop(columns=['COUNTY','STATEABBREVIATION']).values
y = pd.read_csv('forecast_HIV_infections/data/y_train.csv', index_col=0).values
X

array([[1.3049e+04, 2.0150e+03, 0.0000e+00, ..., 1.8000e+01, 3.5520e+03,
        2.3000e+01],
       [4.2065e+04, 2.0150e+03, 0.0000e+00, ..., 1.4900e+01, 1.8583e+04,
        1.1100e+01],
       [1.8117e+04, 2.0150e+03, 0.0000e+00, ..., 1.8400e+01, 7.6180e+03,
        1.4500e+01],
       ...,
       [8.0210e+03, 2.0150e+03, 0.0000e+00, ..., 1.8600e+01, 3.0230e+03,
        1.7600e+01],
       [1.0110e+03, 2.0150e+03, 0.0000e+00, ..., 2.0500e+01, 3.7460e+03,
        1.7600e+01],
       [2.0201e+04, 2.0150e+03, 0.0000e+00, ..., 1.1100e+01, 2.4580e+03,
        1.0200e+01]])

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler


class XyScaler(BaseEstimator, TransformerMixin):
    """Standardize a training set of data along with a vector of targets."""

    def __init__(self):
        self.X_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        
    def fit(self, X, y, *args, **kwargs):
        """Fit the scaler to data and a target vector."""
        self.X_scaler.fit(X)
        self.y_scaler.fit(y.reshape(-1, 1))
        return self
    
    def transform(self, X, y, *args, **kwargs):
        """Transform a new set of data and target vector."""
        return (self.X_scaler.transform(X),
                self.y_scaler.transform(y.reshape(-1, 1)).flatten())

    def inverse_transform(self, X, y, *args, **kwargs):
        """Tranform from a scaled representation back to the original scale."""
        return (self.X_scaler.inverse_transform(X),
                self.y_scaler.inverse_transform(y.reshape(-1, 1)).flatten())

In [24]:
def cross_val(X, y, base_estimator, n_folds=10, random_seed=154, scale=True):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    test_cv_errors, train_cv_errors = np.empty(n_folds), np.empty(n_folds)
    scaler = XyScaler()

    for idx, (train_idx, valid_idx) in enumerate (kf.split(X)):
        
        if scale:
            scaler.fit(X[train_idx], y[train_idx])
            std_X_train, std_y_train = scaler.transform(X[train_idx], y[train_idx])
            std_X_valid, std_y_valid = scaler.transform(X[valid_idx], y[valid_idx])
        else:
            std_X_train, std_y_train = X[train_idx], y[train_idx]
            std_X_valid, std_y_valid = X[valid_idx], y[valid_idx]
            
        base_estimator.fit(std_X_train,std_y_train)
        y_pred = base_estimator.predict(std_X_valid)
        train_cv_errors[idx] = mean_squared_error(std_y_train, base_estimator.predict(std_X_train))
        test_cv_errors[idx] = mean_squared_error(std_y_valid, y_pred)

    return train_cv_errors, test_cv_errors


In [25]:
def train_at_various_alphas(X, y, model, alphas, n_folds=10, **kwargs):
    
    cv_errors_train = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),columns=alphas)
    cv_errors_test = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),columns=alphas)
    
    for alpha in alphas:
        train_cv_errors, test_cv_errors = cv(X, y, model(alpha=alpha), n_folds=n_folds)
        cv_errors_train[alpha] = train_cv_errors
        cv_errors_test[alpha] = test_cv_errors
        
    return cv_errors_train, cv_errors_test



In [26]:
model = LinearRegression()
ms1_train, mse_test = cross_val(X, y, model, scale=False)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').