# In [None]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score


# This class groups the data-preparation helpers: it takes in the relevant
# data and returns it in the formats and transformations needed for analysis.
class fix_data:

    # Load and explore the Case-Shiller HPI data
    def cleanup_data(CS, PC, MG, verb=False, logz=False):
        """Load the raw CSV files and derive the series used for analysis.

        Parameters
        ----------
        CS : path or buffer
            Case-Shiller HPI CSV. Must contain a 'DATE' column; the first
            remaining column is the one that gets adjusted.
        PC : path or buffer
            Personal consumption expenditures price-index CSV. The code
            expects the title column and 'Unnamed: 1' to be present, and
            reads rows 3, 4 and 5 as year, quarter label ("Q1".."Q4") and
            index value respectively.
        MG : path or buffer
            Mortgage CSV; its first 157 rows (labels 0..156) are discarded.
        verb : bool
            When true, plot the three Case-Shiller frames and save the
            figures under the hard-coded desktop path.
        logz : bool
            When true, store log(1 + percentage change) instead of the plain
            percentage change.

        Returns
        -------
        tuple
            (CS_dat, adj_CS_dat, perc_adj_CS_dat, perc_adj_mdo_dat, new_PCE)

        Raises
        ------
        IndexError
            If the monthly PCE series has more rows than the Case-Shiller data.
        KeyError
            If an expected column/row label (including the '2018-02-01' row
            of the Case-Shiller data) is missing.
        """
        CS_dat = pd.read_csv(CS)
        CS_dat['DATE'] = pd.to_datetime(CS_dat['DATE'])
        CS_dat = CS_dat.set_index('DATE')

        #import personal expenditures to adjust for inflation over time
        #https://www.bea.gov/iTable/iTable.cfm?reqid=19&step=2#reqid=19&step=3&isuri=1&1910=x&0=-
        #99&1921=survey&1903=64&1904=1987&1905=2018&1906=q&1911=0
        p_exp = pd.read_csv(PC)
        p_exp = p_exp.drop(['Table 2.3.4. Price Indexes for Personal Consumption Expenditures by Major Type of Product'], axis=1)
        PCE = p_exp.iloc[[3, 4, 5]]
        PCE = PCE.drop(['Unnamed: 1'], axis=1)

        # Build one timestamp per quarter (first day of the quarter). The
        # dates are assembled in a list instead of being written back into
        # PCE cell by cell: chained assignment (PCE.iloc[0][j] = ...) is not
        # guaranteed to write through to the frame.
        quarter_start = {'Q1': '-01-01', 'Q2': '-04-01', 'Q3': '-07-01'}
        dates = [
            str(year) + quarter_start.get(quarter, '-10-01')
            for year, quarter in zip(PCE.iloc[0].values, PCE.iloc[1].values)
        ]
        new_PCE = pd.DataFrame(
            {'pce': pd.to_numeric(PCE.iloc[2].values)},
            index=pd.DatetimeIndex(pd.to_datetime(dates), name='Date'),
        )

        #interpolate the PCE data to take it from quarterly to monthly using
        #the pandas interpolation
        new_PCE = new_PCE.resample('MS').asfreq()
        new_PCE = new_PCE.interpolate()

        #adjust the CS raw values using the personal consumption expenditures
        #(rows are matched by position, not by date)
        n_adj = len(new_PCE)
        if n_adj > len(CS_dat):
            raise IndexError(
                'PCE series has %d monthly rows but the Case-Shiller data '
                'only has %d' % (n_adj, len(CS_dat)))
        adj_CS_dat = CS_dat.copy(deep=True)
        adj_CS_dat.iloc[:n_adj, 0] = (
            CS_dat.iloc[:n_adj, 0].values / new_PCE['pce'].values
        )
        adj_CS_dat = adj_CS_dat.drop(pd.to_datetime('2018-02-01'))

        #update so that the variable is the (log of the) percentage change in
        #HPI; the first row has no predecessor and is set to 0
        perc_adj_CS_dat = adj_CS_dat.copy(deep=True)
        perc_adj_CS_dat.iloc[[0]] = 0
        level = adj_CS_dat.iloc[:, 0].values
        change = (level[1:] - level[:-1]) / level[:-1]
        if logz:
            change = np.log(change + 1.)
        perc_adj_CS_dat.iloc[1:, 0] = change

        #import the mortgage data and get rid of data before 1987 since that
        #is when CS data starts
        mdo_dat = pd.read_csv(MG)
        mdo_dat = mdo_dat.drop(list(range(157)))

        perc_adj_mdo_dat = mdo_dat

        if verb:
            filepath = '/Users/abbysuckow/Desktop/'
            # (frame to plot, y-axis label, output file name)
            figures = [
                (CS_dat, 'CS HPI ', 'CSHPI'),
                (adj_CS_dat, 'Adjusted CS HPI ', 'adjCSHPI'),
                (perc_adj_CS_dat, 'Percentage change CS HPI ', 'percchangeHPI'),
            ]
            for frame, ylabel, filename in figures:
                frame.plot(legend=None)
                plt.xlabel('Date', fontsize=10)
                plt.ylabel(ylabel, fontsize=10)
                plt.savefig(filepath + filename)
                plt.show()

        return CS_dat, adj_CS_dat, perc_adj_CS_dat, perc_adj_mdo_dat, new_PCE


    # Function to extract the one- to four-family series from the mortgage
    # data; also interpolates the data.
    def get_residential_data(mdo_dat, new_PCE):
        """Extract the one- to four-family mortgage series as monthly, PCE-adjusted data.

        Parameters
        ----------
        mdo_dat : DataFrame
            Mortgage data with a 'Series Description' column holding the
            dates. Only columns whose label contains 'One- to four- family'
            are kept.
        new_PCE : DataFrame
            Monthly price index; its first column is used as the divisor and
            is matched to the mortgage rows by position (not by date).

        Returns
        -------
        DataFrame
            Month-start-indexed frame of the kept columns, linearly
            interpolated. Only the FIRST kept column is divided by the price
            index; any further columns are returned unadjusted.

        Raises
        ------
        IndexError
            If new_PCE has more rows than the monthly mortgage data, or if no
            column matches the residential label.
        """
        #only want 1-4 fam residence values
        mdo_dat = mdo_dat.set_index(mdo_dat['Series Description'])
        resid = 'One- to four- family'
        # str() keeps the filter from failing on non-string column labels
        keep = [resid in str(label) for label in mdo_dat.columns]
        mdo_dat = mdo_dat.loc[:, keep]

        #interpolate mortgage data to make it monthly from quarterly
        #first need to make the index a datetime
        mdo_dat.index = pd.to_datetime(mdo_dat.index)

        #now interpolate
        mdo_dat = mdo_dat.resample('MS').asfreq()
        mdo_dat = mdo_dat.apply(pd.to_numeric)
        mdo_dat = mdo_dat.interpolate()

        #adjust for inflation
        n_adj = len(new_PCE)
        if n_adj > len(mdo_dat):
            raise IndexError(
                'PCE series has %d rows but the monthly mortgage data only '
                'has %d' % (n_adj, len(mdo_dat)))
        adj_mdo_dat = mdo_dat.copy(deep=True)
        adj_mdo_dat.iloc[:n_adj, 0] = (
            mdo_dat.iloc[:n_adj, 0].values / new_PCE.iloc[:, 0].values
        )

        return adj_mdo_dat


    #function to split data into train, validation and test
    #split = the number of data points to include in training
    #val = the number of data points to include in the validation set
    def ttv_split(dat, split, val):
        """Slice dat into training, validation and test subsets.

        train is the first `split` items and valid is the `val` items that
        start at position split + 1.

        NOTE(review): as written, the item at position `split`, the item at
        position split + val + 1 and the final item of dat end up in none of
        the three subsets -- confirm this is intended.
        """
        valid_start = split + 1
        valid_stop = valid_start + val
        test_start = valid_stop + 1
        test_stop = len(dat) - 1

        return (
            dat[:split],
            dat[valid_start:valid_stop],
            dat[test_start:test_stop],
        )

    #this function pulls the Case Shiller out from the dataset to be the target
    #variable
    def get_target(dat):
        """Separate the 'CSUSHPINSA' column from the rest of dat.

        Returns a pair of DataFrames: the target column on its own, then
        every other column.
        """
        target_name = 'CSUSHPINSA'
        target = dat[target_name]
        features = dat.drop(target_name, axis=1)
        return pd.DataFrame(target), pd.DataFrame(features)

    def split_shift_data(dat, train_size, valid_size, num_lags, dif_order=0, num_dif=0, shift_ord=0, plot=False, second_dif=False, mdo_dat=None, both=True):
        """Difference the data, add lag features and split into train/validation/test.

        Parameters
        ----------
        dat : DataFrame
            Must contain a 'CSUSHPINSA' column (the target). It is not
            modified; all work happens on a copy.
        train_size, valid_size : int
            Number of leading rows for training, and of following rows for
            validation. The test set is everything after that except the
            last row of the data.
        num_lags : int
            Number of 'Lag-k' columns (target shifted by k) to add; the same
            number of 'MDO Lag-k' columns is added when mdo_dat is given.
        dif_order, num_dif : int
            When dif_order > 0, apply `dat - dat.shift(dif_order)` num_dif
            times.
        shift_ord : int
            When > 0, shift each of X_train, X_valid and X_test (separately)
            by this many rows.
        plot : bool
            When true, plot the three target subsets and save the combined
            figure under the hard-coded desktop path.
        second_dif : bool
            When true, apply one extra first difference.
        mdo_dat : DataFrame or None
            Optional frame whose first column is lagged and joined on the
            index.
        both : bool
            Unused; kept so existing calls keep working.

        Returns
        -------
        tuple
            (y_, y_train, X_train, y_valid, X_valid, y_test, X_test, num)
            where num is the number of leading rows lost to differencing,
            lagging and shifting.
        """
        target = 'CSUSHPINSA'

        # Work on a copy: the lag columns below are added by assignment and
        # would otherwise be written into the caller's DataFrame whenever no
        # differencing step has replaced `dat` with a new object.
        dat = dat.copy()

        sec = 0
        #option to difference the data
        if dif_order > 0:
            for _ in range(num_dif):
                dat = dat - dat.shift(dif_order)
        if second_dif:
            dat = dat - dat.shift(1)
            sec = 1

        #add lags for the independent inputs into the model
        for lag in range(1, num_lags + 1):
            dat['Lag-' + str(lag)] = dat[target].shift(lag)

        if mdo_dat is not None:
            # Only the lagged copies of the first mortgage column are used;
            # the unlagged column itself is left out.
            mdo_source = mdo_dat.iloc[:, 0]
            mdo_lags = pd.DataFrame(index=mdo_dat.index)
            for lag in range(1, num_lags + 1):
                mdo_lags['MDO Lag-' + str(lag)] = mdo_source.shift(lag)
            dat = pd.concat([dat, mdo_lags], axis=1)

        #separate target variable
        y_ = pd.DataFrame(dat[target])
        X_ = pd.DataFrame(dat.drop(target, axis=1))

        #split into train valid and test (the last row is left out of test)
        valid_end = train_size + valid_size
        test_end = len(dat) - 1
        y_train, X_train = y_[:train_size], X_[:train_size]
        y_valid, X_valid = y_[train_size:valid_end], X_[train_size:valid_end]
        y_test, X_test = y_[valid_end:test_end], X_[valid_end:test_end]

        #for indexing new data once we lose data to differencing, shifting, etc.
        num = (num_dif * dif_order) + num_lags + sec

        #if we are shifting the data
        if shift_ord > 0:
            X_train = X_train.shift(shift_ord)
            X_valid = X_valid.shift(shift_ord)
            X_test = X_test.shift(shift_ord)
            # Add to the running count instead of recomputing it, so the row
            # lost to second_dif is not forgotten.
            num += shift_ord

        #add in for a bias term
        X_train = sm.add_constant(X_train)
        X_valid = sm.add_constant(X_valid)
        X_test = sm.add_constant(X_test)

        if plot:
            filepath = '/Users/abbysuckow/Desktop/'
            total = pd.concat(
                [
                    y_train[num:].rename(columns={'CSUSHPINSA': 'Train'}),
                    y_valid.rename(columns={'CSUSHPINSA': 'Validation'}),
                    y_test.rename(columns={'CSUSHPINSA': 'Test'}),
                ],
                axis=1)
            total.plot()

            plt.savefig(filepath + 'splitdata')
            y_train.plot()
            y_valid.plot(color='orange')
            y_test.plot(color='green')

            plt.show()

        return y_, y_train, X_train, y_valid, X_valid, y_test, X_test, num

    def check_stationary(dat, plots=False):
        """Print the adfuller statistic and p-value for dat.

        When plots is True, also show the pandas autocorrelation plot and the
        statsmodels ACF plot (36 lags). Returns None.
        """
        adf_output = adfuller(dat)
        for template, position in (('ADF Statistic: %f', 0), ('p-value: %f', 1)):
            print(template % adf_output[position])

        # Nothing more to do unless plotting was requested
        if not plots == True:
            return

        pd.plotting.autocorrelation_plot(dat)
        plt.show()
        plot_acf(dat, lags=36)
        plt.show()
        return


    #this function is to transform the data back into the raw Case Shiller
    #index values for visualization and error reporting purposes
    def switch_back(y_hat, tru_dat, dif_order, month_forecast, adj_dat, PCE_dat):
        """Undo the modelling transformations on a run of predictions.

        Three steps are reversed in turn: seasonal differencing, the
        percentage-change transform, and the price-index adjustment.

        Parameters
        ----------
        y_hat : DataFrame
            Predictions; the LAST column is used.
        tru_dat : DataFrame
            Its first column supplies the first dif_order values that seed
            the un-differencing.
        dif_order : int
            Seasonal differencing order. Values <= 0 mean no seasonal
            differencing was applied.
        month_forecast : int
            The level at position month_forecast - 1 of adj_dat is the
            starting value the percentage changes are compounded from.
        adj_dat : DataFrame or Series
            Adjusted levels; the selected row must hold exactly one value.
        PCE_dat : DataFrame
            Its values are multiplied element-wise into the result.
            # assumes one column with one row per prediction -- TODO confirm

        Returns
        -------
        numpy.ndarray
            Array of shape (len(y_hat), 1) before the multiplication by
            PCE_dat.values (broadcasting applies).

        Raises
        ------
        ValueError
            If the selected adj_dat row does not hold exactly one value.
        IndexError
            If tru_dat has too few rows to seed the un-differencing.
        """
        n_preds = len(y_hat)
        diffs = np.asarray(y_hat.iloc[:, -1], dtype=float)
        undiff_preds = np.zeros([n_preds, 1])

        #get rid of the seasonal differencing
        if dif_order > 0:
            seed = np.asarray(tru_dat.iloc[:dif_order, 0], dtype=float)
            # Each value is its difference plus the value one season earlier:
            # the seed for the first season, earlier results afterwards.
            # Walking every index (rather than a rounded number of seasons)
            # makes sure a trailing partial season is restored too.
            for i in range(n_preds):
                if i < dif_order:
                    previous = seed[i]
                else:
                    previous = undiff_preds[i - dif_order, 0]
                undiff_preds[i, 0] = diffs[i] + previous
        else:
            undiff_preds[:, 0] = diffs

        #get rid of the percentage change
        x_0 = np.asarray(adj_dat.iloc[month_forecast - 1], dtype=float).ravel()
        if x_0.size != 1:
            raise ValueError(
                'expected a single starting level, got %d values' % x_0.size)
        # Compound the changes: level_f = x_0 * prod_{k <= f} (1 + change_k)
        unperc_preds = np.cumprod(undiff_preds + 1, axis=0) * x_0[0]

        #get rid of CPI adjustment
        unadj_preds = unperc_preds * PCE_dat.values

        return unadj_preds

    #function to remove seasonality from data based on passed in adjustments
    def adjust_months(target_dat, month_dat, adjustments):
        """Subtract a per-month adjustment from each target value.

        Parameters
        ----------
        target_dat : Series or DataFrame
            Values to adjust, read by position.
        month_dat : DataFrame
            Indicator columns. For each row the first of the leading 11
            columns (by position) that equals 1 selects the adjustment; a row
            with no such column is copied through unadjusted.
        adjustments : sequence
            adjustments[k] is subtracted when indicator column k is set.

        Returns
        -------
        DataFrame
            One float column labelled 0 with a fresh 0..n-1 index, where n is
            len(target_dat). Rows beyond len(month_dat) stay 0.0.
        """
        # Start from float zeros: the column is about to receive float
        # results, and writing floats into an integer column relies on a
        # silent dtype upcast.
        new_target_dat = pd.DataFrame(0.0, index=range(len(target_dat)), columns=range(1))
        n_indicators = 11

        for j in range(len(month_dat)):
            offset = None
            # Columns are checked in order and the scan stops at the first hit
            for k in range(n_indicators):
                if month_dat.iloc[j, k] == 1.:
                    offset = adjustments[k]
                    break

            if offset is None:
                new_target_dat.iloc[[j], [0]] = target_dat.iloc[j]
            else:
                new_target_dat.iloc[[j], [0]] = target_dat.iloc[j] - offset

        return new_target_dat
