### This notebook contains code for regression analysis using keras. You will need to provide an input file specifying independent and dependent variables. The code will ignore any independent variable that has a correlation of greater than 90% with the dependent variable. Then, you will need to provide a test/future data file, that you want to predict on. This file must have the same independent variables as the first file.

# Importing Necessary packages

In [1]:
# Numeric Python Library.
import numpy as np
# Python Data Analysis Library.
import pandas as pd
# Scikit-learn Machine Learning Python Library modules.
#   Preprocessing utilities.
from sklearn import preprocessing
#   Cross-validation utilities.
from sklearn import cross_validation
# random forest library
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
# Python graphical library
import matplotlib.pyplot as plt
 
# Keras perceptron neuron layer implementation.
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation

Using TensorFlow backend.


# Class that handles reading and formatting data, and all regression models.

In [5]:
class Regression(object):
    "class that handles all tasks regarding Regression"
    def __init__(self):
        """constructor"""
        self.data=pd.DataFrame()
        self.DataX=pd.DataFrame()
        self.DataY=pd.DataFrame()
        self.futureX=pd.DataFrame()
        self.futureY=pd.DataFrame()
        self.prediction_duration=None
        self.selectedCols=[]
        self.maxY=None

    def loadData(self, filepath,fillData=False):
        """reads data from excel or csv"""
        """filepath=complete file path of the data file,must be string
        fillData=if true, front fill and back fill will be used to fill missing values"""
        if filepath[-4:]=='xlsx':
            self.data=pd.read_excel(filepath)
        elif filepath[-3:]=='csv':
            self.data=pd.read_csv(filepath)
        else:
            raise Exception("File format not supported")
        
        if fillData==True:
            """method for filling up missing values
            must be replaced with DCL"""
            self.data=self.data.fillna(method='ffill')
            self.data=self.data.fillna(method='bfill')
    
    def createData(self,xcol,ycol):
        """method for selecting input and output variables and creating x, y data"""
        """xcol=list of independent variables column, must be integer
        ycol=dependent variable column, must be integer
        """
        self.DataY=self.data.iloc[:,ycol]
        self.maxY=self.DataY.max()
        c=0
        if type(xcol)==list:
            for i in xrange(len(xcol)):
                """if correlation is more than 90%, ignore that column"""
                if self.data.iloc[:,xcol[i]].corr(self.DataY) <0.9:
                    self.DataX=pd.concat([self.DataX,self.data.iloc[:,xcol[i]]],axis=1)
                    self.selectedCols.append(xcol[i])
                    c +=1
                """if all columns have a correlation of more than 0.9, just consider the last column"""
                if i==(len(xcol)-1) and c==0:
                    self.DataX=pd.concat([self.DataX,self.data.iloc[:,xcol[i]]],axis=1)
                    self.selectedCols.append(xcol(i))
        else:
            self.DataX=self.data.iloc[:,xcol]
        
        
        # freeing up memory by deleting variable
        del self.data
    
    def transformData(self,df):
        """function for normalizing data"""
        return df/self.maxY
    
    def inverseTransform(self,df):
        """inverse transforming normalized data"""
        return df*self.maxY    
            
    def futureData(self,filepath,futureXcol=None,fillData=False):
        """future xData for prediction"""
        """NOTE: future xData must have same format as present xData"""
        futureData=pd.DataFrame()
        if filepath[-4:]=='xlsx':
            futureData=pd.read_excel(filepath)
        elif filepath[-3:]=='csv':
            futureData=pd.read_csv(filepath)
        else:
            raise Exception("File format not supported")
            
        if fillData==True:
            """method for filling up missing values
            must be replaced with DCL"""
            futureData=futureData.fillna(method='ffill')
            futureData=futureData.fillna(method='bfill')
        #print(futureData.head(5))
        for i in self.selectedCols:
            self.futureX=pd.concat([self.futureX,futureData.iloc[:,i]],axis=1)
        
        #futureY only if for testing, in deployment, we won't have target variable. we have to predict it
        self.futureY=futureData.iloc[:,17]
        # freeing up memory by deleting variable
        del futureData

    def RegressionNN(self,nb_epoch=10,batch_size=0):
        "neural net based regression model"
        """nb_epoch=no of iterations over the data
        batch_size= no of data points taken per iteration"""
        # New sequential network structure.
        model = Sequential()
        # Input layer with dimension 1 and hidden layer i with 256 neurons. 
        model.add(Dense(256, input_dim=len(self.DataX.columns), activation='relu'))
        # Dropout of 25% of the neurons and activation layer.
        model.add(Dropout(.25))
        model.add(Activation("linear"))
        # Hidden layer j with 128 neurons plus activation layer.
        #model.add(Dense(128, activation='relu'))
        #model.add(Dropout(.25))
        #model.add(Activation("linear"))
        # Hidden layer k with 128 neurons.
        #model.add(Dense(128, activation='relu'))
        #model.add(Dropout(.25))

        # Output Layer.
        model.add(Dense(1))

        # Model is derived and compiled using mean square error as loss
        # function, accuracy as metric and gradient descent optimizer.
        model.compile(loss='mse', optimizer='Adagrad', metrics=["accuracy"])
        #model.summary()

        
        if batch_size == 0:
            if len(self.DataX)>10000:
                batch_size=100
            elif len(self.DataX)<=10000:
                batch_size=10
        #fitting the model on input data
        model.fit(self.DataX.values, self.DataY.values, nb_epoch=nb_epoch,batch_size=batch_size, verbose=0)
        # predicting for future data
        pred_NN=model.predict(self.futureX.values)
        return pred_NN

    def regressionRandomForest(self):
        #building the random forest model
        regr_rf = RandomForestRegressor(max_depth=200, random_state=0)
        #fitting with x and y data
        regr_rf.fit(self.DataX,self.DataY)
        #predicting on future data
        rf_pred=regr_rf.predict(self.futureX)
        return rf_pred

    def regressionLinear(self):
        #building the linear model
        regr_linear = linear_model.LinearRegression()
        #fitting with x and y data
        regr_linear.fit(self.DataX,self.DataY)
        #predicting on future data
        linear_pred=regr_linear.predict(self.futureX)
        return linear_pred


# Sample code to test the class and how to implement it

In [7]:
df=Regression()
df.loadData('Matchams15.csv',fillData=True)
df.createData(xcol=range(10,17,1),ycol=17)


In [8]:
df.futureData('Matchams15Test.xlsx',fillData=True)


In [17]:
df.DataX.head()

Unnamed: 0,Kw/ML,tariff
0,146.881762,0.07365
1,147.748371,0.07365
2,148.104612,0.07365
3,148.3869,0.07365
4,148.346662,0.07365


In [18]:
df.DataX['Kw/ML'].corr(df.DataY)

0.86991457781939741

In [19]:
#predNN=df.RegressionNN(nb_epoch=10,batch_size=100)
predrf=df.regressionRandomForest()
predL=df.regressionLinear()


In [21]:
from sklearn.metrics import r2_score
#r2_score(predNN,df.futureY)
print(r2_score(predrf,df.futureY))
print(r2_score(predL,df.futureY))

0.992863995792
0.432548376263
