In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LinearRegression
import math

In [15]:
# Train / validation / test split, each given as a percentage of all rows.
trainingPercent, validationPercent, testingPercent = 70, 10, 20

In [16]:
def getDf(fileName):
    """Read a CSV file into a pandas DataFrame.

    fileName is passed to pandas.read_csv unchanged; the parsed frame is
    returned without any further processing.
    """
    return pd.read_csv(fileName)

In [17]:
def encodeSubsidiary(df):
    """Replace the categorical 'Subsidiary' column with integer category codes.

    The codes are those assigned by pandas.Categorical to the column's
    distinct values (missing values receive the code -1, per the pandas
    documentation).

    The input frame is left untouched; a new DataFrame is returned with the
    'Subsidiary' column replaced in its original position.
    """
    subsidiaryCodes = pd.Categorical(df['Subsidiary']).codes
    # assign() works on a copy, so re-running the calling cell always starts
    # from the same un-encoded frame instead of a silently mutated one.
    return df.assign(Subsidiary=subsidiaryCodes)

In [18]:
def getFeatureValues(df):
    """Return the first three columns of df (positions 0-2) as a NumPy array.

    These columns are used as the feature matrix for the regression.
    """
    return df.iloc[:, :3].values

In [19]:
def normalizeValues(m):
    """Standardize m by subtracting its mean and dividing by its standard deviation.

    The statistics come from m's own mean() and std() methods, so the
    reduction axis and degrees of freedom follow the type of m (for a pandas
    DataFrame the defaults give per-column statistics).

    Returns the standardized object; m itself is not modified.
    """
    centre = m.mean()
    spread = m.std()
    return (m - centre) / spread

In [20]:
def targetValues(df):
    """Return the column at position 3 of df as a NumPy array.

    The column is selected with a one-column slice, so the result keeps two
    dimensions (one row per sample, a single column).
    """
    return df.iloc[:, 3:4].values

In [21]:
def shuffleRows(df, seed=None):
    """Return df with its rows in random order and a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are to be shuffled; it is not modified.
    seed : int or None, optional
        Forwarded to DataFrame.sample as random_state. Pass a fixed value to
        get the same shuffle (and therefore the same train / validation /
        test split) on every run. The default, None, keeps the previous
        behaviour of a different order on each call.

    Returns
    -------
    pandas.DataFrame
        All rows of df, shuffled, with the old index dropped.
    """
    return df.sample(frac=1, random_state=seed).reset_index(drop=True)

In [22]:
def trainingData(df, trainingPercent):
    """Return the leading trainingPercent percent of df's rows.

    The row count is len(df) * trainingPercent / 100, rounded up to the next
    whole row.
    """
    exactRows = len(df) * trainingPercent * 0.01
    lastTrainingRow = int(math.ceil(exactRows))
    return df[:lastTrainingRow]

In [23]:
def valData(df, validationPercent, trainPercent=None):
    """Return the validation slice: the rows that directly follow the training slice.

    Parameters
    ----------
    df : sliceable with len()
        The (already shuffled) data set.
    validationPercent : number
        Size of the validation slice as a percentage of len(df), rounded up
        to a whole row.
    trainPercent : number or None, optional
        Size of the preceding training slice as a percentage of len(df).
        When omitted, the notebook-level ``trainingPercent`` is used, which
        is what this function always did before the parameter existed.

    Returns
    -------
    The rows of df from the end of the training slice up to (not including)
    the start of the testing slice.
    """
    if trainPercent is None:
        # Backward-compatible default: the global split configuration.
        trainPercent = trainingPercent
    totalRows = len(df)
    firstRow = int(math.ceil(totalRows * trainPercent * 0.01))
    rowCount = int(math.ceil(totalRows * validationPercent * 0.01))
    return df[firstRow:firstRow + rowCount]

In [24]:
def testingData(df, testingPercent):
    validationRows = int(math.ceil(len(df)*validationPercent*0.01))
    trainingDataRows = int(math.ceil(len(df)*trainingPercent*0.01))
    testingRows = int(math.ceil(len(df)*testingPercent*0.01))
    vRow = validationRows +trainingDataRows
    testRow = validationRows +trainingDataRows + testingRows
    vMat = df[vRow:testRow]
    return vMat

In [25]:
def getRegressor():
    """Create and return a new LinearRegression instance with default settings."""
    return LinearRegression()

In [26]:
def getFitModel(regressor,trainingFeatures,trainingTargets):
    """Fit regressor on the training data and return the result of fit().

    trainingFeatures and trainingTargets are passed to regressor.fit
    unchanged, in that order.
    """
    fittedModel = regressor.fit(trainingFeatures, trainingTargets)
    return fittedModel

In [27]:
def getPrediction(regressor,feature,target=None):
    """Return regressor.predict(feature).

    Parameters
    ----------
    regressor : object with a predict() method
        The fitted model.
    feature : array-like
        Feature matrix passed to predict() unchanged.
    target : optional
        Ignored. Predictions never depend on the true targets; the parameter
        is kept (now optional) only so existing three-argument calls keep
        working.
    """
    return regressor.predict(feature)

In [30]:
def getLastCol(df):
    """Return the index of df.dtypes, i.e. the column labels of df.

    NOTE: despite the name, this returns every column label, not only the
    last one.
    """
    columnTypes = df.dtypes
    return columnTypes.index

In [53]:
# End-to-end run: load the CSV, preprocess it, split it into training /
# validation / testing partitions, and fit a linear regression on the
# training partition.
df = getDf('rochester-genesee-regional-transportation-authority-rgrta-percentage-of-buses-running-on-time-beginning-2009.csv')
# Label of the column at position 3; not referenced again in this cell.
pos = df.columns[3]
df = encodeSubsidiary(df)
# normalizeValues is applied to the whole frame, so the column that
# targetValues() later reads (position 3) is rescaled together with the
# feature columns.
df = normalizeValues(df)
# shuffleRows is called without a seed, so the row order (and therefore the
# split below) differs from run to run.
df = shuffleRows(df)
# Fill missing 'Percent On-Time' entries with the column median. The median
# is taken after normalization and over all rows, before any split is made.
median_OnTime = df['Percent On-Time'].median()
df['Percent On-Time'] = df['Percent On-Time'].fillna(median_OnTime)
# Consecutive, non-overlapping slices of the shuffled frame.
td = trainingData(df,trainingPercent)
vd = valData(df,validationPercent)
testd = testingData(df,testingPercent)
# For each partition: *_x is the feature matrix, *_y the target array.
td_x = getFeatureValues(td)
td_y = targetValues(td)
vd_x = getFeatureValues(vd)
vd_y = targetValues(vd)
testd_x = getFeatureValues(testd)
testd_y = targetValues(testd)
# Only the training partition is used to fit the model.
regressor = getRegressor()
regressor = getFitModel(regressor,td_x,td_y)
