In [165]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LinearRegression
import math

In [166]:
# Split sizes, expressed as percentages of the dataset's rows.
# trainingData/valData/testingData turn these into row counts; valData and
# testingData also read trainingPercent/validationPercent directly as globals.
trainingPercent = 70
validationPercent = 10
testingPercent = 20

In [167]:
def getDf(fileName):
    """Load a CSV into a pandas DataFrame.

    Parameters
    ----------
    fileName : path or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    return pd.read_csv(fileName)

In [168]:
def encodeSubsidiary(df):
    """Replace the ``Subsidiary`` column with integer category codes.

    The column is converted to a pandas categorical and then overwritten
    with that categorical's codes. The frame passed in is modified in
    place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Subsidiary`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``Subsidiary`` holding the numeric codes.
    """
    subsidiaryCodes = df['Subsidiary'].astype('category').cat.codes
    df['Subsidiary'] = subsidiaryCodes
    return df

In [169]:
def getFeatureValues(df):
    """Return the first three columns of ``df`` as an array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns at positions 0, 1 and 2 are the features.

    Returns
    -------
    numpy.ndarray
        The ``.values`` of those columns, one row per input row.
    """
    return df.iloc[:, :3].values

In [170]:
def normalizeValues(m):
    """Standardize ``m`` to zero mean and unit standard deviation.

    The statistics come from ``m.mean()`` and ``m.std()`` exactly as those
    methods behave on the object passed in: a pandas DataFrame is therefore
    standardized column by column, while a NumPy array is standardized with
    one global mean and standard deviation.

    A standard deviation of exactly zero (constant data) is treated as 1
    before dividing, so constant columns come out as 0 instead of the
    NaN that 0/0 would produce.

    Parameters
    ----------
    m : pandas.DataFrame, pandas.Series or numpy.ndarray
        Values to standardize. The input is not modified.

    Returns
    -------
    Same kind of object as ``m``
        The standardized values.
    """
    center = m.mean()
    scale = m.std()
    if np.isscalar(scale):
        # Array / Series input: a single scalar standard deviation.
        if scale == 0:
            scale = 1.0
    else:
        # DataFrame input: one standard deviation per column.
        scale = scale.where(scale != 0, 1.0)
    return (m - center) / scale

In [171]:
def targetValues(df):
    """Return the column at position 3 of ``df`` as a 2-D array.

    The column is selected with a one-wide slice, so the result keeps a
    second axis of length 1 (shape ``(n_rows, 1)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose fourth column is the target.

    Returns
    -------
    numpy.ndarray
        The ``.values`` of that single column.
    """
    return df.iloc[:, 3:4].values

In [172]:
def shuffleRows(df, random_state=None):
    """Return the rows of ``df`` in random order with a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle. It is not modified.
    random_state : int, numpy random generator or None, optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the
        shuffle (and therefore the train/validation/test split built from
        it) reproducible. The default ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of ``df`` exactly once, in shuffled
        order, and the old index discarded.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [173]:
def trainingData(df, trainingPercent):
    """Return the leading rows of ``df`` that form the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Full (already shuffled) dataset.
    trainingPercent : number
        Share of rows to keep, as a percentage (e.g. ``70``).

    Returns
    -------
    pandas.DataFrame
        The first ``ceil(len(df) * trainingPercent / 100)`` rows.
    """
    rowCount = math.ceil(len(df) * trainingPercent * 0.01)   # Round up so a fractional row is kept
    return df[:int(rowCount)]

In [174]:
def valData(df, validationPercent):
    """Return the validation split: the rows right after the training rows.

    The start offset is the training row count, computed from the
    module-level ``trainingPercent`` (it is not a parameter here).

    Parameters
    ----------
    df : pandas.DataFrame
        Full (already shuffled) dataset.
    validationPercent : number
        Share of rows for validation, as a percentage (e.g. ``10``).

    Returns
    -------
    pandas.DataFrame
        Up to ``ceil(len(df) * validationPercent / 100)`` rows starting
        at the end of the training split.
    """
    total = len(df)
    start = int(math.ceil(total * trainingPercent * 0.01))
    stop = start + int(math.ceil(total * validationPercent * 0.01))
    return df[start:stop]

In [175]:
def testingData(df, testingPercent):
    """Return the testing split: the rows after training and validation.

    The start offset is the sum of the training and validation row
    counts, both computed from the module-level ``trainingPercent`` and
    ``validationPercent`` (they are not parameters here).

    Parameters
    ----------
    df : pandas.DataFrame
        Full (already shuffled) dataset.
    testingPercent : number
        Share of rows for testing, as a percentage (e.g. ``20``).

    Returns
    -------
    pandas.DataFrame
        Up to ``ceil(len(df) * testingPercent / 100)`` rows starting at
        the end of the validation split; fewer if ``df`` runs out.
    """
    total = len(df)
    trainRows, valRows, testRows = [
        int(math.ceil(total * pct * 0.01))
        for pct in (trainingPercent, validationPercent, testingPercent)
    ]
    start = valRows + trainRows
    return df[start:start + testRows]

In [176]:
def getRegressor():
    """Create a new, unfitted ``LinearRegression`` estimator.

    Returns
    -------
    LinearRegression
        A default-constructed instance.
    """
    return LinearRegression()

In [177]:
def getFitModel(regressor,trainingFeatures,trainingTargets):
    """Fit ``regressor`` on the training data and return the result.

    Parameters
    ----------
    regressor : object with a ``fit(X, y)`` method
        Estimator to train.
    trainingFeatures : array-like
        Feature matrix passed as the first argument to ``fit``.
    trainingTargets : array-like
        Target values passed as the second argument to ``fit``.

    Returns
    -------
    object
        Whatever ``regressor.fit`` returns.
    """
    fitted = regressor.fit(trainingFeatures, trainingTargets)
    return fitted

In [184]:
def getPrediction(regressor,feature):
    """Run ``regressor.predict`` on ``feature`` and return its output.

    Parameters
    ----------
    regressor : object with a ``predict(X)`` method
        A fitted estimator.
    feature : array-like
        Feature matrix to predict for.

    Returns
    -------
    object
        Whatever ``regressor.predict`` returns.
    """
    return regressor.predict(feature)

In [187]:
# Driver cell: load the CSV, preprocess it, split it into training /
# validation / testing parts, fit a linear regression on the training part,
# and predict on the testing part.
df = getDf('rochester-genesee-regional-transportation-authority-rgrta-percentage-of-buses-running-on-time-beginning-2009.csv')
# Name of the column at position 3; not referenced again in this cell.
# NOTE(review): presumably the same column as 'Percent On-Time' below — confirm.
pos = df.columns[3]
# Replace the Subsidiary labels with integer category codes.
df = encodeSubsidiary(df)
# NOTE(review): this standardizes the whole frame, so the column that
# targetValues() reads later is scaled too, and the statistics are computed
# on all rows before the split — confirm this is intended.
df = normalizeValues(df)
# Unseeded shuffle: the split below differs from run to run.
df = shuffleRows(df)
# Fill missing 'Percent On-Time' values with that column's median. This runs
# after normalization, so the median is taken from the standardized values.
median_OnTime = df['Percent On-Time'].median()
df['Percent On-Time'] = df['Percent On-Time'].fillna(median_OnTime)
# Consecutive row slices of the shuffled frame, sized by the percent constants.
td = trainingData(df,trainingPercent)
vd = valData(df,validationPercent)
testd = testingData(df,testingPercent)
# Features come from columns 0-2 and targets from column 3 of each split.
td_x = getFeatureValues(td)
td_y = targetValues(td)
# The validation arrays are built here but not used further in this cell.
vd_x = getFeatureValues(vd)
vd_y = targetValues(vd)
testd_x = getFeatureValues(testd)
testd_y = targetValues(testd)
# Fit on the training split only, then predict for the testing features.
regressor = getRegressor()
regressor = getFitModel(regressor,td_x,td_y)
prediction = getPrediction(regressor,testd_x)


[[-0.06125489]
 [ 0.02343836]
 [ 0.28585085]
 [ 0.30523897]
 [ 0.09682225]
 [-0.37978911]
 [ 0.07190693]
 [-0.06873891]
 [ 0.18585225]
 [ 0.32088508]
 [ 0.31255137]
 [ 0.21433796]
 [ 0.10787666]
 [-0.09492704]
 [-0.38259561]
 [ 0.21901548]
 [-0.08480813]
 [ 0.19597116]
 [-0.0622762 ]
 [ 0.08202584]
 [-0.19322625]
 [ 0.31535788]
 [-0.53846099]
 [-0.16218899]
 [-0.08667914]
 [-0.20053865]
 [ 0.03355727]
 [-0.17520021]
 [ 0.07462763]
 [ 0.18032505]
 [-0.20615167]
 [-0.07664605]
 [ 0.19222915]
 [-0.30980646]
 [-0.38812282]
 [-0.17613572]
 [-0.37247671]
 [-0.10700276]
 [ 0.19129365]
 [ 0.08576785]
 [-0.10513176]
 [-0.17060851]
 [-0.05394249]
 [-0.37341221]
 [ 0.09120924]
 [ 0.00694256]
 [-0.07571054]
 [-0.08676495]
 [ 0.10047845]
 [-0.17562333]
 [ 0.31705726]
 [ 0.10974767]
 [ 0.29971177]
 [-0.07758155]
 [ 0.11340387]
 [ 0.02624487]
 [-0.27894082]
 [-0.00241246]
 [ 0.21425215]
 [ 0.03823478]
 [ 0.10507015]
 [ 0.09214474]
 [-0.09781936]
 [ 0.32173477]
 [-0.08863596]
 [ 0.07658444]
 [ 0.28023