In [112]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LinearRegression
import math

In [113]:
# Percentage of each subsidiary's rows that goes into the training split.
trainingPercent = 75
# Derived from trainingPercent (instead of a second hard-coded literal) so the
# two percentages always add up to 100 when the training share is tuned.
testingPercent = 100 - trainingPercent

In [114]:
def getDf(fileName):
    """Read the CSV at ``fileName`` with pandas and return the resulting DataFrame."""
    return pd.read_csv(fileName)

In [115]:
def parseData(df, subsidiaryNames=None):
    """Split ``df`` into one DataFrame per subsidiary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a ``'Subsidiary'`` column.
    subsidiaryNames : iterable of str, optional
        Subsidiary names to extract, in output order. When omitted, the
        eighteen names originally hard-coded in this notebook are used.

    Returns
    -------
    list of pandas.DataFrame
        One frame per requested name, holding the rows of ``df`` whose
        ``'Subsidiary'`` value equals that name. A name with no matching
        rows yields an empty frame, so the list length always equals the
        number of names.
    """
    if subsidiaryNames is None:
        # Order matters: downstream cells line results up by list position.
        subsidiaryNames = (
            'Batavia Bus Service',
            'Lift Line',
            'Livingston Area Transportation Service',
            'Orleans Transit Service',
            'Regional Transit Service',
            'RTS Access',
            'RTS Genesee',
            'RTS Livingston',
            'RTS Ontario',
            'RTS Orleans',
            'RTS Seneca',
            'RTS Wayne',
            'RTS Wyoming',
            'Seneca Transit Service',
            'Wayne Area Transit Service',
            'Wayne Area Transportation Service',
            'Wyoming Transit Service',
            'Wyoming Transportation Service',
        )

    return [df.loc[df['Subsidiary'] == name] for name in subsidiaryNames]

In [116]:
def getFeatureValues(subsidiaries):
    """Return, for each frame in ``subsidiaries``, its columns at positions 1 and 2.

    The result is a list of DataFrames in the same order as the input.
    """
    return [frame.iloc[:, 1:3] for frame in subsidiaries]

In [117]:
def getTargetValues(subsidiaries):
    """Return, for each frame in ``subsidiaries``, its column at position 3.

    Each element is kept as a single-column DataFrame (slice ``3:4``), and the
    list preserves the input order.
    """
    return [frame.iloc[:, 3:4] for frame in subsidiaries]

In [118]:
# Source CSV, resolved relative to the notebook's working directory.
dataFile = 'rochester-genesee-regional-transportation-authority-rgrta-percentage-of-buses-running-on-time-beginning-2009.csv'

# Load the raw table, then break it into one frame per subsidiary.
df = getDf(dataFile)
sub = parseData(df)

# Per-subsidiary feature frames and target frames, index-aligned with `sub`.
features, targets = getFeatureValues(sub), getTargetValues(sub)


In [119]:
def getTrainAndTest(feature,target,trainingPercent):
    """Split ``feature`` and ``target`` positionally into train and test parts.

    The first ``ceil(len(feature) * trainingPercent / 100)`` items form the
    training part and the rest form the test part. No shuffling is done, so
    the original ordering is preserved.

    Parameters
    ----------
    feature, target : sliceable sequences (e.g. list, DataFrame)
        Inputs to split; the split point is computed from ``len(feature)``.
    trainingPercent : int or float
        Percentage (0-100) of items assigned to the training part.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)``.
    """
    rowCount = len(feature)
    # Multiply first and divide once. The previous ``n * (pct * 0.01)`` form
    # accumulates binary rounding error (100 * 0.07 == 7.000000000000001), and
    # ceil() then silently moved one extra row into the training part.
    # Dividing by 100.0 keeps true division under Python 2 as well.
    splitAt = int(math.ceil(rowCount * trainingPercent / 100.0))

    train_x = feature[:splitAt]
    train_y = target[:splitAt]
    test_x = feature[splitAt:rowCount]
    test_y = target[splitAt:len(target)]

    return train_x,train_y,test_x,test_y

In [120]:
def getRegressor():
    """Create and return a new, unfitted ``LinearRegression`` instance."""
    return LinearRegression()

In [121]:
def getFitModel(regressor,trainingFeatures,trainingTargets):
    """Call ``regressor.fit`` on the training data and return whatever it returns."""
    fitted = regressor.fit(trainingFeatures, trainingTargets)
    return fitted

In [122]:
def getPrediction(regressor,feature):
    """Return the output of ``regressor.predict`` for the given ``feature`` input."""
    return regressor.predict(feature)

In [123]:
# Parallel lists: position k in each list belongs to the k-th subsidiary.
train_x, train_y, test_x, test_y = [], [], [], []

for subFeatures, subTargets in zip(features, targets):
    # getTrainAndTest returns (train_x, train_y, test_x, test_y) for one subsidiary.
    pieces = getTrainAndTest(subFeatures, subTargets, trainingPercent)
    for bucket, piece in zip((train_x, train_y, test_x, test_y), pieces):
        bucket.append(piece)


In [124]:
def getModel(train_x,train_y,test_x,test_y,tolerance=1):
    """Fit a regressor on the training data and score it on the test data.

    Parameters
    ----------
    train_x, train_y : array-like
        Training features and targets passed to ``getFitModel``.
    test_x, test_y : array-like
        Held-out features and the matching true target values.
    tolerance : number, optional
        A prediction counts as correct when its absolute error is strictly
        below this value. Defaults to 1, the threshold used originally.

    Returns
    -------
    tuple
        ``(re, correct)``: the fitted regressor and the number of test
        predictions within ``tolerance`` of the true value.
    """
    re = getFitModel(getRegressor(),train_x,train_y)
    prediction = getPrediction(re,test_x)

    # Flatten both sides to plain 1-D arrays before comparing. Iterating a
    # DataFrame yields its column labels rather than its rows, so the previous
    # zip(prediction, test_y) compared a prediction against a column name
    # whenever test_y was a (single-column) DataFrame.
    predicted = np.asarray(prediction).ravel()
    actual = np.asarray(test_y).ravel()

    correct = int(np.count_nonzero(np.abs(predicted - actual) < tolerance))
    return re,correct

In [127]:
# One entry per subsidiary: the predictions of a model fitted on that
# subsidiary's training split, evaluated on its test features.
prediction = []
for subTrainX, subTrainY, subTestX, unusedTestY in zip(train_x,train_y,test_x,test_y):
    # A fresh regressor per subsidiary; the true test targets are not needed here.
    re = getFitModel(getRegressor(), subTrainX, subTrainY)
    prediction.append(getPrediction(re, subTestX))


In [128]:
# Print every array held in the `prediction` list in full.
print(prediction)

[array([[89.57476051],
       [89.49827409],
       [89.42178767],
       [89.34530124],
       [89.26881482],
       [89.1923284 ],
       [89.11584198],
       [89.03935555],
       [88.96286913],
       [88.88638271],
       [88.80989629]]), array([[91.78901106],
       [91.75031114],
       [91.71161121],
       [91.67291129],
       [91.63421136],
       [91.59551144],
       [91.55681152],
       [91.51811159],
       [91.47941167],
       [91.44071174],
       [91.40201182]]), array([[97.34949232],
       [97.27320086],
       [97.19690939],
       [97.12061792],
       [97.04432646],
       [96.96803499],
       [96.89174353],
       [96.81545206],
       [96.7391606 ],
       [96.66286913],
       [96.58657767]]), array([[95.06352328],
       [95.22555617],
       [95.38758907],
       [95.54962196],
       [95.71165486],
       [95.87368775],
       [96.03572065],
       [96.19775354],
       [96.35978644],
       [96.52181933],
       [96.68385223]]), array([[93.78833575],
 