In [21]:
# general 
import datetime
import os

# data analysis and wrangling
import pandas as pd

import numpy as np

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# pickling
import pickle

%matplotlib inline

In [22]:
def train_model(read_directory, write_directory, line):
    """Fit a random-forest regressor for one line's CSV and pickle it.

    Reads ``read_directory + line + ".csv"`` (a CSV without a header row),
    fits a ``RandomForestRegressor`` predicting the ``Runtime`` column from
    the ``Day``, ``Hour``, ``JourneyPatternID`` and ``StopID`` columns, and
    writes the fitted model to ``write_directory + line + ".sav"``.

    Parameters
    ----------
    read_directory : str
        Prefix of the CSV path. It is concatenated directly with the file
        name, so it must end with a path separator.
    write_directory : str
        Prefix of the output path, concatenated the same way. The directory
        has to exist already; it is not created here.
    line : str
        File stem shared by the input ``.csv`` and the output ``.sav``.

    Returns
    -------
    None
        The only result is the pickle file written to disk.
    """
    # The files carry no header row, so column names are attached afterwards.
    # Assigning `columns` (rather than passing `names=`) keeps the hard
    # failure when a file does not have exactly these 14 columns.
    df = pd.read_csv(read_directory + line + ".csv", low_memory=False, header=None)
    df.columns = ["Timestamp", "LineID", "JourneyPatternID", "TimeFrame",
                  "VehicleJourneyID", "Lon", "Lat", "VehicleID", "StopID",
                  "AtStop", "HumanTime", "Day", "Hour", "Runtime"]

    # Descriptive features and target.
    features = ["Day", "Hour", "JourneyPatternID", "StopID"]

    # NOTE(review): scikit-learn's forests operate on numeric arrays; confirm
    # that the category dtype is actually needed for the fit.
    X = df[features].astype('category')
    y = df.Runtime

    # max_features=1.0 means "consider every feature at each split". That is
    # what 'auto' meant for a regressor, but 'auto' was deprecated in
    # scikit-learn 1.1 and is rejected by 1.3+.
    model = RandomForestRegressor(n_estimators=100, max_features=1.0,
                                  oob_score=True, random_state=1)
    model.fit(X, y)

    # Use a context manager so the file is flushed and closed even if
    # pickling raises part-way through.
    with open(write_directory + line + ".sav", 'wb') as model_file:
        pickle.dump(model, model_file)



In [25]:
def main(read_directory, write_directory, dry_run=True):
    """Visit every ``.csv`` file in ``read_directory`` and handle missing models.

    For each CSV file whose matching ``write_directory + <stem> + ".sav"``
    file does not exist, the missing model path is printed. When ``dry_run``
    is false, ``train_model`` is then called to create it. CSV files that
    already have a model are skipped. A "Finished <file>" line followed by a
    blank line is printed for every CSV file either way.

    Parameters
    ----------
    read_directory : str
        Directory that is listed for ``.csv`` files.
    write_directory : str
        Prefix of the model paths. It is concatenated directly with the file
        name, so it must end with a path separator.
    dry_run : bool, optional
        If true (the default), only report which models are missing and do
        not train anything. Pass ``False`` to train the missing models.

    Returns
    -------
    None
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for read_file in sorted(os.listdir(read_directory)):
        if not read_file.endswith(".csv"):
            continue

        line = read_file[:-len(".csv")]
        model_path = write_directory + line + ".sav"

        # Only lines without a stored model need any work.
        if not os.path.isfile(model_path):
            print(model_path)
            if not dry_run:
                train_model(read_directory, write_directory, line)

        print("Finished", read_file)
        print()
    print("Finished main!")

In [26]:
# Input CSV location and model output location. Both keep their trailing
# slash because file names are appended by plain string concatenation.
read_directory, write_directory = "bus_data/line_data/", "bus_data/sklearn_models/"

main(read_directory=read_directory, write_directory=write_directory)

Finished 102.csv

Finished 104.csv

Finished 11.csv

Finished 111.csv

Finished 114.csv

Finished 116.csv

Finished 118.csv

Finished 120.csv

Finished 122.csv

Finished 123.csv

Finished 13.csv

Finished 130.csv

Finished 14.csv

Finished 140.csv

Finished 142.csv

Finished 145.csv

Finished 14C.csv

Finished 15.csv

Finished 150.csv

Finished 151.csv

Finished 15A.csv

Finished 15B.csv

Finished 16.csv

Finished 161.csv

Finished 16C.csv

Finished 17.csv

Finished 17A.csv

Finished 18.csv

Finished 184.csv

Finished 185.csv

Finished 220.csv

Finished 236.csv

Finished 238.csv

Finished 239.csv

Finished 25.csv

Finished 25A.csv

Finished 25B.csv

Finished 25X.csv

Finished 26.csv

Finished 27.csv

Finished 270.csv

Finished 27A.csv

Finished 27B.csv

Finished 27X.csv

Finished 29A.csv

Finished 31.csv

Finished 31A.csv

Finished 31B.csv

Finished 32.csv

Finished 32A.csv

Finished 32B.csv

Finished 32X.csv

Finished 33.csv

Finished 33A.csv

Finished 33B.csv

Finished 33X.csv

Finis