## SF Bike-sharing Dock Size Optimization

Gege Zhang (gegez@mit.edu)

Alexandru Socolov (socolov@mit.edu)

In [None]:
using JuMP, Gurobi, CSV, StatsBase, DataFrames, LinearAlgebra, Distributions, Plots, Random, Statistics, Dates

In [None]:
# Load the merged status/station/weather table; cells containing "NA" become `missing`.
# `DataFrame(CSV.File(...))` is used instead of the sink-less `CSV.read(path; ...)` form,
# which newer CSV.jl releases reject, and it yields ordinary mutable columns.
bikes = DataFrame(
    CSV.File("status_station_weather_NAremoved.csv"; header = 1, missingstring = "NA"),
)
# Keep column 3 onwards (the first two columns are presumably export index columns —
# TODO confirm against the CSV).
bikes = bikes[!, 3:end];

In [None]:
# Treat station id and zip code as categorical labels rather than numbers.
# `df[!, col] = v` replaces the whole column; `df[:, col] = v` writes in place into the
# existing column and therefore cannot change its element type to a categorical string.
bikes[!, :station_id] = categorical(string.(bikes[!, :station_id]))
bikes[!, :zip_code] = categorical(string.(bikes[!, :zip_code]));

In [None]:
# Preview the first five rows of the cleaned table.
first(bikes, 5)

In [None]:
# Print the column names of `bikes`.
println(names(bikes))

### Splitting the data: 
train = through the end of May 2015 (dates on or before 2015-05-31)

test = June - Aug 2015

In [None]:
# Chronological split: rows dated on or before 2015-05-31 form the training set,
# every later row goes to the test set.
bikes_train, bikes_test = let in_train = bikes[!, :date] .<= Date("2015-05-31")
    bikes[in_train, :], bikes[.!in_train, :]
end;

In [None]:
# Write both splits to CSV files (paths are relative to the working directory).
CSV.write("bike_train.csv", bikes_train)
CSV.write("bike_test.csv", bikes_test)

In [None]:
# Report (rows, columns) of the training split, then of the test split.
foreach(split -> println(size(split)), (bikes_train, bikes_test))

### Fit ORT (Optimal Regression Tree)

In [None]:
# Feature table for training: every column except the ones listed below (which include
# the target `:avg_docks_available`).
# `Symbol(col)` keeps the test valid whether `names` returns Symbols or Strings; with a
# plain `col in [:a, ...]` check, String names never match, nothing is dropped and the
# target leaks into the features.
bikes_train_X = bikes_train[:, filter(
    col -> Symbol(col) ∉ (
        :installation_date, :dock_count, :date, :avg_docks_available,
        :avg_bikes_available, :events, :precipitation_inches, :population, :station_id,
    ),
    names(bikes_train),
)]
# Regression target: average number of available docks.
bikes_train_Y = bikes_train[!, :avg_docks_available];

In [None]:
# Base regressor handed to the grid search; `max_depth` and `minbucket` are the two
# parameters the grid below varies.
lnr = IAI.OptimalTreeRegressor(random_seed=1, max_depth=5, minbucket=10, missingdatamode = :separate_class)

# Candidate values: max_depth in 1..10, minbucket in 50, 70, 90.
grid = IAI.GridSearch(lnr, max_depth=1:10, minbucket = collect(50:20:100))

# Run the search on the training split.
IAI.fit!(grid, bikes_train_X, bikes_train_Y)

In [None]:
# Learner held by the fitted grid search (presumably the best parameter combination —
# confirm against the IAI documentation).
lnr_fitted = IAI.get_learner(grid)

In [None]:
# Feature table for the test split, built with the same exclusions as the training
# features. `Symbol(col)` keeps the test valid whether `names` returns Symbols or
# Strings; with a plain `col in [:a, ...]` check, String names never match and the
# target column would stay among the features.
bikes_test_X = bikes_test[:, filter(
    col -> Symbol(col) ∉ (
        :installation_date, :dock_count, :date, :avg_docks_available,
        :avg_bikes_available, :events, :precipitation_inches, :population, :station_id,
    ),
    names(bikes_test),
)]
# Held-out target values.
bikes_test_Y = bikes_test[!, :avg_docks_available];

In [None]:
# Print the learner's score on the held-out test split (presumably R² for a regressor —
# confirm against the IAI documentation).
println(IAI.score(lnr_fitted, bikes_test_X, bikes_test_Y))

In [None]:
# NOTE(review): this fits `lnr` directly and rebinds `lnr_fitted`, replacing the learner
# obtained from the grid search above. Whether `lnr` still carries its constructor
# parameters or the tuned ones at this point depends on IAI — confirm this is intended.
lnr_fitted = IAI.fit!(lnr, bikes_train_X, bikes_train_Y)
# Test-split predictions from that refitted learner.
bikes_test_prediction = IAI.predict(lnr_fitted, bikes_test_X);

In [None]:
# Out-of-sample R²: one minus the ratio of the model's squared error to the squared
# error of a constant predictor equal to the test-set mean.
SSE = sum(abs2, bikes_test_prediction .- bikes_test_Y)
SST = sum(abs2, bikes_test_Y .- mean(bikes_test_Y))
OSR2 = 1 - SSE / SST

In [None]:
# Sum of squared prediction errors on the test split.
SSE = sum(abs2, bikes_test_prediction .- bikes_test_Y)

In [None]:
# Learner with fixed hyper-parameters for the final model (presumably the values chosen
# by the grid search above — confirm; `cp` is hard-coded to full precision).
lnr = IAI.OptimalTreeRegressor(random_seed=1, max_depth=7, minbucket=70, 
    cp = 0.000145028525998319728, missingdatamode = :separate_class)

In [None]:
# Data for the final model: every row dated before 2015-08-01.
bikes2 = bikes[bikes[!, :date] .< Date("2015-08-01"), :]
# Feature table. The column list is taken from `bikes2` itself (it was previously read
# from `bikes_test`), and `Symbol(col)` keeps the exclusion test valid whether `names`
# returns Symbols or Strings — otherwise the target column could remain in the features.
bikes2_X = bikes2[:, filter(
    col -> Symbol(col) ∉ (
        :installation_date, :dock_count, :date, :avg_docks_available,
        :avg_bikes_available, :events, :precipitation_inches, :population, :station_id,
    ),
    names(bikes2),
)]
# Regression target for the final fit.
bikes2_Y = bikes2[!, :avg_docks_available];

In [None]:
# Fit the fixed-parameter learner on all pre-August data; `lnr_final` is the model read
# by `prediction_calculation` below.
lnr_final = IAI.fit!(lnr, bikes2_X, bikes2_Y)

In [None]:
"""
    prediction_calculation(station_X, station_Y, matrix = predict_matrix,
                           learner = lnr_final; horizon = 31)

Roll a one-step-ahead forecast forward over `horizon` consecutive rows of `matrix`,
writing each prediction back into `matrix` as a lag feature for later rows.

Row 1 of `matrix` is the last row whose lag features are already known; rows
`2:horizon + 1` are forecast one after another.

# Arguments
- `station_X`, `station_Y`: accepted for call compatibility; not read.
- `matrix`: feature table, **mutated in place**. Columns 22, 23 and 24 hold the 1-, 7-
  and 30-step lag features. Defaults to the global `predict_matrix`.
- `learner`: fitted model passed to `IAI.predict`. Defaults to the global `lnr_final`.

# Keywords
- `horizon`: number of rows to forecast.

# Returns
A `horizon × 1` array whose entry `i` is the prediction for row `i + 1` of `matrix`.

# Throws
- `ArgumentError` if `matrix` has fewer than `horizon + 1` rows.
"""
function prediction_calculation(
    station_X, station_Y, matrix = predict_matrix, learner = lnr_final; horizon = 31
)
    lag1_col, lag7_col, lag30_col = 22, 23, 24
    nrows = size(matrix, 1)
    nrows >= horizon + 1 || throw(ArgumentError(
        "need at least $(horizon + 1) rows to forecast $horizon steps, got $nrows",
    ))

    prediction_Y = Array{Union{Missing, Any}}(missing, horizon, 1)
    for i in 1:horizon
        # The model is applied to the two-row slice i:i+1 and the second entry, i.e.
        # the prediction for row i + 1, is kept.
        point_prediction = IAI.predict(learner, matrix[i:i+1, :])
        forecast = point_prediction[2]
        prediction_Y[i, 1] = forecast

        # Feed the forecast forward: it is the 1-step lag of the next row, the 7-step
        # lag of the row seven further on and the 30-step lag of the row thirty further
        # on. Bounds are derived from the row count; the previous `i < 24` guard
        # stopped one row early and left the 7-step lag of row 32 at its placeholder.
        if i + 2 <= nrows
            matrix[i + 2, lag1_col] = forecast
        end
        if i + 8 <= nrows
            matrix[i + 8, lag7_col] = forecast
        end
        if i + 31 <= nrows
            matrix[i + 31, lag30_col] = forecast
        end
    end
    return prediction_Y
end

In [None]:
# Recursive 31-step forecast for every station; column k of `prediction` belongs to the
# k-th distinct station id.
station_ids = unique(bikes[:, :station_id])
# Sized from the data instead of a hard-coded station count.
prediction = Array{Union{Missing, Any}}(missing, 31, length(station_ids))
for (k, station_id) in enumerate(station_ids)
    # `prediction_calculation` reads `predict_matrix` as a global. Without this
    # declaration the assignment below creates a loop-local variable whenever no global
    # of that name exists yet, and the call then fails with an UndefVarError.
    global predict_matrix

    # Rows of this station dated 2015-07-01 or later.
    station = bikes[bikes[!, :station_id] .== station_id, :]
    station = station[station[!, :date] .>= Date("2015-07-01"), :]
    # `Symbol(col)` keeps the exclusion test valid whether `names` returns Symbols or
    # Strings.
    station_X = station[:, filter(
        col -> Symbol(col) ∉ (
            :installation_date, :dock_count, :date, :avg_docks_available,
            :avg_bikes_available, :events, :precipitation_inches, :population,
            :station_id,
        ),
        names(station),
    )]
    station_Y = station[!, :avg_docks_available]

    # First 21 feature columns from row 31 onwards; the three lag columns are then
    # re-created as columns 22:24 with 0.0 placeholders.
    # NOTE(review): assumes one row per day in date order, so that row 31 is
    # 2015-07-31 — confirm.
    predict_matrix = station_X[31:end, 1:21]
    predict_matrix[!, :avg_docks_1D] .= 0.0
    predict_matrix[!, :avg_docks_7D] .= 0.0
    predict_matrix[!, :avg_docks_30D] .= 0.0

    # Seed the lag columns with observed values wherever the lagged row is one of the
    # first 31 observations.
    predict_matrix[1, 22] = station_Y[30]
    predict_matrix[1, 23] = station_Y[24]
    predict_matrix[1, 24] = station_Y[1]

    predict_matrix[2, 22] = station_Y[31]
    predict_matrix[2:8, 23] = station_Y[25:31]
    predict_matrix[2:31, 24] = station_Y[2:31]

    # Remaining lag cells are filled with the model's own predictions during the
    # roll-forward.
    prediction[:, k] = prediction_calculation(station_X, station_Y)
end

In [None]:
# Wrap the prediction matrix as a table so it can be written with `CSV.write`.
prediction_table = Tables.table(prediction);

In [None]:
# Display the prediction matrix (rows: forecast steps, columns: stations).
prediction

In [None]:
# Save the per-station forecasts to prediction.csv.
CSV.write("prediction.csv", prediction_table)

# Station 2 Analysis

In [None]:
# Rows belonging to station "2", split at the same cut-off date as the full data set:
# dates up to 2015-05-31 for training, later dates for testing.
bikes_station2 = bikes[bikes[!, :station_id] .== "2", :]
bikes_station2_train, bikes_station2_test =
    let in_train = bikes_station2[!, :date] .<= Date("2015-05-31")
        bikes_station2[in_train, :], bikes_station2[.!in_train, :]
    end;

In [None]:
# Station-2 training features: every column except the ones listed below (which
# include the target). `Symbol(col)` keeps the test valid whether `names` returns
# Symbols or Strings; with a plain `col in [:a, ...]` check, String names never match
# and the target would stay among the features.
# NOTE(review): unlike the all-station feature lists, `:population` is not excluded
# here — confirm that is intended.
bikes_station2_train_X = bikes_station2_train[:, filter(
    col -> Symbol(col) ∉ (
        :installation_date, :dock_count, :date, :avg_docks_available,
        :avg_bikes_available, :events, :precipitation_inches, :station_id,
    ),
    names(bikes_station2_train),
)]
# Regression target for station 2.
bikes_station2_train_Y = bikes_station2_train[!, :avg_docks_available];

In [None]:
# Base regressor for the station-2 model; rebinds the global `lnr`.
lnr = IAI.OptimalTreeRegressor(random_seed=1, max_depth=5, minbucket=10, missingdatamode = :separate_class)

# Candidate values: max_depth in 1..10, minbucket in 5, 10, ..., 50.
grid = IAI.GridSearch(lnr, max_depth=1:10, minbucket = collect(5:5:50))

# Run the search on the station-2 training split.
IAI.fit!(grid, bikes_station2_train_X, bikes_station2_train_Y)

In [None]:
# Learner held by the fitted station-2 grid search (presumably the best parameter
# combination — confirm); rebinds the global `lnr_fitted`.
lnr_fitted = IAI.get_learner(grid)

In [None]:
# Station-2 test features, built with the same exclusions as the station-2 training
# features. `Symbol(col)` keeps the test valid whether `names` returns Symbols or
# Strings; with a plain `col in [:a, ...]` check, String names never match and the
# target would stay among the features.
bikes_station2_test_X = bikes_station2_test[:, filter(
    col -> Symbol(col) ∉ (
        :installation_date, :dock_count, :date, :avg_docks_available,
        :avg_bikes_available, :events, :precipitation_inches, :station_id,
    ),
    names(bikes_station2_test),
)]
# Held-out target values for station 2.
bikes_station2_test_Y = bikes_station2_test[!, :avg_docks_available];

In [None]:
# Print the station-2 learner's score on its training split (in-sample fit).
println(IAI.score(lnr_fitted, bikes_station2_train_X, bikes_station2_train_Y))

In [None]:
# Print the station-2 learner's score on its held-out test split.
println(IAI.score(lnr_fitted, bikes_station2_test_X, bikes_station2_test_Y))