# SF Bay Area Bike Share using XGBoost

We are going to use XGBoost to predict availability of bikes at various stations

First we update the local version of Pandas before we import it

*NOTE: Once this installation has completed, please restart the kernel.*

In [None]:
! pip install --upgrade pandas --user

Next we import the required libraries and set up the figure parameters

In [None]:
#Importing the required libraries and setting up the figure parameters
import pandas as pd
import s3fs
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from matplotlib import rcParams
# Colour palette used by the plots below: two hex tones followed by RGB tuples.
dark_colors = [
    "#99D699",
    "#B2B2B2",
    (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
    (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
    (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
    (0.4, 0.6509803921568628, 0.11764705882352941),
    (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
    (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
    (0.4, 0.4, 0.4),
]

# Apply all figure defaults in one call rather than key by key.
rcParams.update({
    # Figure geometry and line weight
    'figure.figsize': (12, 9),
    'figure.dpi': 150,
    'lines.linewidth': 2,
    # Axes background and font sizes
    'axes.facecolor': "white",
    'axes.titlesize': 20,
    'axes.labelsize': 17.5,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 17.5,
    'patch.edgecolor': 'none',
    # Grid appearance
    'grid.color': "white",
    'grid.linestyle': "-",
    'grid.linewidth': 1,
    'grid.alpha': 1,
    # Text, label and tick colours
    'text.color': "444444",
    'axes.labelcolor': "444444",
    'ytick.color': "444444",
    'xtick.color': "444444",
})

Let's make sure the installed Pandas version is >= 1.3.4

In [None]:
print(pd.__version__)

Replace \<REPLACE WITH YOUR BUCKET NAME\> with your bucket name

In [None]:
bucket = '<REPLACE WITH YOUR BUCKET NAME>' # Replace with your bucket name

Copy the datasets we will be using to your notebook instance

In [None]:
! aws s3 cp s3://example-lab-artifacts/geospatial/dataset/ . --recursive

Now we load the trips made and list of stations datasets

In [None]:
trips_df = pd.read_csv('./trip.csv')
stations_df = pd.read_csv('./station.csv')

Lets take a look at the stations dataset

In [None]:
stations_df.head()

In [None]:
stations_df.dtypes

In [None]:
# Convert both coordinate columns to their string representation.
for coord_col in ("lat", "long"):
    stations_df[coord_col] = stations_df[coord_col].apply(str)

In [None]:
stations_df.head()

In [None]:
trips_df.head()

In [None]:
# Parse the trip start and end timestamps into datetime values.
for ts_col in ('start_date', 'end_date'):
    trips_df[ts_col] = pd.to_datetime(trips_df[ts_col])

Now we are going to join the trip and station datasets on station_id; alternatively, the same could be done through DataWrangler, as explained last week.

In [None]:
# Two renamed copies of the station coordinates: one keyed for the trip's
# start station, one for its end station.
station_coords = stations_df[["id", "lat", "long"]]
start_station_info = station_coords.rename(
    columns={"id": "start_station_id", "lat": "start_lat", "long": "start_long"}
)
end_station_info = station_coords.rename(
    columns={"id": "end_station_id", "lat": "end_lat", "long": "end_long"}
)
# Inner joins (the merge default) attach the start, then the end coordinates.
trips_df = (
    trips_df
    .merge(start_station_info, on="start_station_id")
    .merge(end_station_info, on="end_station_id")
)

In [None]:
trips_df.head()

Lets create a temporary dataset with fields of interest for plotting purposes 

In [None]:
from collections import Counter

# Count trips per (start, end) coordinate pair. Each key is the four
# coordinates joined with '_'. Walking the four columns in lockstep avoids
# DataFrame.iterrows(), which builds a Series object for every row and is very
# slow on a large trips table.
route_keys = (
    f"{s_lat}_{s_long}_{e_lat}_{e_long}"
    for s_lat, s_long, e_lat, e_long in zip(
        trips_df['start_lat'],
        trips_df['start_long'],
        trips_df['end_lat'],
        trips_df['end_long'],
    )
)
# Counter keeps first-seen key order; convert back to a plain dict.
plot_dict = dict(Counter(route_keys))

In [None]:
# Unpack each "startlat_startlong_endlat_endlong" key back into numeric
# coordinates, collecting the trip count alongside in parallel lists.
start_lat, start_long, end_lat, end_long, nb_trips = [], [], [], [], []
for route_key, trip_count in plot_dict.items():
    coords = route_key.split('_')  # split once instead of once per field
    start_lat.append(float(coords[0]))
    start_long.append(float(coords[1]))
    end_lat.append(float(coords[2]))
    end_long.append(float(coords[3]))
    nb_trips.append(int(trip_count))

In [None]:
temp_df = pd.DataFrame({"start_lat":start_lat,"start_long":start_long,"end_lat":end_lat,"end_long":end_long,"nb_trips":nb_trips})

In [None]:
temp_df.dtypes

In [None]:
temp_df.head()

Now lets plot the trip duration distribution. Shows that most trip durations are between 1-15 minutes

In [None]:
# Histogram of trip durations (minutes) with the cumulative share of trips
# overlaid on a secondary y-axis. Everything is drawn on ax1/ax2 explicitly so
# that no additional, empty figure gets created.
fig, ax1 = plt.subplots(figsize=(10, 7))
ax1.grid(zorder=1)
ax1.xaxis.grid(False)
trip_dur = trips_df['duration'].values / 60  # divide by 60 for the minutes axis
# NOTE: density=True plots a probability density, so with 2-minute bins the bar
# heights are not literal percentages.
ax1.hist(trip_dur, bins=range(0, 45, 2), density=True, zorder=0, color=dark_colors[1])
ax1.set_xlabel('Trip Duration (Minutes)')
ax1.set_ylabel('Percent of Trips')
ax1.set_title('Trip Duration Distribution')
# With one-minute bins, the running sum of the density is the cumulative
# proportion of the trips that fall inside the 0-44 minute window.
hist, bin_edges = np.histogram(trip_dur, range(0, 45, 1), density=True)
cum_trip_dur = np.cumsum(hist)
ax2 = ax1.twinx()
ax2.plot(range(1, 45, 1), cum_trip_dur, c=dark_colors[0])
ax2.set_ylabel('Cumulative Proportion of Trips')
# The positional form works on every Matplotlib version; the `b=` keyword was
# renamed to `visible` and later removed.
ax2.grid(False)

In [None]:
trips_df.head()

Lets parse out the date fields from the trips dataframe

In [None]:
# Derive calendar features from the trip timestamps.
start_ts = trips_df['start_date'].dt
end_ts = trips_df['end_date'].dt
# Despite its name, `week` holds the day of the week (Monday=0 ... Sunday=6).
trips_df['week'] = start_ts.dayofweek
trips_df['start_hour'] = start_ts.hour
trips_df['start_day'] = start_ts.day
trips_df['end_hour'] = end_ts.hour
trips_df['end_day'] = end_ts.day

Lets now plot the trip distribution over time by hour in the day for week days

In [None]:
# Box plot of trip duration per start hour for weekday trips (week < 5) that
# lasted at most two hours (7200 seconds).
weekdaytrips_df = trips_df.loc[(trips_df.duration <= 7200) & (trips_df.week < 5)]
# Create the axes once and hand them to pandas: DataFrame.boxplot opens its own
# figure when no `ax` is given, which would leave a separate plt.figure() call
# behind as an empty extra figure.
fig, ax = plt.subplots(figsize=(15, 12))
weekdaytrips_df.boxplot(column="duration", by="start_hour", ax=ax)
ax.set_ylim(0, 3600)  # show durations up to one hour
ax.set_ylabel('Trip Duration (Seconds)')
ax.set_xlabel('Hour of Day')
ax.set_title('Trip Duration Distribution Over Time of Day (Week Days)')
plt.show()

Lets now plot the trip distribution over time by hour in the day for weekend

In [None]:
# Box plot of trip duration per start hour for weekend trips (week > 4) that
# lasted at most two hours (7200 seconds).
weekendtrips_df = trips_df.loc[(trips_df.duration <= 7200) & (trips_df.week > 4)]
# Create the axes once and hand them to pandas: DataFrame.boxplot opens its own
# figure when no `ax` is given, which would leave a separate plt.figure() call
# behind as an empty extra figure.
fig, ax = plt.subplots(figsize=(15, 12))
weekendtrips_df.boxplot(column="duration", by="start_hour", ax=ax)
ax.set_ylim(0, 3600)  # show durations up to one hour
ax.set_ylabel('Trip Duration (Seconds)')
ax.set_xlabel('Hour of Day')
ax.set_title('Trip Duration Distribution Over Time of Day (Weekend days)')
# Render explicitly, matching the weekday plot, instead of echoing the Text object.
plt.show()

We are going to define a function to lower the memory footprint of the dataframe that will store the station status dataset (almost 2GB).

The function will iterate through all of the columns of the dataframe and modify the data type according to the data contained. 

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` in place to shrink its memory use.

    Signed and unsigned integer columns are converted to the narrowest integer
    type of the same signedness that holds the column's minimum and maximum.
    Float columns are converted to the narrowest float type whose range holds
    them. Every other column (object/string, bool, datetime, categorical and
    pandas extension dtypes) is left untouched, and no column is ever widened.

    Narrowing floats is lossy: only the value *range* is checked, so float16
    and float32 columns keep fewer significant digits than float64.

    Args:
        df: DataFrame to optimise. It is modified in place.
        use_float16: When True (the default), float columns may be narrowed
            all the way to float16; pass False to stop at float32.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types per numpy dtype kind, narrowest first.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy integer/float columns are candidates; comparing the
        # min/max of datetime, bool or string columns against numeric limits
        # would either raise or silently change the column's meaning.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates[col_type.kind], np.iinfo

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break  # never widen (or pointlessly re-cast) a column
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # An empty or all-NaN column yields NaN here, fails every
            # comparison and is therefore left as it is.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that reports zero bytes.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df

Now lets load the station status dataset and leverage the function we just defined

In [None]:
status_df = reduce_mem_usage(pd.read_csv('./status.csv'))

In [None]:
status_df.head()

In [None]:
status_df.info()

In [None]:
# Parse the timestamp column, then keep only the readings whose minute is a
# multiple of five (0, 5, 10, ...).
status_df["time"] = pd.to_datetime(status_df["time"])
on_five_minute_mark = (status_df["time"].dt.minute % 5) == 0
status_df = status_df[on_five_minute_mark]

In [None]:
stations_df.rename(columns={"id":"station_id"},inplace=True)

In [None]:
stations_df.installation_date = pd.to_datetime(stations_df.installation_date)

Lets now merge with the stations dataframe on station_id

In [None]:
status_df = status_df.merge(stations_df,on="station_id",how="left")

Now we have the status dataframe with lat/long and names from the station dataset

In [None]:
status_df.head()

Now we drop the index column in the status dataframe

In [None]:
# Renumber the rows 0..n-1 and discard the previous index. `drop=True` does
# both in one step, without first materialising the old index as an "index"
# column that then has to be dropped again.
status_df.reset_index(drop=True, inplace=True)

In [None]:
# Calendar day of each reading, kept as a datetime value at midnight of that
# day. `.dt.normalize()` is vectorised, whereas `.dt.date` builds one Python
# `date` object per row in an object column that is slow and memory-hungry on
# a frame this large.
status_df["date"] = status_df["time"].dt.normalize()

In [None]:
status_df.head()

Now lets load up the weather dataset

In [None]:
weather_df = reduce_mem_usage(pd.read_csv('./weather.csv'))

In [None]:
weather_df.date = pd.to_datetime(weather_df.date)

In [None]:
weather_df.head()

## Mapping of Zip Codes and City names
### 95113 - San Jose
### 94301 - Palo Alto
### 94107 - San Francisco

### 94063 - Redwood City
### 94041 - Mountain View


In [None]:
# Weather-station zip code -> city name, as listed in the table above.
zipcode_city_dict = {
    95113: 'San Jose',
    94301: 'Palo Alto',
    94107: 'San Francisco',
    94063: 'Redwood City',
    94041: 'Mountain View',
}

Lets now map the zipcode to city name and apply it to the weather dataset

In [None]:
def _city_for_zip(zip_code):
    """Return the city name registered for ``zip_code`` (KeyError if unknown)."""
    return zipcode_city_dict[zip_code]


# Add the city name matching each weather record's zip code.
weather_df["city"] = weather_df["zip_code"].apply(_city_for_zip)

In [None]:
weather_df.head()

In [None]:
status_df.date = pd.to_datetime(status_df.date)

Now lets join the weather and status dataframes on date and city

In [None]:
status_df = status_df.merge(weather_df,how="left",on=["date","city"])

In [None]:
status_df.head()

In [None]:
# Remove, in place, every row that has a missing value in any column.
# NOTE(review): this also discards rows where only a weather field is empty
# (for example, presumably `events` on days without a weather event), which
# could remove a large share of the data -- confirm this is intended or limit
# the check with `subset=`.
status_df.dropna(inplace=True)

Now lets use LabelEncoder on events, precipitation_inches and name

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# Integer-encode the categorical columns. Each column gets its own fitted
# encoder, kept in `label_encoders`, so the codes can later be mapped back
# with `inverse_transform`; refitting one shared encoder would retain only
# the classes of the last column it saw.
label_encoders = {}
for encoded_col in ("events", "precipitation_inches", "name"):
    label_encoders[encoded_col] = LabelEncoder()
    status_df[encoded_col] = label_encoders[encoded_col].fit_transform(status_df[encoded_col])


In [None]:
status_df.head()

Now lets create our train and test datasets

In [None]:
# Random split: roughly two thirds of the rows go to training, the rest to
# test. A seeded generator makes the split reproducible across kernel
# restarts, and the mask is sized directly from status_df instead of from a
# throw-away DataFrame of random numbers.
split_rng = np.random.default_rng(42)
msk = split_rng.random(len(status_df)) < 0.6666
status_df_train = status_df[msk]
status_df_test = status_df[~msk]

In [None]:
# Feature columns: every column of the training split except the timestamp and
# date fields, the location/name descriptors and the target itself.
# NOTE(review): if the status data carries a `docks_available` column it stays
# in the feature list and may leak the target -- confirm.
excluded_cols = (
    'time', 'installation_date', 'date', 'city',
    'lat', 'long', 'name', 'bikes_available',
)
train_cols = [
    column_name
    for column_name in status_df_train.columns
    if column_name not in excluded_cols
]

In [None]:
train_cols.insert(0,'bikes_available')

In [None]:
train_cols

In [None]:
test_cols = train_cols.copy()

In [None]:
test_cols.remove('bikes_available')

In [None]:
test_cols

In [None]:
import sagemaker

sess = sagemaker.Session()
# Honour the bucket chosen at the top of the notebook; fall back to the
# session's default bucket only when the placeholder was never replaced
# (a real bucket name cannot start with '<').
if bucket.startswith('<'):
    bucket = sess.default_bucket()
# Key prefix matching the 'geo/...' object keys the train/test CSVs are
# uploaded under.
prefix = "geo"

# Define IAM role
import boto3
import re
from sagemaker import get_execution_role

role = get_execution_role()

In [None]:
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

In [None]:
status_df_train.info()

In [None]:
from io import StringIO

# Serialise the training split (target column first, no header, no index) and
# upload it as a single S3 object.
train_csv = status_df_train[train_cols].to_csv(header=False, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'geo/train/train.csv').put(Body=train_csv)

In [None]:
# Serialise the held-out split with the same column layout as the training
# file (target column first, no header, no index) and upload it.
test_csv = status_df_test[train_cols].to_csv(header=False, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'geo/test/test.csv').put(Body=test_csv)

In [None]:
# Files for AutoPilot in the next lab: the same splits, written with a header
# row. The test file uses `test_cols`, i.e. it omits `bikes_available`.
s3_resource = boto3.resource('s3')
autopilot_exports = (
    (status_df_train, train_cols, 'autopilot/train/train.csv'),
    (status_df_test, test_cols, 'autopilot/test/test.csv'),
)
for split_df, split_cols, object_key in autopilot_exports:
    csv_body = split_df[split_cols].to_csv(index=False)
    s3_resource.Object(bucket, object_key).put(Body=csv_body)

In [None]:
# Look up the SageMaker XGBoost container image URI for the current region and
# display it.
# NOTE(review): the "latest" tag presumably resolves to a legacy XGBoost image
# rather than the newest release -- confirm against the SageMaker docs. If an
# explicit version is pinned instead, review the training hyperparameters at
# the same time, since accepted names differ between XGBoost versions.
container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")
display(container)

In [None]:
prefix = 'geo'
# Training and validation channels, both read as CSV from the bucket/prefix.
train_uri = "s3://{}/{}/train".format(bucket, prefix)
validation_uri = "s3://{}/{}/test/".format(bucket, prefix)
s3_input_train = TrainingInput(s3_data=train_uri, content_type="csv")
s3_input_validation = TrainingInput(s3_data=validation_uri, content_type="csv")

In [None]:
sess = sagemaker.Session()

# Single-instance training job; model artifacts go under <bucket>/<prefix>/output.
estimator_settings = dict(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)
xgb = sagemaker.estimator.Estimator(container, role, **estimator_settings)

# Regression setup evaluated with RMSE over 100 boosting rounds.
# NOTE(review): `silent` and `reg:linear` are the older XGBoost spellings
# (newer releases use `verbosity` and `reg:squarederror`); they presumably
# match the container image selected earlier -- confirm before changing either.
hyperparameters = {
    "max_depth": 6,
    "eval_metric": "rmse",
    "silent": 0,
    "objective": "reg:linear",
    "num_round": 100,
}
xgb.set_hyperparameters(**hyperparameters)

# Train on the "train" channel and evaluate on the "validation" channel.
data_channels = {"train": s3_input_train, "validation": s3_input_validation}
xgb.fit(data_channels)

In [None]:
%%time
# Deploy the trained model to a single-instance endpoint; request payloads are
# serialised as CSV.
xgb_predictor = xgb.deploy(
    initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=CSVSerializer()
)

In [None]:
%%time
# Spot-check the endpoint: six times, draw one random test row, send its
# feature columns as CSV and print the prediction next to the actual value.
for trial in range(6):
    sampled_row = status_df_test[train_cols].sample()
    features_csv = sampled_row[test_cols].to_csv(header=False, index=False)
    actual_bikes = sampled_row['bikes_available'].to_csv(header=False, index=False)
    raw_response = xgb_predictor.predict(features_csv)
    predicted_bikes = raw_response.decode("utf-8")
    print(f'Test {trial} - \n\tPredicted Bike Available \t{predicted_bikes} \n\tActual Bike Available \t\t{actual_bikes}')

In [None]:
# Delete the endpoint created by the deploy step now that the spot checks are done.
xgb_predictor.delete_endpoint()