In [None]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import statsmodels.formula.api as sm

%matplotlib inline

In [None]:
# Column labels for the header-less CSV, listed in file order
COLUMN_NAMES = [
    "Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame",
    "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat",
    "Delay", "BlockID", "VehicleID", "StopID", "AtStop",
]

# Read the line 15 extract without treating any row as a header, then attach the labels
df = pd.read_csv("bus_data/cleaned_data/line15.csv", header=None, low_memory=False)
df.columns = COLUMN_NAMES

In [4]:
# (rows, columns) of the freshly loaded frame
df.shape

(2254343, 15)

In [5]:
# Storage type pandas inferred for each column on load
df.dtypes

Timestamp             int64
LineID                int64
Direction             int64
JourneyPatternID     object
TimeFrame            object
VehicleJourneyID      int64
Operator             object
Congestion            int64
Lon                 float64
Lat                 float64
Delay                 int64
BlockID               int64
VehicleID             int64
StopID               object
AtStop                int64
dtype: object

In [18]:
# Preview the first rows only; rendering 100 rows bloats the notebook without adding insight
df.head(10)

Unnamed: 0,Timestamp,LineID,Direction,JourneyPatternID,TimeFrame,VehicleJourneyID,Operator,Congestion,Lon,Lat,Delay,BlockID,VehicleID,StopID,AtStop,Day,Hour
0,2012-11-06 00:00:00,15,0,00150001,2012-11-05,5826,RD,0,-6.258584,53.340099,-361,15013,33210,4870,0,1,0
1,2012-11-06 00:00:04,15,0,00151001,2012-11-05,5929,HN,0,-6.162066,53.402668,401,15105,33502,6317,1,1,0
2,2012-11-06 00:00:04,15,0,015A1001,2012-11-05,3118,RD,0,-6.236166,53.342152,0,15010,33469,2499,0,1,0
3,2012-11-06 00:00:10,15,0,00150001,2012-11-05,5843,RD,0,-6.323327,53.277756,-463,15021,33254,4869,0,1,0
4,2012-11-06 00:00:12,15,0,015A0001,2012-11-05,3130,RD,0,-6.328856,53.304470,-106,15009,33020,1102,0,1,0
5,2012-11-06 00:00:17,15,0,,2012-11-05,3339,RD,0,-6.322704,53.271786,0,15029,33484,6280,0,1,0
6,2012-11-06 00:00:19,15,0,00150001,2012-11-05,5826,RD,0,-6.257967,53.342365,-361,15013,33210,4870,0,1,0
7,2012-11-06 00:00:23,15,0,00151001,2012-11-05,5929,HN,0,-6.162233,53.401867,401,15105,33502,6317,1,1,0
8,2012-11-06 00:00:23,15,0,015A1001,2012-11-05,3118,RD,0,-6.234233,53.341999,0,15010,33469,2499,0,1,0
9,2012-11-06 00:00:31,15,0,00150001,2012-11-05,5843,RD,0,-6.327923,53.276974,-490,15021,33254,4869,0,1,0


In [7]:
# Labels of every column currently held with the generic 'object' dtype
object_columns = df.select_dtypes(include='object').columns

In [8]:
# Re-encode every object-typed column as a pandas categorical in a single astype call
df = df.astype({name: 'category' for name in object_columns})

In [9]:
# Convert Unix timestamp to datetime

# Strip the six unnecessary trailing digits from the raw timestamp. Whole-column floor
# division does this in one vectorised step instead of calling a Python lambda per row.
epoch_seconds = df['Timestamp'] // 1000000

# Interpret what remains as seconds since the Unix epoch
df['Timestamp'] = pd.to_datetime(epoch_seconds, unit='s')

In [10]:
# Add day of week column as an integer code (pandas convention: Monday = 0 ... Sunday = 6)
df['Day'] = df['Timestamp'].dt.dayofweek

In [11]:
# Add hour of day column, taken from the converted Timestamp
df['Hour'] = df['Timestamp'].dt.hour

In [12]:
# Summary (count, unique, top, freq) of the categorical columns, one row per column
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
JourneyPatternID,2254287,15,00150001,607458
TimeFrame,2254343,26,2012-11-07,104579
Operator,2254343,5,RD,1594991
StopID,2254343,250,6282,125537


In [13]:
# Descriptive stats for the numeric columns, one row per column.
# NOTE(review): identifier-like columns (LineID, BlockID, VehicleID, ...) are still numeric
# at this point, so they are summarised here alongside the genuinely continuous features.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LineID,2254343.0,15.0,0.0,15.0,15.0,15.0,15.0,15.0
Direction,2254343.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VehicleJourneyID,2254343.0,6298.909057,24939.67583,2564.0,3189.0,5220.0,5881.0,997396.0
Congestion,2254343.0,0.024897,0.155811,0.0,0.0,0.0,0.0,1.0
Lon,2254343.0,-6.260996,0.050541,-6.387784,-6.300231,-6.263592,-6.236675,-6.100584
Lat,2254343.0,53.331982,0.040214,53.186035,53.304447,53.33371,53.348286,53.421665
Delay,2254343.0,-6.497476,473.51667,-14136.0,-191.0,0.0,136.0,10882.0
BlockID,2254343.0,15041.411825,805.495302,15001.0,15010.0,15020.0,15032.0,150099.0
VehicleID,2254343.0,32641.225763,2483.145428,24549.0,33020.0,33459.0,33493.0,40033.0
AtStop,2254343.0,0.236423,0.424885,0.0,0.0,0.0,0.0,1.0


In [14]:
# These columns are to be handled as categoricals rather than numbers
CODE_COLUMNS = ['LineID', 'VehicleJourneyID', 'Congestion', 'BlockID', 'VehicleID', 'AtStop']

for column in CODE_COLUMNS:
    df[column] = df[column].astype('category')

In [16]:
# FIXME(review): this cell runs before `vehicledf` is assigned (that happens in the next
# cell), so it raises NameError on a fresh kernel. The groupby result is also discarded.
# Move this cell below the assignment or delete it.
vehicledf.groupby(vehicledf.VehicleJourneyID)
vehicledf

NameError: name 'vehicledf' is not defined

In [17]:
# At-stop records for one vehicle on one journey pattern and one TimeFrame value.
# '00151001' is a JourneyPatternID value; VehicleJourneyID holds integers, so comparing
# that column with this string could never match any row.
vehicledf = df[
    (df.JourneyPatternID == '00151001')
    & (df.VehicleID == 33210)
    & (df.AtStop == 1)
    & (df.TimeFrame == "2012-11-07")
]

# vehicledf = vehicledf.drop_duplicates('StopID')

In [None]:
# Limit every subsequent frame display to 20 rows, then show the filtered subset
pd.options.display.max_rows = 20
vehicledf

In [None]:
# Endings = df.JourneyPatternID.str.slice([4])

In [None]:
# Keep only the rows whose LineID is 15 and preview the result
is_line_15 = df['LineID'] == 15
df15 = df.loc[is_line_15]
df15.head()

In [None]:
# Number of records per journey pattern on line 15
df15.JourneyPatternID.value_counts()

In [None]:
# One row per distinct stop, keeping the stop ID and the direction column.
# Selecting several columns needs a list (a bare tuple is looked up as a single label),
# the column is spelled 'Direction', and `drop` takes the boolean True rather than a string.
stops = (
    df15.drop_duplicates('StopID')[['StopID', 'Direction']]
    .reset_index(drop=True)
)

In [None]:
# df_unusual = df.JourneyPatternID.astype(str).str.slice([4::])

In [None]:
# NOTE(review): df_unusual was never assigned in a live cell. The definition here is
# reconstructed from the commented-out draft (JourneyPatternID text from position 4
# onwards) -- confirm that this is the intended meaning.
df_unusual = df.JourneyPatternID.astype(str).str.slice(4)
df_unusual

In [None]:
# Display the per-stop table
stops

In [None]:
# Map each stop ID to the [Lon, Lat] of the first record seen for that stop.
# drop_duplicates keeps the first row per StopID, so one pass over the frame replaces a
# full-table scan per stop. Reading the IDs from the StopID column also avoids iterating
# a DataFrame directly, which yields its column labels rather than stop IDs.
first_seen = df.drop_duplicates('StopID')
stopdict = {
    stop: [lon, lat]
    for stop, lon, lat in zip(first_seen['StopID'], first_seen['Lon'], first_seen['Lat'])
}

In [None]:
# Inspect the stop -> [Lon, Lat] lookup
stopdict

In [None]:
# Turn Congestion back into integers so it can take part in numeric analysis.
# The source column is named explicitly: the leftover loop variable `column` from an
# earlier cell points at a different column and would overwrite Congestion with it.
df["Congestion"] = df["Congestion"].astype('int64')

In [None]:
# Columns fed to the correlation matrix. The coordinate columns are labelled
# "Lon" and "Lat" in this frame.
continuous_features = ["Timestamp", "Lon", "Lat", "Delay", "Congestion"]

In [None]:
# Calculate correlation of all pairs of continuous features
corr = df[continuous_features].corr()

# Generate a mask for the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias has been removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap - blue and red
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio; the colour scale is pinned
# to [-1, 1] so that zero correlation sits at the midpoint of the diverging palette
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.yticks(rotation = 0)
plt.xticks(rotation = 45)

In [None]:
# Scatter of the Congestion flag against Delay
df.plot.scatter(x='Delay', y='Congestion')
# plt.savefig("images/congestion_delay_scatter.png")

In [None]:
# Columns treated as categorical features in this analysis
categorical_columns = ["LineID", "JourneyPatternID", "TimeFrame", "VehicleJourneyID", 
                       "Operator", "Congestion", "BlockID", "VehicleID", "StopID", "AtStop"]
# for col in categorical_columns:
#     df[col].value_counts().plot(kind='bar')

In [None]:
# Bar chart of how many records carry each Congestion value
df['Congestion'].value_counts().plot.bar()
# plt.savefig("images/congestion_bar.png")

In [None]:
# Histogram of Delay in 100 bins, restricted to the -2000..2000 window
# (values outside that range are left out of the plot)
df["Delay"].hist(figsize=(10, 5), bins=100, range=[-2000, 2000])
# plt.savefig('images/delay_hist.png')

In [None]:
# Box plot of Delay on a square 10x10 figure
plt.figure(figsize=(10, 10))
df["Delay"].plot.box()

In [None]:
def stacked_plot(clean_df, f1, f2):
    """Draw a stacked bar chart of how f1 is distributed within each value of f2.

    Every row contributes 100 / (number of non-null f1 rows sharing its f2 value),
    so summing those contributions per (f2, f1) pair gives the percentage of each
    f1 value inside each f2 group.

    Parameters
    ----------
    clean_df : pandas.DataFrame
        Frame containing both columns. It is left unmodified.
    f1 : str
        Column whose values are stacked within each bar.
    f2 : str
        Column whose distinct values become the bars along the x axis.
    """
    # Work on a two-column copy so the caller's frame does not gain a 'percent' column
    shares = clean_df[[f2, f1]].copy()

    # Size of each f2 group (non-null f1 rows), broadcast back onto every row in one
    # vectorised step instead of assigning the value row by row
    group_sizes = shares.groupby(f2)[f1].transform('count')
    shares['percent'] = 100.0 / group_sizes

    # Sum the per-row contributions for every (f2, f1) combination
    group = shares[['percent', f2, f1]].groupby([f2, f1]).sum()

    # Plot values of group in a stacked bar chart, one bar per f2 value
    my_plot = group.unstack().plot(kind='bar', stacked=True, title=f1 + " by " + f2, figsize=(15, 7))

    # Define legend colours and text and add them to the plot.
    # NOTE(review): the legend is hard-coded for a two-valued f1 and its colours are not
    # taken from the bars themselves -- confirm that they match the plotted colours.
    f1_patch = mpatches.Patch(color='green', label=f1)
    not_f1_patch = mpatches.Patch(color='blue', label="Not " + f1)
    my_plot.legend(handles=[f1_patch, not_f1_patch], frameon=True)

    # Define x and y labels and fix the y axis to the full percentage range
    my_plot.set_xlabel(f2)
    my_plot.set_ylabel("% " + f1)
    my_plot.set_ylim([0, 100])

In [None]:
# Group by day of week and hour of day, get mean 
# hd_delay = df.groupby([df["Timestamp"].dt.dayofweek, df["Timestamp"].dt.hour])["Delay"].mean()

In [None]:
# Mean delay for every (day of week, hour of day) pair.
# Grouping on the existing Day and Hour columns gives the two index levels distinct
# names; grouping on the two Timestamp accessors labels both levels 'Timestamp'.
groups = df.groupby(["Day", "Hour"])["Delay"].mean()

In [None]:
# Check what kind of object the grouped mean produced
type(groups)

In [None]:
# Line chart of the mean delay for each hour of the day
mean_delay_by_hour = df.groupby(['Hour'])["Delay"].mean()
hourly_delay = mean_delay_by_hour.plot(figsize=(15,5))

# Label the axes and place one tick per hour, 0 through 23
hour_ticks = list(range(24))
hourly_delay.set(xlabel="Time of Day", ylabel="Delay")
hourly_delay.set_xticks(hour_ticks)
hourly_delay.set_xticklabels(hour_ticks)

In [None]:
# The same hourly mean delays drawn as a bar chart
hourly_delay = df.groupby(['Hour'])["Delay"].mean().plot.bar(figsize=(15,5))
hourly_delay.set(xlabel="Time of Day", ylabel="Delay (seconds)")

# plt.savefig("hourly_delay_bar.png")

In [None]:
# Split the frame into one group per value of the Day column
days = df.groupby(["Day"])

In [None]:
# Overlay one line per Day group showing the mean delay for each hour.
# Each line carries its group key as a label so the days can be told apart in the legend.
for name, day in days:
    # Grouping by a one-element list can yield the key as a 1-tuple; unwrap it for the label
    day_code = name[0] if isinstance(name, tuple) else name
    day.groupby(["Hour"])["Delay"].mean().plot(label="Day {}".format(day_code), legend=True)
#     hours["Delay"].mean().hist()

In [None]:
# Group the frame by hour of day. groupby is a method and must be called;
# subscripting it with square brackets raises TypeError.
df.groupby("Hour")

In [None]:
# Distribution of the grouped mean delays held in `groups`
groups.hist(figsize=(10, 5))

In [None]:
# Print each grouped mean delay followed by a blank line.
# `hd_delay` only exists in a commented-out draft; `groups` holds the same grouped
# means, so iterate over that instead.
for group in groups:
    print(group)
    print()