In [None]:
import pandas as pd

# Load the pre-computed speed statistics produced by the speed-calculation step.
# NOTE(review): later cells treat this frame as one row per day — confirm against the CSV.
stats_csv_path = "./metrics-calculation/speed-calculation/statistics/1-speed_per_vehicle_stats.csv"
speed_per_day = pd.read_csv(stats_csv_path)

# Preview the first five rows.
speed_per_day.head()

In [None]:
# Average speed per day: one bar per row of `speed_per_day`, annotated with its value.

# Number the rows 1..N so every bar gets a day label. Deriving N from the frame
# avoids a length-mismatch error if the CSV does not hold exactly 31 rows.
# NOTE(review): assumes the rows are already ordered by day — confirm.
speed_per_day["day"] = list(range(1, len(speed_per_day) + 1))

ax = speed_per_day.plot.bar(
    x="day",
    y="avg_speed",
    title="Velocidade média de ônibus por dia",
    figsize=(26, 10),
)
# Axis labels and fixed y range.
ax.set_xlabel("dia", size=15)
ax.set_ylabel("velocidade média (km/h)", size=15)
ax.set_ylim(0, 16)
# Font size of the tick labels on both axes.
ax.tick_params(axis='both', which='major', labelsize=15)
# Write each bar's height (2 decimals) slightly above the top of the bar.
for p in ax.patches:
    ax.annotate(str(round(p.get_height(), 2)), (p.get_x(), p.get_height() * 1.006), size=13)
# Title font size.
ax.title.set_size(30)

# Save the chart as PNG. The path is relative, like every other chart written by
# this notebook; the previous '/charts-results/...' pointed at the filesystem root.
ax.get_figure().savefig('./charts-results/avg-speed-per-day.png')


In [None]:
# Boxplot pandas - https://kanoki.org/2019/09/16/dataframe-visualization-with-pandas-plot/
# Boxplot pandas - https://www.simplypsychology.org/boxplots.html

import matplotlib.pyplot as plt

# One box plot per row of `speed_per_day`, drawn with Axes.bxp from the
# pre-computed summary statistics (min, quartiles, max) instead of raw samples.
# Iterating over the frame's length removes the hard-coded 31-day assumption.
for i in range(len(speed_per_day)):
    fig, ax = plt.subplots()

    boxes = [
        {
            'label' : f"Intervalos da velocidade média de ônibus no dia {i+1}-10-2015",
            'whislo': speed_per_day["avg_speed_min"].iloc[i],          # Bottom whisker position
            'q1'    : speed_per_day["avg_speed_quantile_25"].iloc[i],  # First quartile (25th percentile)
            'med'   : speed_per_day["avg_speed_quantile_50"].iloc[i],  # Median         (50th percentile)
            'q3'    : speed_per_day["avg_speed_quantile_75"].iloc[i],  # Third quartile (75th percentile)
            'whishi': speed_per_day["avg_speed_max"].iloc[i],          # Top whisker position
            'fliers': [],                                              # No outliers are plotted
            # NOTE(review): the mean is supplied but bxp is called without
            # showmeans=True, so it is presumably not drawn — confirm intent.
            'mean'  : speed_per_day["avg_speed"].iloc[i],
        }
    ]

    ax.bxp(boxes, showfliers=False)
    ax.set_ylabel("velocidade (km/h)")

    # Save and close this specific figure so figures do not pile up in memory.
    fig.savefig(f"./charts-results/boxplot-avg-speed-day/avg-speed_day_boxplot_{i+1}-10-2015.png")
    plt.close(fig)

In [None]:
# average speed per hour per day
import pandas as pd
import matplotlib.pyplot as plt

# Hours shown as x ticks on every chart (6 through 22).
hour_ticks = list(range(6, 23))

# One line chart per day (1..31): average speed by hour, saved as a PNG.
for day in range(1, 32):
    # Read that day's hourly averages and order the rows by hour.
    hourly_dir = f"./metrics-calculation/speed-calculation/speed-per-hour-per-day/MO_1510{day}/"
    speed_per_hour = pd.read_parquet(hourly_dir).sort_values(by=["hour_avl"])

    ax = speed_per_hour.plot(
        x="hour_avl",
        y="avg_speed",
        title="Velocidade média de ônibus por hora",
        figsize=(26, 10),
    )
    fig = ax.get_figure()

    # Axis labels, ticks and font sizes.
    ax.set_xlabel("hora", size=15)
    ax.set_ylabel("velocidade média (km/h)", size=15)
    ax.set_xticks(hour_ticks)
    ax.title.set_size(30)
    ax.tick_params(axis='both', which='major', labelsize=15)

    # Save the chart as PNG, then release the figure.
    fig.savefig(f'./charts-results/speed-per-hour-charts/avg-speed-per-hour-day-{day}-10-2015.png')
    plt.close(fig)

In [None]:
# Row count of the `speed_per_hour` frame left over from the last loop iteration above.
speed_per_hour.shape[0]

In [None]:
# average speed per region per day
# plotting on map using colormap --> https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html

import geopandas as gpd

# Sao Paulo district polygons, read from the municipal shapefile.
sp = gpd.read_file("./shape-files-sp/DISTRITO_MUNICIPAL_SP_SMDUPolygon.shp")

# One map per day (1..31), coloured by each region's average speed.
for day in range(1, 32):
    # Average speed by region for this day.
    region_dir = f"./metrics-calculation/speed-calculation/speed-per-region-per-day/MO_1510{day}/"
    speed_region = pd.read_parquet(region_dir)

    # Attach each region's polygon by matching "region" against the shapefile's
    # "Nome" column, then drop the now-redundant key.
    df_final = speed_region.merge(sp, left_on="region", right_on="Nome").drop(columns=["Nome"])
    geo_df = gpd.GeoDataFrame(df_final, geometry="geometry")

    # Draw the map, save it, and close the figure.
    geo_df.plot(
        column="avg_speed",
        legend=True,
        cmap='RdBu',
        figsize=(15, 15),
        legend_kwds={'label': "velocidade média (km/h)"},
    )
    plt.savefig(f'./charts-results/speed-region-day/speed-by-region-day_{day}-10-2015.png')
    plt.close()

In [None]:
# average speed per hour/per region/per day

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# Sao Paulo district polygons, read from the municipal shapefile.
sp = gpd.read_file("./shape-files-sp/DISTRITO_MUNICIPAL_SP_SMDUPolygon.shp")

# Sample of days to analyze: weekend days (Saturday/Sunday), holidays and working days.
day_to_analyze = [1,17,4,12,20]

for day in day_to_analyze:
    # Hourly average speed by region for this day.
    traces_dir = f"./metrics-calculation/speed-calculation/speed-per-hour-per-region-per-day/MO_1510{day}/"
    speed_per_hour_per_region = pd.read_parquet(traces_dir)

    # Attach each region's polygon ("region" matched against "Nome") and drop the duplicate key.
    df_with_shape = speed_per_hour_per_region.merge(
        sp, left_on="region", right_on="Nome"
    ).drop(columns=["Nome"])

    # One map per hour from 6 through 22.
    for hour in range(6, 23):
        in_this_hour = df_with_shape["hour_avl"] == hour
        df = df_with_shape[in_this_hour]
        geo_df = gpd.GeoDataFrame(df, geometry="geometry")

        # Draw the map, save it, and close the figure.
        geo_df.plot(
            column="avg_speed",
            legend=True,
            cmap='RdBu',
            figsize=(15, 15),
            legend_kwds={'label': "velocidade média (km/h)"},
        )
        plt.savefig(f'./charts-results/speed-hour-region-day/{day}-10-2015/speed-per-hour-per-region-day_hour-{hour}_{day}-10-2015.png')
        plt.close()

In [None]:
### Scratch area: the cells below are practice experiments with distribution fitting

In [None]:
# PDF/ CDF --> https://stackoverflow.com/questions/25577352/plotting-cdf-of-a-pandas-series-in-python

In [None]:
# Find distribution 
#ECDF --> https://towardsdatascience.com/how-to-generate-ecdf-plot-using-python-and-r-247ef81fbf3f

In [None]:
# https://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-ones-with-scipy-python

In [None]:
#https://www.kite.com/python/answers/how-to-fit-data-to-a-distribution-in-python

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats

In [None]:
# Fit a double-gamma distribution to the daily average speeds and print the
# parameter tuple returned by scipy's fit().
t = scipy.stats.distributions.dgamma.fit(list(speed_per_day["avg_speed"]))

print(t)

# Grid on which the fitted density is evaluated.
x = np.linspace(13,15,100)

# Keep only the rows whose average speed is below 14.
df_novo = speed_per_day.loc[speed_per_day["avg_speed"] < 14]

# `mean` and `var` were never defined, so this cell raised NameError. Estimate the
# normal parameters from the same sample that is drawn in the histogram.
# norm.pdf's third positional argument is the scale, i.e. a standard deviation.
mean, std = scipy.stats.distributions.norm.fit(df_novo["avg_speed"])
fitted_data = scipy.stats.distributions.norm.pdf(x, mean, std)

# Normalized histogram of the filtered speeds with the fitted normal density in red.
plt.hist(list(df_novo["avg_speed"]), density=True)

plt.plot(x,fitted_data,'r-')

In [None]:
# Scratch inspection: list the names available in scipy.stats.distributions.
dir(scipy.stats.distributions)

In [None]:
#https://medium.com/@amirarsalan.rajabi/distribution-fitting-with-python-scipy-bb70a42c0aed
#https://stackoverflow.com/questions/20011122/fitting-a-normal-distribution-to-1d-data
# https://www.johndcook.com/blog/distributions_scipy/
from scipy import stats

# Sample under test: rows whose average speed is below 14.
df = speed_per_day.loc[speed_per_day["avg_speed"]<14]

# Longer candidate list kept for reference:
#list_of_dists = ['bradford','burr','burr12','cauchy','chi','chi2','cosine','dgamma','dweibull','erlang','expon','exponnorm','exponweib','exponpow','f','fatiguelife','fisk','foldcauchy','foldnorm','genlogistic','genpareto','gennorm','genexpon','genextreme','gausshyper','gamma','gengamma','genhalflogistic','gilbrat','gompertz','gumbel_r','gumbel_l','halfcauchy','halflogistic','halfnorm','halfgennorm','hypsecant','invgamma','invgauss','invweibull','johnsonsb','johnsonsu','kstwobign','laplace','levy','levy_l','logistic','loggamma','loglaplace','lognorm','lomax','maxwell','mielke','nakagami','ncx2','ncf','nct','norm','pareto','pearson3','powerlaw','powerlognorm','powernorm','rdist','reciprocal','rayleigh','rice','recipinvgauss','semicircular','t','triang','truncexpon','truncnorm','tukeylambda','uniform','vonmises','vonmises_line','wald','weibull_min','weibull_max']
# Candidates actually tested: exponential, gamma, lognormal, normal, Weibull.
list_of_dists = ["expon","gamma","lognorm","norm",'weibull_min','weibull_max',"exponweib"]

# For each candidate: fit its parameters to the sample, then run a
# Kolmogorov-Smirnov test of the sample against the fitted distribution.
results = []
for dist_name in list_of_dists:
    fitted_params = getattr(stats, dist_name).fit(df["avg_speed"])
    ks_result = stats.kstest(df["avg_speed"], dist_name, args=fitted_params)
    # Store (name, KS statistic, p-value).
    results.append((dist_name, ks_result[0], ks_result[1]))

# Highest p-value first.
results.sort(key=lambda entry: float(entry[2]), reverse=True)
for name, statistic, pvalue in results:
    print(f"{name}: statistic={statistic}, pvalue={pvalue}")

In [None]:
# https://medium.com/@amirarsalan.rajabi/distribution-fitting-with-python-scipy-bb70a42c0aed
# https://github.com/amirarsalan90/dist_fitting_medium/blob/master/dist_fitting.ipynb
# https://glowingpython.blogspot.com/2012/07/distribution-fitting-with-scipy.html

In [None]:
# Histogram of the "avg_speed" column of `speed_per_day` (default binning).
plt.hist(speed_per_day["avg_speed"])

In [None]:
# Scratch inspection: list the names available in scipy.stats.
dir(stats)

In [None]:
# Synthetic check: draw 1000 samples from a normal distribution (mean 0,
# standard deviation 0.5) and fit a generalized normal distribution to them.
# A seeded generator makes the draw, and therefore the fitted parameters,
# reproducible across kernel restarts.
rng = np.random.default_rng(42)
data = rng.normal(0, 0.5, 1000)

# fit() returns three values for gennorm; presumably (shape, loc, scale) — confirm.
t1,t2,t3 = scipy.stats.distributions.gennorm.fit(data)

# Evaluation grid for plotting the fitted density.
x = np.linspace(-5,5,100)


In [None]:
# NOTE(review): bare tuple literal with no effect besides being displayed; it looks
# like a pasted three-parameter result from a distribution fit — confirm or remove.
(1.9753053701013514, -0.0008095699877689374, 0.6912687108373308)

In [None]:
# Candidate scipy.stats distribution names. "lognomr" was a typo for "lognorm";
# the misspelled name does not exist in scipy.stats, so looking it up would fail.
list_of_dists = ["expon","gamma","lognorm","norm",'weibull_min','weibull_max']