In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
import math
from math import radians, cos, sin, asin, sqrt
import seaborn as sns
# Make matplotlib render Chinese text: use the SimHei (bold sans-serif) font.
matplotlib.rcParams['font.sans-serif']=['SimHei']
# CJK fonts such as SimHei typically have no glyph for the Unicode minus sign
# (U+2212); use the ASCII hyphen instead so negative tick labels stay readable.
matplotlib.rcParams['axes.unicode_minus'] = False

In [None]:
# import transbigdata as tbd
from transbigdata.taxigps import (
         clean_taxi_status,
       taxigps_to_od,
         taxigps_traj_point
   ) 

In [None]:
import contextlib
import io
import sys

# Silence anything written to stderr while transbigdata is being imported.
# redirect_stderr swaps in a throw-away buffer (so writes cannot fail the way
# they would with sys.stderr = None) and, on exit, puts back whichever stream
# was active before -- in a notebook kernel that is usually not sys.__stderr__.
# The previous stream is restored even if the import raises.
with contextlib.redirect_stderr(io.StringIO()):
    import transbigdata as tbd


In [None]:
# Original taxi GPS sample (CSV).
taxi_data = pd.read_csv('data/sample_taxi.csv')
# Bus / metro station records (JSON).
stations_data = pd.read_json('data/stations.json')
# Taxi trip trajectories (JSON).
taxi_trips_data = pd.read_json('data/taxi_trips.json')

In [None]:
# Visualise the trips contained in the sample taxi GPS data.
# NOTE(review): col presumably names the longitude, latitude, vehicle-id and
# time columns, in that order -- confirm against the transbigdata docs.
tbd.visualization_trip(taxi_data, col=['Lng','Lat','VehicleNum','Time'])

In [None]:
# Data cleaning (TODO: nothing is implemented in this cell yet)

In [None]:
from keplergl import KeplerGl  

# Create a KeplerGl map object with a height of 500
map_1 = KeplerGl(height=500)  

# Leave the map as the cell's last expression so Jupyter displays it
map_1  

In [None]:
# Preview the first rows of the taxi trips data to check its layout
taxi_trips_data.head()

In [None]:
# Preview the first rows of the stations data to check its layout
stations_data.head()

In [None]:
#  由经纬度计算实际距离
def geodistance(lng1, lat1, lng2, lat2):
    """Return the great-circle distance between two points, in kilometres.

    The haversine formula is evaluated on a sphere whose radius is the mean
    Earth radius (6371 km).

    Args:
        lng1, lat1: longitude and latitude of the first point, in decimal
            degrees (any value accepted by ``float()``).
        lng2, lat2: longitude and latitude of the second point, same format.

    Returns:
        The distance in km, rounded to 3 decimal places (1 m resolution).
    """
    earth_radius_km = 6371.0

    # Degrees -> radians.
    lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)])
    dlon = lng2 - lng1
    dlat = lat2 - lat1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding error can push `a` marginally outside [0, 1] (identical or
    # antipodal points); clamp so sqrt/asin never see an invalid argument.
    a = min(1.0, max(0.0, a))

    # Central angle (radians) times the radius in km gives km directly.
    return round(2 * asin(sqrt(a)) * earth_radius_km, 3)

    

# ------------------------------ Distribution of taxi trip path lengths ------------------------------ #
# Single-column frame holding the list of points of every trip.
path_column = pd.DataFrame(taxi_trips_data['path'], columns=['path'])

# For each trip, accumulate geodistance() over every pair of consecutive path
# points; a path with fewer than two points keeps a total of 0.
lengths_trips = []
for (trip_points,) in path_column.itertuples(index=False):
    trip_total = 0
    for start, end in zip(trip_points, trip_points[1:]):
        trip_total += geodistance(start[0], start[1], end[0], end[1])
    lengths_trips.append(trip_total)

In [None]:
# Histogram + KDE of the taxi trip path lengths, with the tallest bin marked.
ax = plt.gca()
counts, bins, _ = ax.hist(
    x=lengths_trips,
    bins=500,
    density=True,
    facecolor="blue",
    label='直方图',
    edgecolor="black",
    alpha=0.7,
)
sns.kdeplot(lengths_trips, label='密度图', ax=ax)

# Height of the tallest bin and the left edge of that bin (3 decimals).
max_count = max(counts)
peak_index = list(counts).index(max_count)
max_bin = round(bins[peak_index], 3)

# Dashed red guide lines through the peak, one vertical and one horizontal.
guide_style = dict(color='red', linestyle='--')
ax.axvline(x=max_bin, **guide_style)
ax.axhline(y=max_count, **guide_style)

# Write the peak's x value just below the x-axis ...
ax.annotate(f'{max_bin}', xy=(max_bin, 0), xytext=(max_bin, -0.01),
            ha='center', va='top', color='red')
# ... and its y value next to the horizontal guide line.
ax.annotate(f'y={max_count}', xy=(0, max_count), xytext=(0.5, max_count),
            ha='left', va='center', color='red')

# Restrict the x-axis to the central range, then add labels, title and legend.
ax.set_xlim(0.5, 7)
ax.set_xlabel("路程/km")
ax.set_ylabel("车辆频数")
ax.set_title("出租车路径长度分布图")
ax.legend()
plt.show()

In [None]:
# NOTE(review): df_exploded is not defined in any cell above this one --
# presumably it is a per-point (exploded) form of taxi_trips_data. It has to be
# created before this cell for the notebook to survive Restart & Run All.
# col follows the same order as the visualization_trip call on taxi_data:
# longitude, latitude, vehicle id, time.
tbd.visualization_trip(df_exploded, col=['longitude', 'latitude', 'taxi_id', 'timestamps'])

In [None]:
# ------------------------------ Distribution of passenger-carrying trip path lengths ------------------------------ #
df = taxi_trips_data
# Keep only the rows whose passenger flag equals 1.
filtered_df = df.loc[df['passenger'] == 1]
# Single-column frame holding the list of points of every remaining trip.
path_column = pd.DataFrame(filtered_df['path'], columns=['path'])

# For each trip, accumulate geodistance() over every pair of consecutive path
# points; a path with fewer than two points keeps a total of 0.
lengths_trips_passenger = []
for (trip_points,) in path_column.itertuples(index=False):
    trip_total = 0
    for start, end in zip(trip_points, trip_points[1:]):
        trip_total += geodistance(start[0], start[1], end[0], end[1])
    lengths_trips_passenger.append(trip_total)

In [None]:
# Histogram + KDE of the passenger-carrying trip lengths, with the tallest bin marked.
ax = plt.gca()
counts, bins, _ = ax.hist(
    x=lengths_trips_passenger,
    bins=500,
    density=True,
    facecolor="blue",
    label='直方图',
    edgecolor="black",
    alpha=0.7,
)
sns.kdeplot(lengths_trips_passenger, label='密度图', ax=ax)

# Height of the tallest bin and the left edge of that bin (3 decimals).
max_count = max(counts)
peak_index = list(counts).index(max_count)
max_bin = round(bins[peak_index], 3)

# Dashed red guide lines through the peak, one vertical and one horizontal.
guide_style = dict(color='red', linestyle='--')
ax.axvline(x=max_bin, **guide_style)
ax.axhline(y=max_count, **guide_style)

# Write the peak's x value just below the x-axis ...
ax.annotate(f'{max_bin}', xy=(max_bin, 0), xytext=(max_bin, -0.01),
            ha='center', va='top', color='red')
# ... and its y value next to the horizontal guide line.
ax.annotate(f'y={max_count}', xy=(0, max_count), xytext=(0.5, max_count),
            ha='left', va='center', color='red')

# Restrict the x-axis to the central range, then add labels, title and legend.
ax.set_xlim(0.5, 7)
ax.set_xlabel("路程/km")
ax.set_ylabel("车辆频数")
ax.set_title("出租车载客路径长度分布图")
ax.legend()
plt.show()

In [None]:
# Display the trips kept by the passenger == 1 filter.
filtered_df

In [None]:
# ------------------------------ Distribution of passenger trips per taxi ------------------------------ #
# Number of passenger-carrying trips made by each taxi.
# Assumes every row of filtered_df is one trip (TODO confirm), so the row count
# per taxi_id is that taxi's trip count. GroupBy.size() counts rows directly and
# does not depend on which other columns the frame has; value_counts() on the
# grouped frame would count distinct row values inside each taxi instead.
count_by_category = filtered_df.groupby('taxi_id').size().rename('count')

# Histogram of passenger trips per taxi, with the most common trip count marked.

# Use one unit-wide bin per integer trip count, centred on the integer; a single
# bin would put every taxi into one bar and show no distribution at all.
if len(count_by_category):
    lowest = int(count_by_category.min())
    highest = int(count_by_category.max())
else:
    # Nothing to plot: fall back to a single bin around 0 so hist() still works.
    lowest = highest = 0
bin_edges = [k - 0.5 for k in range(lowest, highest + 2)]

counts, bins, _ = plt.hist(x = count_by_category, bins=bin_edges, density=False, facecolor="blue", label = '直方图',edgecolor="black", alpha=0.7)

# Tallest bar: its height (number of taxis) and the trip count it stands for.
peak_index = int(counts.argmax())
max_count = int(counts[peak_index])
max_bin = lowest + peak_index  # bins are centred on integers, so this is the bin centre

# Dashed red guide lines through the peak, one vertical and one horizontal.
plt.axvline(x=max_bin, color='red', linestyle='--')
plt.axhline(y=max_count, color='red', linestyle='--')

# Write the peak's x value just below the x-axis.
plt.annotate(f'{max_bin}', xy=(max_bin, 0), xytext=(max_bin, -0.01),
             ha='center', va='top', color='red')

# Write the peak's y value at the left edge of the plotted range.
plt.annotate(f'y={max_count}', xy=(bins[0], max_count), xytext=(bins[0], max_count),
             ha='left', va='center', color='red')

# Show the full range of trip counts (the 0.5-7 window used by the km plots
# does not apply to a count axis).
plt.xlim(bins[0], bins[-1])
plt.xlabel("载客次数")
plt.ylabel("车辆数")
plt.title("出租车载客次数分布图")

plt.legend()
plt.show()

In [None]:
# ------------------------------ Distribution of empty-taxi speed ------------------------------ #
def speed(lng1, lat1, lng2, lat2, t1, t2):
    """Return the average speed between two timestamped points.

    The distance is the haversine great-circle distance on a sphere of radius
    6371 km, rounded to 3 decimals; it is divided by the absolute time
    difference, so the result is non-negative whichever timestamp comes first.

    Args:
        lng1, lat1: longitude and latitude of the first point, in decimal
            degrees (any value accepted by ``float()``).
        lng2, lat2: longitude and latitude of the second point, same format.
        t1, t2: numeric timestamps of the two points; the unit is whatever the
            caller uses (presumably seconds -- TODO confirm).

    Returns:
        Speed in km per timestamp unit.

    Raises:
        ZeroDivisionError: if both timestamps are equal.
    """
    earth_radius_km = 6371.0

    # Degrees -> radians.
    lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)])
    dlon = lng2 - lng1
    dlat = lat2 - lat1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding error can push `a` marginally outside [0, 1]; clamp it so
    # sqrt/asin never see an invalid argument.
    a = min(1.0, max(0.0, a))

    # Central angle (radians) times the radius in km gives km directly.
    distance = round(2 * asin(sqrt(a)) * earth_radius_km, 3)

    # Elapsed time as a magnitude, so chronological input (t2 > t1) does not
    # produce a negative speed.
    elapsed = abs(t2 - t1)
    return distance / elapsed


# NOTE(review): this recomputes the path length of every trip in
# taxi_trips_data into lengths_trips; speed() is not called here, so no speed
# values are produced yet.
# Single-column frame holding the list of points of every trip.
path_column = pd.DataFrame(taxi_trips_data['path'], columns=['path'])

# For each trip, accumulate geodistance() over every pair of consecutive path
# points; a path with fewer than two points keeps a total of 0.
lengths_trips = []
for (trip_points,) in path_column.itertuples(index=False):
    trip_total = 0
    for start, end in zip(trip_points, trip_points[1:]):
        trip_total += geodistance(start[0], start[1], end[0], end[1])
    lengths_trips.append(trip_total)

In [None]:
# -------------------------------------------载客平均速度出租车分布---------------------------------#