In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from mpl_toolkits.mplot3d import Axes3D

# 1. read data
file_path1 = "../flood_tool/example_data/typical_day.csv"
file_path2 = "../flood_tool/example_data/wet_day.csv"
file_path3 = "../flood_tool/resources/stations.csv"
def read_data(file_path):
    data = pd.read_csv(file_path)
    print
    return data 



In [23]:
import pandas as pd
def processing_day(path):
    # 读取数据
    df=read_data(path)

    # 删除value列中的异常值
    df['value'] = pd.to_numeric(df['value'], errors='coerce')

    # 删除无用列
    df = df.drop(['dateTime', 'qualifier'], axis=1)
    df = df.dropna(subset=['value'])
    
    # 过滤单位不一致的数据
    df_filtered = df[((df['parameter'] == 'level') & (df['unitName'].isin(['mAOD', 'mASD']))) |
                 ((df['parameter'] == 'rainfall') & (df['unitName'].isin(['mm', 'm'])))]
    
    # 删除降雨量小于0的数据
    df_cleaned = df_filtered[(df_filtered['parameter'] != 'rainfall') | (df_filtered['value'] >= 0)]

    return df_cleaned

typical_day = processing_day(file_path1)
print(typical_day['unitName'].value_counts())
rainfall_data01 = typical_day[typical_day['parameter'] == 'rainfall']
rainfall_data01.describe()

unitName
mASD    107725
mm       85401
mAOD     14970
Name: count, dtype: int64


Unnamed: 0,value
count,85401.0
mean,0.001772
std,0.061972
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,14.0


In [25]:
wet_day = processing_day(file_path2)
print(wet_day['unitName'].value_counts())
wet_day.describe()
rainfall_data02 = wet_day[wet_day['parameter'] == 'rainfall']

rainfall_data02.describe()



unitName
mASD    114127
mm       81921
mAOD      5487
Name: count, dtype: int64


Unnamed: 0,value
count,81921.0
mean,0.26259
std,2.233457
min,0.0
25%,0.0
50%,0.0
75%,0.29
max,307.5


In [26]:
def typical_wet_join(typical_day, wet_day,station):
    join_data = pd.concat([typical_day, wet_day], ignore_index=True)

    grouped = join_data.groupby(['stationReference', 'parameter', 'unitName'])['value'].agg(['mean', 'max', 'min'])
    grouped= grouped.reset_index()

    merged_data = pd.merge(station, grouped, 
                       on='stationReference', 
                       how='left')  
    return join_data, grouped,merged_data
station=read_data(file_path3)
null_columns = ["stationName","latitude","longitude"]
station = station.dropna(subset=null_columns)
station=station.drop(['maxOnRecord','minOnRecord','typicalRangeHigh','typicalRangeLow'], axis=1)

join_data,grouped,merge_data=typical_wet_join(typical_day, wet_day,station)
# join_data
# grouped
merge_data


Unnamed: 0,stationReference,stationName,latitude,longitude,parameter,unitName,mean,max,min
0,000008,Rainfall station,53.480556,-1.441674,rainfall,mm,0.386458,3.600,0.000
1,000028,Rainfall station,53.500289,-1.673575,rainfall,mm,0.442708,2.600,0.000
2,000075TP,Rainfall station,51.084022,-0.214597,rainfall,mm,0.015625,0.800,0.000
3,000076TP,Rainfall station,51.701508,-0.747539,rainfall,mm,0.109375,2.400,0.000
4,000180TP,Rainfall station,51.618838,0.173236,rainfall,mm,0.027604,1.300,0.000
...,...,...,...,...,...,...,...,...,...
2019,E71839,Portsmouth,50.802280,-1.111170,level,mAOD,0.316703,2.024,-1.404
2020,E71939,Bournemouth,50.714331,-1.874873,level,mAOD,-0.205497,0.923,-2.449
2021,E70739,Aberdeen,57.144060,-2.077360,level,mAOD,0.510167,1.886,-0.975
2022,E74239,Tobermory,56.623110,-6.064220,level,mAOD,0.468979,1.716,-0.917


In [27]:
merge_data.isnull().sum()

stationReference    0
stationName         0
latitude            0
longitude           0
parameter           0
unitName            0
mean                0
max                 0
min                 0
dtype: int64

In [28]:
merge_data['parameter'].value_counts()

parameter
level       1125
rainfall     899
Name: count, dtype: int64

In [29]:
level_data = merge_data[merge_data['parameter'] == 'level']
rainfall_data = merge_data[merge_data['parameter'] == 'rainfall']
level_data
rainfall_data
rainfall_data['mean'].min()

np.float64(0.0)

In [30]:
print(level_data['unitName'].value_counts())
print(rainfall_data['unitName'].value_counts())

unitName
mASD    1077
mAOD      48
Name: count, dtype: int64
unitName
mm    899
Name: count, dtype: int64


In [31]:
import folium
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from folium.plugins import HeatMap

# 假设 rainfall_data 是包含经纬度和 mean 值的数据框

m = folium.Map(location=[51, 0], zoom_start=13)

# Normalize 用于归一化 mean 值到 [0, 1] 的区间
norm = mcolors.Normalize(vmin=rainfall_data['mean'].min(), vmax=rainfall_data['mean'].max())

# 使用 matplotlib 中的渐变颜色映射
cmap = plt.cm.Reds  # 你也可以选择其他的 colormap，例如 Blues, YlGnBu 等

# 设置一个较明显的颜色映射范围（比如从红色到浅红色）
# 遍历 rainfall_data 中的每一行，根据 mean 值设置颜色
for _, row in rainfall_data.iterrows():
    # 根据 mean 值归一化得到一个值，然后映射到颜色
    color = mcolors.to_hex(cmap(norm(row['mean']))) 
    
    # 创建带颜色的 CircleMarker
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=5,  # 根据需要调整圆的半径
        color=color,  # 圆的边框颜色
        fill=True,  
        fill_color=color,  # 填充颜色
        fill_opacity=0.7  # 填充透明度
    ).add_to(m)

# 显示地图
m


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rainfall_data['log_mean'] = np.log(rainfall_data['mean'] + 1e-5)
