## 2. **特征构建**

In [21]:
import pandas as pd
import numpy as np

# Load the weather history CSV (relative path, resolved against the working directory)
df = pd.read_csv("weatherHistory.csv")

### 一.特征分箱

在数据处理过程中，分箱是一种将连续变量划分为离散类别的方法。分箱有助于将连续数据转换为分类数据，以便于进一步分析和处理。在天气数据处理中，分箱可以帮助我们将时间、风速、风向等连续变量转换为有意义的分类，以便于后续的分析和建模。

In [22]:
# Parse the 'Formatted Date' column into datetimes; utc=True yields timezone-aware UTC values
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'], utc=True)
# Extract the hour of day (0-23).
# NOTE(review): because of utc=True above this is the UTC hour, not the local hour — confirm
# that UTC is the intended basis for the time-of-day labels below.
df['Hour'] = df['Formatted Date'].dt.hour
# Bin the hour into four periods. With right=True and include_lowest=True the intervals are
# [0, 5], (5, 11], (11, 17], (17, 23], so every hour 0-23 receives a label.
hour_bins = [0, 5, 11, 17, 23]  # bin edges
hour_labels = ["凌晨", "早晨", "下午", "夜晚"]  # bin labels (early morning, morning, afternoon, night)
df['TimeOfDay'] = pd.cut(df['Hour'], bins=hour_bins, labels=hour_labels, right=True, include_lowest=True)
# Extract the month number (1-12)
df['Month'] = df['Formatted Date'].dt.month
# 定义季节分箱
def get_season(month):
    """Return the season label for a month number.

    December-February map to '冬季' (winter), March-May to '春季' (spring),
    June-August to '夏季' (summer) and September-November to '秋季' (autumn).
    Any value that is not one of the twelve month numbers yields None.
    """
    # (label, months) pairs, checked in the same order as the original branches
    season_table = (
        ('冬季', (12, 1, 2)),
        ('春季', (3, 4, 5)),
        ('夏季', (6, 7, 8)),
        ('秋季', (9, 10, 11)),
    )
    for label, months in season_table:
        if month in months:
            return label
    return None
# Map each month number to its season label
df['Season'] = df['Month'].apply(get_season)
# Bin wind speed into low / medium / high: [0, 10), [10, 20), [20, inf).
# The last edge is open-ended on purpose: with right=False the right edge of the final
# bin is excluded, so using the column maximum as that edge would leave the fastest
# observation(s) unbinned (NaN), and would raise if the maximum were <= 20.
wind_speed_bins = [0, 10, 20, np.inf]  # bin edges
wind_speed_labels = ["低", "中", "高"]  # bin labels (low, medium, high)
df['WindSpeedGroup'] = pd.cut(df['Wind Speed (km/h)'], bins=wind_speed_bins, labels=wind_speed_labels, right=False)
# Bin the wind bearing into eight 45-degree sectors: [0, 45), [45, 90), ..., [315, 360).
# NOTE(review): the sectors start at 0 degrees rather than being centred on the compass
# points (e.g. [0, 45) is labelled north) — confirm this is the intended convention.
wind_bearing_bins = [0, 45, 90, 135, 180, 225, 270, 315, 360]  # bin edges
wind_bearing_labels = ["北", "东北", "东", "东南", "南", "西南", "西", "西北"]  # bin labels (N, NE, E, SE, S, SW, W, NW)
# Wrap bearings with % 360 so a value of exactly 360 is treated as 0 instead of falling
# outside the last half-open bin and becoming NaN.
df['WindDirection'] = pd.cut(df['Wind Bearing (degrees)'] % 360, bins=wind_bearing_bins, labels=wind_bearing_labels, right=False, include_lowest=True)
# Preview the binned features next to their source columns
df[['Month', 'Season', 'Hour', 'TimeOfDay', 'Wind Speed (km/h)', 'WindSpeedGroup', 'Wind Bearing (degrees)', 'WindDirection']].head()

Unnamed: 0,Month,Season,Hour,TimeOfDay,Wind Speed (km/h),WindSpeedGroup,Wind Bearing (degrees),WindDirection
0,1,冬季,0,凌晨,17.1143,中,140.0,东南
1,1,冬季,1,凌晨,16.6152,中,139.0,东南
2,1,冬季,2,凌晨,20.2538,高,140.0,东南
3,1,冬季,3,凌晨,14.49,中,140.0,东南
4,1,冬季,4,凌晨,13.9426,中,134.0,东


### 二.归一化 
归一化适用于数值特征，特别是当不同特征的值域差别很大时，归一化可以帮助加速收敛并提高模型性能。适合归一化的特征有：

Temperature (C)（温度）

Apparent Temperature (C)（体感温度）

Humidity（湿度）

Wind Speed (km/h)（风速）

Visibility (km)（能见度）

Pressure (millibars)（气压）

Wind Bearing (degrees)（风向；下方代码同样对该列做了归一化）


下面是一个用于归一化数值特征的函数。我们使用 Min-Max 归一化方法，将特征缩放到 [0, 1] 的范围内：$x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$。

In [23]:
# 定义归一化函数
def normalize(df, columns):
    """
    Apply Min-Max normalization to the given columns.

    Each listed column is rescaled to the [0, 1] range with
    (x - min) / (max - min). A column whose values are all equal has a
    zero range; it is mapped to 0.0 instead of being divided by zero
    (which would turn the whole column into NaN). Missing values stay NaN.

    Parameters:
    df: pandas DataFrame, the input frame (left unmodified).
    columns: list, names of the columns to normalize.

    Returns:
    pandas DataFrame, a copy of the input with the listed columns normalized.
    """
    df_normalized = df.copy()

    for column in columns:
        values = df_normalized[column]
        min_value = values.min()
        value_range = values.max() - min_value
        shifted = values - min_value

        if value_range == 0:
            # Constant column: shifted is already all zeros (NaN where the
            # input was NaN), so just make sure the result is floating point.
            df_normalized[column] = shifted.astype(float)
        else:
            df_normalized[column] = shifted / value_range

    return df_normalized
# Columns to normalize.
# NOTE(review): 'Wind Bearing (degrees)' is an angle, so min-max scaling places bearings
# near 0 and near 360 at opposite ends of the range — confirm this is intended.
columns_to_normalize = [
    'Temperature (C)', 
    'Apparent Temperature (C)', 
    'Humidity', 
    'Wind Speed (km/h)', 
    'Wind Bearing (degrees)', 
    'Visibility (km)', 
    'Pressure (millibars)'
]

# Normalize the listed columns; the result is stored in a separate frame, df_normalized
df_normalized = normalize(df, columns_to_normalize)
# Preview the normalized columns
df_normalized[columns_to_normalize].head()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,0.362884,0.352912,0.89,0.268028,0.389972,0.62,0.971597
1,0.372334,0.365007,0.85,0.260212,0.387187,0.615,0.97111
2,0.380524,0.36625,0.82,0.317196,0.389972,0.615,0.970842
3,0.381244,0.380582,0.82,0.226929,0.389972,0.615,0.970546
4,0.372694,0.37238,0.86,0.218356,0.373259,0.615,0.969992


### 三.特征交互
特征交互是指通过组合现有特征来创建新的特征，以捕捉更复杂的关系。适合特征交互的特征通常是数值特征，以下特征可以进行特征交互：

Temperature (C) 与 Apparent Temperature (C)：温度和体感温度的差值可以反映人体对气温的感知差异。

Humidity 与 Temperature (C)：湿度和温度的组合可以用来计算热指数。

Wind Speed (km/h) 与 Wind Bearing (degrees)：风速和风向的组合可以用来计算风的分量（东向分量和北向分量）。

注意：下面的代码仅以归一化后特征的两两乘积作为交互特征的简单示例，并未实现上述差值、热指数或风分量的计算。

In [24]:
# Build pairwise interaction features on top of the normalized frame.
# assign() returns a new DataFrame, so df_normalized itself is left untouched;
# the three product columns are appended in the order listed.
df_interaction = df_normalized.assign(
    # temperature x apparent temperature
    Temp_AppTemp=lambda frame: frame['Temperature (C)'] * frame['Apparent Temperature (C)'],
    # temperature x humidity
    Temp_Humidity=lambda frame: frame['Temperature (C)'] * frame['Humidity'],
    # apparent temperature x humidity
    AppTemp_Humidity=lambda frame: frame['Apparent Temperature (C)'] * frame['Humidity'],
)
# Preview the new interaction columns
df_interaction[['Temp_AppTemp', 'Temp_Humidity', 'AppTemp_Humidity']].head()

Unnamed: 0,Temp_AppTemp,Temp_Humidity,AppTemp_Humidity
0,0.128066,0.322966,0.314092
1,0.135904,0.316484,0.310256
2,0.139367,0.31203,0.300325
3,0.145094,0.31262,0.312077
4,0.138784,0.320517,0.320247
