In [None]:
'''数据分析：删除缺失值、相关性分析、数据划分
特征处理：缺失值填充、编码、异常值处理、归一化、去重复值
模型训练：模型选择、调参、错误分析、训练集测试集评价
'''

# 数据分析

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('US-pumpkins.csv') 
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   object 
 1   Type             45 non-null     object 
 2   Package          1757 non-null   object 
 3   Variety          1752 non-null   object 
 4   Sub Variety      296 non-null    object 
 5   Grade            0 non-null      float64
 6   Date             1757 non-null   object 
 7   Low Price        1757 non-null   float64
 8   High Price       1757 non-null   float64
 9   Mostly Low       1654 non-null   float64
 10  Mostly High      1654 non-null   float64
 11  Origin           1754 non-null   object 
 12  Origin District  131 non-null    object 
 13  Item Size        1478 non-null   object 
 14  Color            1141 non-null   object 
 15  Environment      0 non-null      float64
 16  Unit of Sale     162 non-null    object 
 17  Quality       

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Grade,Date,Low Price,High Price,Mostly Low,...,Unit of Sale,Quality,Condition,Appearance,Storage,Crop,Repack,Trans Mode,Unnamed: 24,Unnamed: 25
0,BALTIMORE,,24 inch bins,,,,4/29/17,270.0,280.0,270.0,...,,,,,,,E,,,
1,BALTIMORE,,24 inch bins,,,,5/6/17,270.0,280.0,270.0,...,,,,,,,E,,,
2,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
3,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
4,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,11/5/16,90.0,100.0,90.0,...,,,,,,,N,,,


In [6]:
# 设置缺失率阈值
threshold = 0.75

# 计算每列的缺失率
missing_ratio = df.isnull().mean()

print(missing_ratio)

City Name          0.000000
Type               0.974388
Package            0.000000
Variety            0.002846
Sub Variety        0.831531
Grade              1.000000
Date               0.000000
Low Price          0.000000
High Price         0.000000
Mostly Low         0.058623
Mostly High        0.058623
Origin             0.001707
Origin District    0.925441
Item Size          0.158793
Color              0.350598
Environment        1.000000
Unit of Sale       0.907797
Quality            1.000000
Condition          1.000000
Appearance         1.000000
Storage            1.000000
Crop               1.000000
Repack             0.000000
Trans Mode         1.000000
Unnamed: 24        1.000000
Unnamed: 25        0.941377
dtype: float64


In [7]:
# 筛选出缺失率低于阈值的列
cols_to_keep = missing_ratio[missing_ratio < threshold].index
print(cols_to_keep)

Index(['City Name', 'Package', 'Variety', 'Date', 'Low Price', 'High Price',
       'Mostly Low', 'Mostly High', 'Origin', 'Item Size', 'Color', 'Repack'],
      dtype='object')


In [8]:
cols_to_drop = missing_ratio[missing_ratio > threshold].index
print(cols_to_drop)

Index(['Type', 'Sub Variety', 'Grade', 'Origin District', 'Environment',
       'Unit of Sale', 'Quality', 'Condition', 'Appearance', 'Storage', 'Crop',
       'Trans Mode', 'Unnamed: 24', 'Unnamed: 25'],
      dtype='object')


In [9]:
df = df.drop(cols_to_drop, axis=1)

In [10]:
df.shape

(1757, 12)

In [12]:
# 数值型列与 High Price 的相关性
numeric_cols = df.select_dtypes(include=[np.number]).columns
if 'High Price' in numeric_cols:
    high_price_corr = df[numeric_cols].corr()['High Price'].sort_values(ascending=False)
    print("Correlation with 'High Price' (numeric features):")
    print(high_price_corr)
else:
    print("'High Price' not found in numeric columns.")

Correlation with 'High Price' (numeric features):
High Price     1.000000
Mostly High    0.996174
Mostly Low     0.989539
Low Price      0.987353
Name: High Price, dtype: float64


In [23]:
# 非数值型列
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns

for col in non_numeric_cols:
    if 'High Price' in df.columns:
        grouped = df.groupby(col)['High Price'].mean().sort_values(ascending=False)
        print(f"\nAverage 'High Price' by '{col}':")
        print(grouped.head(5))


Average 'High Price' by 'City Name':
City Name
BOSTON      194.176647
MIAMI       170.000000
ATLANTA     153.416667
DALLAS      149.802920
COLUMBIA    141.790875
Name: High Price, dtype: float64

Average 'High Price' by 'Package':
Package
36 inch bins      183.359304
24 inch bins      180.082151
bins              172.307692
each              103.102941
bushel baskets     49.100000
Name: High Price, dtype: float64

Average 'High Price' by 'Variety':
Variety
BLUE TYPE                   231.315789
KNUCKLE HEAD                199.500000
FAIRYTALE                   197.753333
CINDERELLA                  178.153086
MIXED HEIRLOOM VARIETIES    172.765957
Name: High Price, dtype: float64

Average 'High Price' by 'Date':
Date
2017-06-03    302.5
2017-06-17    302.5
2017-06-10    302.5
2017-07-22    255.0
2017-07-29    255.0
Name: High Price, dtype: float64

Average 'High Price' by 'Origin':
Origin
MEXICO            211.400000
VERMONT           197.500000
MASSACHUSETTS     190.846995
CANADA    

In [15]:
from sklearn.model_selection import train_test_split

# 预测 'Mostly High' 价格
X = df.drop(columns=['Mostly High'])
y = df['Mostly High']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")

训练集大小: (1405, 11)
测试集大小: (352, 11)


# 特征处理

In [16]:
# 将价格列转换为数值类型（去除美元符号并转换为浮点数）
price_columns = ['Low Price', 'High Price', 'Mostly Low', 'Mostly High']
for col in price_columns:
    df[col] = df[col].replace({'\$': '', ',': ''}, regex=True).astype(float)

# 将日期列转换为日期类型
df['Date'] = pd.to_datetime(df['Date'])

In [17]:
# 查看转换后的数据类型
print(df[price_columns + ['Date']].dtypes)

# 查看前几行数据
print(df[price_columns + ['Date']].head())

Low Price             float64
High Price            float64
Mostly Low            float64
Mostly High           float64
Date           datetime64[ns]
dtype: object
   Low Price  High Price  Mostly Low  Mostly High       Date
0      270.0       280.0       270.0        280.0 2017-04-29
1      270.0       280.0       270.0        280.0 2017-05-06
2      160.0       160.0       160.0        160.0 2016-09-24
3      160.0       160.0       160.0        160.0 2016-09-24
4       90.0       100.0        90.0        100.0 2016-11-05


In [25]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# 提取数值型和分类型列
numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(exclude=[np.number]).columns

from sklearn.impute import SimpleImputer

# 填充剩余数值列的缺失值
imputer_num = SimpleImputer(strategy='median')

X_train[numeric_cols] = pd.DataFrame(
    imputer_num.fit_transform(X_train[numeric_cols]),
    columns=numeric_cols,
    index=X_train.index
)

# 创建分类变量的填充器（使用众数填充）
imputer_cat = SimpleImputer(strategy='most_frequent')

# 对分类变量进行填充
X_train[categorical_cols] = imputer_cat.fit_transform(X_train[categorical_cols])

# 对分类变量进行独热编码
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print("\n独热编码后数据集形状：", df_encoded.shape)


独热编码后数据集形状： (1621, 120)


In [33]:
# 去除重复值
df_clean = df_encoded.drop_duplicates()

print("去除重复值后，数据的基本信息：")
df_clean.info()

去除重复值后，数据的基本信息：
<class 'pandas.core.frame.DataFrame'>
Index: 1610 entries, 0 to 1653
Columns: 120 entries, Low Price to Repack_N
dtypes: bool(116), float64(4)
memory usage: 245.3 KB


# 模型构建

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor



仅使用的非数值型特征: ['Variety', 'Package', 'City Name', 'Origin', 'Item Size']


In [None]:
df_encoded.head()
'''删掉价格列，用highprice做目标变量，不用在选择特征，直接喂给模型'''

Unnamed: 0,Low Price,High Price,Mostly Low,Mostly High,City Name_BALTIMORE,City Name_BOSTON,City Name_CHICAGO,City Name_COLUMBIA,City Name_DALLAS,City Name_DETROIT,...,Origin_WASHINGTON,Item Size_jbo,Item Size_lge,Item Size_med,Item Size_med-lge,Item Size_sml,Item Size_xlge,Color_STRIPED,Color_WHITE,Repack_N
0,270.0,280.0,270.0,280.0,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,270.0,280.0,270.0,280.0,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,160.0,160.0,160.0,160.0,True,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
3,160.0,160.0,160.0,160.0,True,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
4,90.0,100.0,90.0,100.0,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True


In [32]:
len(df_encoded.columns)

120