# 数据处理

In [39]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides pandas/Python deprecation warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

# Load the dataset from a CSV file given by a relative path.
df = pd.read_csv('US-pumpkins.csv')
# Print per-column dtypes and non-null counts, then display the first five rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   City Name        1757 non-null   object 
 1   Type             45 non-null     object 
 2   Package          1757 non-null   object 
 3   Variety          1752 non-null   object 
 4   Sub Variety      296 non-null    object 
 5   Grade            0 non-null      float64
 6   Date             1757 non-null   object 
 7   Low Price        1757 non-null   float64
 8   High Price       1757 non-null   float64
 9   Mostly Low       1654 non-null   float64
 10  Mostly High      1654 non-null   float64
 11  Origin           1754 non-null   object 
 12  Origin District  131 non-null    object 
 13  Item Size        1478 non-null   object 
 14  Color            1141 non-null   object 
 15  Environment      0 non-null      float64
 16  Unit of Sale     162 non-null    object 
 17  Quality       

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Grade,Date,Low Price,High Price,Mostly Low,...,Unit of Sale,Quality,Condition,Appearance,Storage,Crop,Repack,Trans Mode,Unnamed: 24,Unnamed: 25
0,BALTIMORE,,24 inch bins,,,,4/29/17,270.0,280.0,270.0,...,,,,,,,E,,,
1,BALTIMORE,,24 inch bins,,,,5/6/17,270.0,280.0,270.0,...,,,,,,,E,,,
2,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
3,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,...,,,,,,,N,,,
4,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,11/5/16,90.0,100.0,90.0,...,,,,,,,N,,,


In [40]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(1757, 26)

In [41]:
# Count missing entries per column and report only the columns that have any.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

Type               1712
Variety               5
Sub Variety        1461
Grade              1757
Mostly Low          103
Mostly High         103
Origin                3
Origin District    1626
Item Size           279
Color               616
Environment        1757
Unit of Sale       1595
Quality            1757
Condition          1757
Appearance         1757
Storage            1757
Crop               1757
Trans Mode         1757
Unnamed: 24        1757
Unnamed: 25        1654
dtype: int64


In [42]:
# Largest share of missing values a column may have and still be kept.
missing_threshold = 0.75

# Remove every column whose share of missing values exceeds the threshold.
drop = df.columns[df.isna().mean().gt(missing_threshold)]
df = df.drop(columns=drop)

# Report which of the remaining columns still contain missing values.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

Variety          5
Mostly Low     103
Mostly High    103
Origin           3
Item Size      279
Color          616
dtype: int64


In [43]:
# Fill missing values.
# Text columns: mark missing entries with the placeholder label 'unknown'.
for col in ('Variety', 'Origin', 'Item Size', 'Color'):
    df[col] = df[col].fillna('unknown')

# Numeric "mostly" price columns: impute missing entries with the column mean.
for col in ('Mostly High', 'Mostly Low'):
    df[col] = df[col].fillna(df[col].mean())

In [44]:
# Confirm that no column has missing values left after imputation.
df.isna().sum()

City Name      0
Package        0
Variety        0
Date           0
Low Price      0
High Price     0
Mostly Low     0
Mostly High    0
Origin         0
Item Size      0
Color          0
Repack         0
dtype: int64

In [45]:
# Ensure the price columns are numeric: strip any dollar signs and thousands
# separators from string values, then cast to float.
# The pattern is written as a raw string because '\$' inside a regular string
# literal is an invalid escape sequence (a warning today, slated to become an
# error in a future Python version).
price_columns = ['Low Price', 'High Price', 'Mostly Low', 'Mostly High']
for col in price_columns:
    df[col] = df[col].replace({r'\$': '', ',': ''}, regex=True).astype(float)

# Convert the date column to datetime. The raw values look like '4/29/17'
# (month/day/two-digit year), so the format is stated explicitly instead of
# relying on pandas' per-version format inference.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')

In [46]:
# Re-inspect dtypes and non-null counts after cleaning, then display the first five rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1757 entries, 0 to 1756
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   City Name    1757 non-null   object        
 1   Package      1757 non-null   object        
 2   Variety      1757 non-null   object        
 3   Date         1757 non-null   datetime64[ns]
 4   Low Price    1757 non-null   float64       
 5   High Price   1757 non-null   float64       
 6   Mostly Low   1757 non-null   float64       
 7   Mostly High  1757 non-null   float64       
 8   Origin       1757 non-null   object        
 9   Item Size    1757 non-null   object        
 10  Color        1757 non-null   object        
 11  Repack       1757 non-null   object        
dtypes: datetime64[ns](1), float64(4), object(7)
memory usage: 164.8+ KB


Unnamed: 0,City Name,Package,Variety,Date,Low Price,High Price,Mostly Low,Mostly High,Origin,Item Size,Color,Repack
0,BALTIMORE,24 inch bins,unknown,2017-04-29,270.0,280.0,270.0,280.0,MARYLAND,lge,unknown,E
1,BALTIMORE,24 inch bins,unknown,2017-05-06,270.0,280.0,270.0,280.0,MARYLAND,lge,unknown,E
2,BALTIMORE,24 inch bins,HOWDEN TYPE,2016-09-24,160.0,160.0,160.0,160.0,DELAWARE,med,ORANGE,N
3,BALTIMORE,24 inch bins,HOWDEN TYPE,2016-09-24,160.0,160.0,160.0,160.0,VIRGINIA,med,ORANGE,N
4,BALTIMORE,24 inch bins,HOWDEN TYPE,2016-11-05,90.0,100.0,90.0,100.0,MARYLAND,lge,ORANGE,N


In [48]:
# Collect the names of the categorical (object-dtype) columns.
categorical_cols = list(df.select_dtypes(include='object').columns)
print("分类变量列：", categorical_cols)

# One-hot encode those columns, dropping the first level of each one.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
print("\n独热编码后数据集形状：", df_encoded.shape)

# Preview the first five rows of the encoded frame.
print("\n独热编码后数据前5行：")
df_encoded.head()

分类变量列： ['City Name', 'Package', 'Variety', 'Origin', 'Item Size', 'Color', 'Repack']

独热编码后数据集形状： (1757, 76)

独热编码后数据前5行：


Unnamed: 0,Date,Low Price,High Price,Mostly Low,Mostly High,City Name_BALTIMORE,City Name_BOSTON,City Name_CHICAGO,City Name_COLUMBIA,City Name_DALLAS,...,Item Size_lge,Item Size_med,Item Size_med-lge,Item Size_sml,Item Size_unknown,Item Size_xlge,Color_STRIPED,Color_WHITE,Color_unknown,Repack_N
0,2017-04-29,270.0,280.0,270.0,280.0,True,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
1,2017-05-06,270.0,280.0,270.0,280.0,True,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,2016-09-24,160.0,160.0,160.0,160.0,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
3,2016-09-24,160.0,160.0,160.0,160.0,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,2016-11-05,90.0,100.0,90.0,100.0,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True


# 特征选择

In [52]:
# Pairwise Pearson correlation between all columns of the encoded frame.
correlation = df_encoded.corr(method='pearson')
correlation

Unnamed: 0,Date,Low Price,High Price,Mostly Low,Mostly High,City Name_BALTIMORE,City Name_BOSTON,City Name_CHICAGO,City Name_COLUMBIA,City Name_DALLAS,...,Item Size_lge,Item Size_med,Item Size_med-lge,Item Size_sml,Item Size_unknown,Item Size_xlge,Color_STRIPED,Color_WHITE,Color_unknown,Repack_N
Date,1.000000,0.071540,0.059298,0.054294,0.060231,0.054379,-0.115743,0.085420,0.022125,0.001860,...,0.111968,-0.068749,0.062681,-0.094333,0.063560,-0.001622,0.049546,-0.053625,0.145190,-0.050552
Low Price,0.071540,1.000000,0.987353,0.971983,0.965711,-0.085013,0.228439,-0.055817,0.084067,0.070797,...,0.137547,-0.017499,0.135061,-0.245615,-0.275919,0.278827,-0.104817,-0.205297,0.029768,-0.134699
High Price,0.059298,0.987353,1.000000,0.965687,0.972163,-0.082528,0.303481,-0.078269,0.041347,0.054691,...,0.131661,-0.035755,0.148516,-0.234910,-0.288283,0.298502,-0.105523,-0.212239,-0.007723,-0.123574
Mostly Low,0.054294,0.971983,0.965687,1.000000,0.992581,-0.098199,0.265744,-0.073165,0.066011,0.058273,...,0.125867,-0.034562,0.133921,-0.269695,-0.236020,0.300084,-0.108299,-0.175993,-0.008374,-0.132311
Mostly High,0.060231,0.965711,0.972163,0.992581,1.000000,-0.085082,0.254620,-0.079393,0.045481,0.058634,...,0.134178,-0.039841,0.136604,-0.270351,-0.237488,0.301052,-0.109629,-0.181366,-0.010091,-0.129223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Item Size_xlge,-0.001622,0.278827,0.298502,0.300084,0.301052,0.082676,0.230016,-0.076051,-0.107764,-0.099078,...,-0.163256,-0.177052,-0.104007,-0.185461,-0.157905,1.000000,-0.030139,-0.031786,-0.122137,-0.080428
Color_STRIPED,0.049546,-0.104817,-0.105523,-0.108299,-0.109629,-0.025612,-0.041507,-0.033618,0.062057,-0.024115,...,-0.037250,-0.040398,-0.023732,-0.042317,0.190866,-0.030139,1.000000,-0.030801,-0.060931,0.004430
Color_WHITE,-0.053625,-0.205297,-0.212239,-0.175993,-0.181366,0.169779,-0.063923,0.004683,0.005458,-0.003957,...,-0.115524,-0.149938,-0.027184,0.167946,0.110580,-0.031786,-0.030801,1.000000,-0.272906,0.019842
Color_unknown,0.145190,0.029768,-0.007723,-0.008374,-0.010091,-0.011175,-0.367774,0.253690,-0.070904,0.119970,...,-0.017316,0.220699,-0.061461,-0.109794,0.101770,-0.122137,-0.060931,-0.272906,1.000000,-0.072706


In [53]:
# Correlation of every column with 'High Price', strongest positive first.
price_correlation = correlation.loc[:, 'High Price'].sort_values(ascending=False)
print("与High Price相关性强的特征：")
# Keep only entries whose absolute correlation exceeds 0.3.
print(price_correlation.loc[price_correlation.abs() > 0.3])

与High Price相关性强的特征：
High Price                    1.000000
Low Price                     0.987353
Mostly High                   0.972163
Mostly Low                    0.965687
Package_36 inch bins          0.433154
Package_24 inch bins          0.337257
City Name_BOSTON              0.303481
Package_1/2 bushel cartons   -0.504620
Variety_MINIATURE            -0.544909
Name: High Price, dtype: float64


In [61]:
# Display the dimensions of the pre-encoding frame `df` (not `df_encoded`).
df.shape

(1757, 12)

In [None]:
# Select the columns whose absolute correlation with 'High Price' exceeds 0.3.
# NOTE(review): this selection keeps 'Low Price', 'Mostly High' and 'Mostly Low',
# which are almost perfectly correlated with the target — possible target leakage.
# A hand-picked alternative without the price columns would be:
#   ['Package_36 inch bins', 'Package_24 inch bins', 'City Name_BOSTON',
#    'Package_1/2 bushel cartons', 'Variety_MINIATURE']
selected_features = price_correlation.loc[price_correlation.abs() > 0.3].index
print(selected_features)

# Build the target vector and the feature matrix (the target itself is excluded from X).
y = df_encoded['High Price']
X = df_encoded[selected_features.drop('High Price')]

print(X.shape, y.shape)

Index(['High Price', 'Low Price', 'Mostly High', 'Mostly Low',
       'Package_36 inch bins', 'Package_24 inch bins', 'City Name_BOSTON',
       'Package_1/2 bushel cartons', 'Variety_MINIATURE'],
      dtype='object')
(1757, 8) (1757,)
