In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [9]:

import Amazon_datacleaning as cl
import Amazon_counting as counting

In [10]:
# 讀取數據
df = pd.read_csv('C:/python-training/爬蟲/0530_Aamazon_sunglasses+men/Amazon商品資料_sunglasses+men_加月銷量.csv')

# 清理資料
cl.clean_price(df)
cl.clean_star(df)
cl.clean_monthly_sales(df)


0       2000
1        200
2        200
3        100
4      10000
       ...  
185       50
186      300
187        0
188      700
189      100
Name: 過去一個月銷量, Length: 190, dtype: int32

In [11]:

# 設置特徵和目標變量
features = ['商品定價', '星星評分', '顏色選項', '過去一個月銷量']
target = '過去一個月銷量'

# 對於顏色選項進行一熱編碼
df['顏色選項'] = df['顏色選項'].apply(counting.preprocess_text)


In [15]:
# 分割數據集
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train, X_test, y_train, y_test)

      商品定價  星星評分                                               顏色選項  過去一個月銷量
51   87.00   4.5                       iridium jade iridium iridium      300
35   26.99   4.3  black frame black temple grey lens b gunmetal ...      400
118  19.99   4.2  colors colors colors colors colors colors colo...      300
60   24.99   4.5  black frame gray lens gunmetal frame gray lens...     1000
161  50.21   4.6                                        multi multi      100
..     ...   ...                                                ...      ...
106  11.99   4.4  matteblack lens matteblack lens matte black mi...     1000
14   18.99   4.4                                                        5000
92   17.99   4.3                         mirrored mirrored mirrored      200
179  36.75   4.2                                                           0
102  18.99   4.0  frames grey lens frames brown lens frames gree...      100

[152 rows x 4 columns]        商品定價  星星評分                                   

In [13]:
# 數據預處理和模型管道
numeric_features = ['商品定價', '星星評分']
categorical_features = ['顏色選項']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [14]:
# 訓練模型
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# 查看模型係數
print('模型係數:', model.named_steps['regressor'].coef_)
print('截距:', model.named_steps['regressor'].intercept_)