In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

In [2]:
df = pd.read_csv('../cleaning/output/processed_data.tsv', sep='\t')

X = df.drop(['price', 'title', 'province', 'url_id'], axis=1)
y = df['price']

print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=128)

(11658, 7)
(11658,)


In [3]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [5]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}')

Mean Absolute Error: 3.437937763750475
Mean Squared Error: 16.909566106450015
R-squared: 0.21012411812155718
Mean Absolute Percentage Error (MAPE): 0.31883876825879526


In [6]:
import matplotlib.pyplot as plt
n = 50

index = range(n)

# Chọn 100 giá trị đầu tiên từ y_test và y_pred
y_test_subset = y_test[:n]
y_pred_subset = y_pred[:n]

# Vẽ biểu đồ đường cho y_test và y_pred
plt.figure(figsize=(10, 6))
plt.plot(index, y_test_subset, label='y_test', marker='o', linestyle='-')
plt.plot(index, y_pred_subset, label='y_pred', marker='x', linestyle='--')

plt.xlabel('Index')
plt.ylabel('Mức giá')
plt.title('So sánh y_test và y_pred (50 điểm đầu tiên)')
plt.legend()
plt.grid(True)

plt.show()

ModuleNotFoundError: No module named 'matplotlib'