In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error, mean_absolute_error, r2_score, accuracy_score

tbb = pd.read_csv("D:/bigdata/dacn_nhom12/data/data_train/tbb.csv")
tn = pd.read_csv('D:/bigdata/dacn_nhom12/data/data_train/tn.csv')
dh = pd.read_csv('D:/bigdata/dacn_nhom12/data/data_train/dh.csv')
tbb, tn, dh

(          tên hồ          Qve      Tqx   Qxt      Qxm
 0       Bản Chát    11.500000     0.00     0     0.00
 1     Huội Quảng     5.300000     0.00     0     0.00
 2         Sơn La   861.000000     0.00     0     0.00
 3       Hòa Bình   200.000000   829.00     0   829.00
 4        Thác Bà   117.170515     0.00     0     0.00
 ...          ...          ...      ...   ...      ...
 7023    Bản Chát   170.400000   245.90     0   245.90
 7024  Huội Quảng   300.000000   351.00     0   351.00
 7025      Sơn La  2561.000000  3000.00     0  3000.00
 7026    Hòa Bình  5011.000000  5534.00  3279  2255.00
 7027     Thác Bà   450.000000   446.72     0   446.72
 
 [7028 rows x 5 columns],
            tên hồ     Qve     Tqx   Qxt     Qxm
 0       Pleikrông   30.00   42.00  0.00   42.00
 1            Ialy  153.00  309.00  0.00  309.00
 2        Sê San 3  316.00  243.00  0.00  243.00
 3       Sê San 3A  216.00  245.00  0.00  245.00
 4        Sê San 4  230.00  230.00  0.00  230.00
 ...           ...

In [2]:
def train(df):
    # Tách cột mục tiêu và đặc trưng
    targets = df['Tqx']
    features = df.drop(['Tqx'], axis=1)

    # Mã hóa cột 'tên hồ'
    label_encoder = LabelEncoder()
    features['tên hồ'] = label_encoder.fit_transform(df['tên hồ'].values.ravel())


    # Chia dữ liệu thành tập huấn luyện và kiểm tra
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.15, random_state=42)

    # Chuẩn hóa dữ liệu
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Huấn luyện mô hình
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Đánh giá mô hình
    y_pred = model.predict(X_test)
    print(f"R^2 Score: {model.score(X_test, y_test):.4f}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred) * 100:.2f}%")

    return model, label_encoder, scaler


In [3]:
model_tbb, encoder_tbb, scaler_tbb = train(tbb)
model_tn, encoder_tn, scaler_tn = train(tn)
model_dh, encoder_dh, scaler_dh = train(dh)

R^2 Score: 1.0000
MSE: 0.0000
RMSE: 0.0000
MAE: 0.0000
MAPE: 65592.73%
R^2 Score: 1.0000
MSE: 0.0000
RMSE: 0.0000
MAE: 0.0000
MAPE: 3011.66%
R^2 Score: 1.0000
MSE: 0.0000
RMSE: 0.0000
MAE: 0.0000
MAPE: 20812.74%


In [4]:
joblib.dump(model_tbb, 'D:/bigdata/dacn_nhom12/model/model_tbb.joblib')
joblib.dump(model_tn, 'D:/bigdata/dacn_nhom12/model/model_tn.joblib')
joblib.dump(model_dh, 'D:/bigdata/dacn_nhom12/model/model_dh.joblib')

joblib.dump(scaler_tbb, 'D:/bigdata/dacn_nhom12/scaler/scaler_tbb.joblib')
joblib.dump(scaler_tn, 'D:/bigdata/dacn_nhom12/scaler/scaler_tn.joblib')
joblib.dump(scaler_dh, 'D:/bigdata/dacn_nhom12/scaler/scaler_dh.joblib')

joblib.dump(encoder_tbb, 'D:/bigdata/dacn_nhom12/encoder/encoder_tbb.joblib')
joblib.dump(encoder_tn, 'D:/bigdata/dacn_nhom12/encoder/encoder_tn.joblib')
joblib.dump(encoder_dh, 'D:/bigdata/dacn_nhom12/encoder/encoder_dh.joblib')


['D:/bigdata/dacn_nhom12/encoder/encoder_dh.joblib']