<a href="https://colab.research.google.com/github/Zappu1204/ai-samsung/blob/main/NeuralNetwork_AISAMSUNG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Xử lý dữ liệu

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Đọc dữ liệu từ file CSV
data = pd.read_csv("/content/sample_data/hotel.csv")

print(data.isnull().sum())

# Xử lý dữ liệu missing
numerical_cols = data.select_dtypes(include=['number']).columns
imputer = SimpleImputer(strategy='mean')
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

categorical_cols = data.select_dtypes(include=['object']).columns
imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer.fit_transform(data[categorical_cols])

# Loại bỏ 'arrival_date_month' và 'reservation_status_date' khỏi categorical_cols
categorical_cols = [col for col in categorical_cols if col not in ['arrival_date_month', 'reservation_status_date']]

# One-Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_cols])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(categorical_cols))

data_processed = data.copy()

# Chuyển đổi dữ liệu ngày thành số ngày kể từ ngày đặt phòng
data_processed['arrival_date'] = pd.to_datetime(data['arrival_date_year'].astype(int).astype(str) + '-' +
                                               data['arrival_date_month'] + '-' +
                                               data['arrival_date_day_of_month'].astype(int).astype(str))
data_processed['reservation_status_date'] = pd.to_datetime(data['reservation_status_date'])
data_processed['days_since_booking'] = (data_processed['reservation_status_date'] - data_processed['arrival_date']).dt.days

data_processed = data_processed.drop(['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month', 'arrival_date', 'reservation_status_date'], axis=1)

data_processed = pd.concat([data_processed, encoded_df], axis=1)

data_processed = data_processed.drop(categorical_cols, axis=1)

# Chia dữ liệu
X = data_processed.drop('is_canceled', axis=1)
y = data_processed['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Xử lý dữ liệu hoàn tất!")

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

# Huấn luyện mô hình:

In [8]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [11]:
! pip install tensorflow



In [17]:
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.layers import Input

# class custom estimator
class KerasEstimator(BaseEstimator, ClassifierMixin):
    def __init__(self, optimizer='adam', dropout_rate=0.2, neurons=64):
        self.optimizer = optimizer
        self.dropout_rate = dropout_rate
        self.neurons = neurons
        self.model = None

    def fit(self, X, y):
        self.model = Sequential()
        self.model.add(Input(shape=(X_train.shape[1],)))
        self.model.add(Dense(self.neurons, activation='relu'))
        self.model.add(Dropout(self.dropout_rate))
        self.model.add(Dense(self.neurons, activation='relu'))
        self.model.add(Dropout(self.dropout_rate))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer=self.optimizer, metrics=['accuracy'])
        self.model.fit(X, y)
        self.classes_ = [0, 1]
        return self

    def predict(self, X):
        return (self.model.predict(X) > 0.5).astype(int)

    def get_params(self, deep=True):
        return {
            'optimizer': self.optimizer,
            'dropout_rate': self.dropout_rate,
            'neurons': self.neurons
        }

    def set_params(self, **params):
        for parameter, value in params.items():
            setattr(self, parameter, value)
        return self

# GridSearchCV
model = KerasEstimator()
param_grid = {
    'batch_size': [32, 64],
    'epochs': [10, 20],
    'optimizer': ['adam', 'rmsprop'],
    'dropout_rate': [0.1, 0.2],
    'neurons': [32, 64]
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1')

grid_result = grid.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

best_model = grid_result.best_estimator_

print("Huấn luyện mô hình hoàn tất!")

[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9068 - loss: 0.3427
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9095 - loss: 0.4393
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9036 - loss: 0.4008
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8780 - loss: 0.5494
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8856 - loss: 0.6238
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2m

# Đánh giá

In [21]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
import tensorflow as tf
# Dự đoán trên tập kiểm tra
y_pred = best_model.predict(X_test)

# Tính toán các chỉ số đánh giá
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

# In ra các chỉ số đánh giá
print("F1-score: ", f1)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("AUC: ", auc)

# Lưu lại best model
best_model.model.save('best_model.keras')
print("Best model đã được lưu")

[1m747/747[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
F1-score:  0.9997770594136662
Accuracy:  0.9998324817823938
Precision:  0.9997770594136662
Recall:  0.9997770594136662
AUC:  0.9998214471281789
Best model đã được lưu
