# In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

def preprocess_data(df, label_encoders=None, scale_cols=('X6', 'X7', 'X8', 'X9', 'X10')):
    """Label-encode, impute and standardise a DataFrame.

    Steps, in order:
      1. Every ``object``-dtype column is cast to ``str`` and label-encoded.
         A column already present in ``label_encoders`` reuses that fitted
         encoder (``transform``); otherwise a new ``LabelEncoder`` is fitted
         and stored under the column name.
      2. All columns are imputed with ``SimpleImputer(strategy='most_frequent')``.
      3. The columns in ``scale_cols`` are standardised with ``StandardScaler``.

    The input DataFrame is not modified; a copy is processed instead.

    NOTE: because object columns are cast to ``str`` before encoding, a
    missing value in such a column is encoded as its own category rather
    than being imputed in step 2.

    NOTE(review): the imputer and the scaler are fitted afresh on every
    call, so a test set passed through this function is imputed/scaled with
    its own statistics rather than the training set's. Only the label
    encoders are shared between calls.

    Args:
        df: Input data. ``scale_cols`` must all be present as columns.
        label_encoders: Optional dict mapping column name -> fitted
            ``LabelEncoder``. It is updated in place with any newly fitted
            encoders. Defaults to a fresh empty dict.
        scale_cols: Names of the columns to standardise. ``None`` or an
            empty sequence skips scaling.

    Returns:
        A ``(processed_df, label_encoders)`` tuple. ``processed_df`` is
        rebuilt from the imputer output and therefore has a fresh default
        index.

    Raises:
        KeyError: if a column in ``scale_cols`` is missing from ``df``.
        ValueError: raised by ``LabelEncoder.transform`` when a reused
            encoder meets a label it was not fitted on.
    """
    label_encoders = {} if label_encoders is None else label_encoders
    scale_cols = [] if scale_cols is None else list(scale_cols)

    # Work on a copy so the column assignments below never leak into the
    # caller's DataFrame.
    df = df.copy()

    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        values = df[col].astype(str)
        encoder = label_encoders.get(col)
        if encoder is None:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(values)
            label_encoders[col] = encoder
        else:
            df[col] = encoder.transform(values)

    # Fill the remaining gaps with each column's most frequent value.
    imputer = SimpleImputer(strategy='most_frequent')
    df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

    if scale_cols:
        scaler = StandardScaler()
        df[scale_cols] = scaler.fit_transform(df[scale_cols])

    return df, label_encoders

# Load the labelled training data and the unlabelled test data.
train_df = pd.read_csv("Train_samsung.csv")
test_df = pd.read_csv("Test_samsung_noclass.csv")
# Keep the raw test rows so the predictions can be attached to the
# original (un-encoded, un-scaled) values when writing the output file.
original_test_df = test_df.copy()

# Fit the label encoders on the training data and reuse them for the test data.
train_df, label_encoders = preprocess_data(train_df)
test_df, _ = preprocess_data(test_df, label_encoders)

X = train_df.drop('Class', axis=1)
y = train_df['Class']

# Split the data into training and validation parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred_val = model.predict(X_test)

# Compute the F1-score on the held-out validation part.
f1_val = f1_score(y_test, y_pred_val)
print(f"F1-score on validation set: {f1_val}")

# Predict labels for the test set. Select the features in the exact column
# order used for training so a differently ordered test file cannot
# misalign features.
y_pred_test = model.predict(test_df[X.columns])
# Predictions come back as floats (the target went through the imputer);
# cast to int so they can be mapped back through the label encoder.
y_pred_test = y_pred_test.astype(int)
# Assumes 'Class' was an object column in the training file, so an encoder
# exists for it.
le = label_encoders['Class']
original_test_df['Class'] = le.inverse_transform(y_pred_test)

output_path = "Test_samsung_predicted_ver_random_forest.csv"
original_test_df.to_csv(output_path, index=False)

# Report the path that was actually written.
print(f"Predicted labels saved to {output_path}")

# Output (recorded run):
# F1-score on validation set: 0.8243243243243243
# Predicted labels saved to Test_samsung_predicted.csv
