# Implementation of Other Models

In [1]:
# Load dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer

train_df = pd.read_csv("./dataset/train_tfidf_features.csv")
test_df = pd.read_csv("./dataset/test_tfidf_features.csv")

X_train = train_df.drop(["id", "label"], axis=1)
y_train = train_df["label"]
X_test = test_df.drop(["id"], axis=1)

import os
if not os.path.exists('./predictions'):
    os.makedirs('./predictions')

def cross_validation(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring=make_scorer(f1_score, average='weighted'))
    return scores.mean()

def save_predictions(y_pred, filename):
        output = pd.DataFrame({"id": test_df["id"], "label": y_pred})
        output.to_csv(filename, index=False)

In [3]:
# Functions for self-evaluation
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

def f1_score(y_true, y_pred, class_label):
    tp = np.sum((y_true == class_label) & (y_pred == class_label))
    fp = np.sum((y_true != class_label) & (y_pred == class_label))
    fn = np.sum((y_true == class_label) & (y_pred != class_label))
    
    if tp + 0.5 * (fp + fn) == 0:
        return 0
    
    f1 = tp / (tp + 0.5 * (fp + fn))
    return f1

def macro_f1_score(y_true, y_pred):
    f1_hateful = f1_score(y_true, y_pred, class_label=1)
    f1_non_hateful = f1_score(y_true, y_pred, class_label=0)
    return (f1_hateful + f1_non_hateful) / 2

def self_evaluatation_training_set(y_train, y_train_pred):
    print("Training set accuracy:", accuracy(y_train, y_train_pred))
    print("F1 Score for Hateful (class 1):", f1_score(y_train, y_train_pred, 1))
    print("F1 Score for Non-Hateful (class 0):", f1_score(y_train, y_train_pred, 0))
    print("Macro F1 Score:", macro_f1_score(y_train, y_train_pred))

In [8]:
# PCA dimensionality reduction and logistic regression
# 2000 components seems to be too many features as indicated in task 2, reduce to test 1000, 500 and 100 for less time complexity
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV

components_list = [2000, 1000, 500, 100]
log_reg_cv = LogisticRegressionCV(cv=5, random_state=0) # max_iter 1000 no dif

for n_components in components_list:
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    log_reg_cv.fit(X_train_pca, y_train)

    y_pred = log_reg_cv.predict(X_test_pca)

    # Self evaluation on training set
    y_train_pred = log_reg_cv.predict(X_train_pca)
    print("Components:", n_components)
    self_evaluatation_training_set(y_train, y_train_pred)

    save_predictions(y_pred, f'./predictions/LogisticRegressionCV_PCA_{n_components}_components_Predictions.csv')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Components: 2000
Training set accuracy: 0.7832867783985102
F1 Score for Hateful (class 1): 0.684085510688836
F1 Score for Non-Hateful (class 0): 0.8350752878653676
Macro F1 Score: 0.7595803992771017


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Components: 1000
Training set accuracy: 0.7543063314711359
F1 Score for Hateful (class 1): 0.6357832988267771
F1 Score for Non-Hateful (class 0): 0.8146294344924482
Macro F1 Score: 0.7252063666596127
Components: 500
Training set accuracy: 0.7371973929236499
F1 Score for Hateful (class 1): 0.60886887233674
F1 Score for Non-Hateful (class 0): 0.8021207606695294
Macro F1 Score: 0.7054948165031347
Components: 100
Training set accuracy: 0.6881401303538175
F1 Score for Hateful (class 1): 0.4970436414828719
F1 Score for Non-Hateful (class 0): 0.7740058195926285
Macro F1 Score: 0.6355247305377502


In [5]:
# PCA dimensionality reduction and SVM, as SVMs typically perform better with high-dimensional and unstructured datasets, such as image and text data, compared to logistic regression.
from sklearn.decomposition import PCA
from sklearn.svm import SVC

pca = PCA(n_components=500)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    svm = SVC(kernel=kernel, random_state=0)
    svm.fit(X_train_pca, y_train)
    
    y_pred = svm.predict(X_test_pca)

    # Self evaluation on training set
    y_train_pred = svm.predict(X_train_pca)
    print("Kernel:", kernel)
    self_evaluatation_training_set(y_train, y_train_pred)

    save_predictions(y_pred, f'./predictions/SVM_PCA_500_components_{kernel}_Predictions.csv')

Kernel: linear
Training set accuracy: 0.7314944134078212
F1 Score for Hateful (class 1): 0.5715081723625557
F1 Score for Non-Hateful (class 0): 0.8044915254237288
Macro F1 Score: 0.6879998488931423
Kernel: poly
Training set accuracy: 0.8564944134078212
F1 Score for Hateful (class 1): 0.7715397443023903
F1 Score for Non-Hateful (class 0): 0.8953932298294731
Macro F1 Score: 0.8334664870659316
Kernel: rbf
Training set accuracy: 0.8780842644320298
F1 Score for Hateful (class 1): 0.8254893794252395
F1 Score for Non-Hateful (class 0): 0.9063184724768591
Macro F1 Score: 0.8659039259510493
Kernel: sigmoid
Training set accuracy: 0.6504888268156425
F1 Score for Hateful (class 1): 0.4891988433407042
F1 Score for Non-Hateful (class 0): 0.7343653250773994
Macro F1 Score: 0.6117820842090518


In [None]:
# Decision Tree Classifier, no tuning
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)

y_pred = decision_tree.predict(X_test)

# Self evaluation on training set. Shows a lot of overfitting
y_train_pred = decision_tree.predict(X_train)
self_evaluatation_training_set(y_train, y_train_pred)

save_predictions(y_pred, f'./predictions/DecisionTree_Predictions.csv')

In [15]:
# Decision Tree Classifier with max_depth after considering overfitting. Performs worse
from sklearn.tree import DecisionTreeClassifier

max_depth_values = [10, 100, 500]

# Train and predict using Decision Tree for each max_depth value
for max_depth in max_depth_values:
    decision_tree = DecisionTreeClassifier(random_state=0, max_depth=max_depth)
    decision_tree.fit(X_train, y_train)
    
    y_pred = decision_tree.predict(X_test)
    
    save_predictions(y_pred, f'./predictions/DecisionTree_max_depth_{max_depth}_Predictions.csv')

In [16]:
# Random Forest Implementation, no tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, make_scorer

rf = RandomForestClassifier(n_estimators=200, random_state=0)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Self evaluation on training set
# y_train_pred = rf.predict(X_train)
# self_evaluatation_training_set(y_train, y_train_pred)

f1_macro = make_scorer(f1_score, average='macro')
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring=f1_macro)
print("Default RF F1 Score (cross-validation):", cv_scores.mean())

# save_predictions(y_pred, './predictions/RandomForest_Predictions.csv')

KeyboardInterrupt: 

In [11]:
# Random Forest Implementation, no tuning
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, random_state=0)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Self evaluation on training set
y_train_pred = rf.predict(X_train)
self_evaluatation_training_set(y_train, y_train_pred)

save_predictions(y_pred, './predictions/RandomForest_500_Predictions.csv')

# Random Forest with 100 estimators
rf_1000 = RandomForestClassifier(n_estimators=100, random_state=0)
rf_1000.fit(X_train, y_train)
y_pred_1000 = rf_1000.predict(X_test)

# Self evaluation on training set
y_train_pred_100 = rf_1000.predict(X_train)
self_evaluatation_training_set(y_train, y_train_pred)

save_predictions(y_pred_1000, './predictions/RandomForest_1000_Predictions.csv')

Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954951515614263
F1 Score for Non-Hateful (class 0): 0.9972262705091439
Macro F1 Score: 0.9963607110352851
100 Estimators - Training set accuracy: 0.996566573556797
100 Estimators - F1 Score for Hateful (class 1): 0.9954951515614263
100 Estimators - F1 Score for Non-Hateful (class 0): 0.9972262705091439
100 Estimators - Macro F1 Score: 0.9963607110352851


In [4]:
# Random Forest Hyperparameter tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
import numpy as np

param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=0)

scorer = make_scorer(f1_score, average='macro')

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions, 
                                   n_iter=20, cv=3, scoring=scorer, verbose=2, random_state=0, n_jobs=-1)

random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_rf = random_search.best_estimator_

print("Best Parameters:", best_params)

y_pred = best_rf.predict(X_test)

save_predictions(y_pred, './predictions/RandomForest_Tuned_Predictions.csv')

# Self evaluation on training set
y_train_pred = best_rf.predict(X_train)
self_evaluatation_training_set(y_train, y_train_pred)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None}
Training set accuracy: 0.9917946927374302
F1 Score for Hateful (class 1): 0.9892358195282083
F1 Score for Non-Hateful (class 0): 0.9933706333160939
Macro F1 Score: 0.9913032264221511


In [None]:
# Decision Tree Classifier, no tuning
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)

y_pred = decision_tree.predict(X_test)

# Self evaluation on training set. Shows a lot of overfitting
y_train_pred = decision_tree.predict(X_train)
self_evaluatation_training_set(y_train, y_train_pred)

save_predictions(y_pred, f'./predictions/DecisionTree_Predictions.csv')

Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954903309638462
F1 Score for Non-Hateful (class 0): 0.9972280949025135
Macro F1 Score: 0.9963592129331799


In [8]:
# PCA and random forest
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

n_components_list = [100, 500, 1000]

for n_components in n_components_list:
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    
    n_estimators_list = [100, 200, 300]
    
    for n_estimators in n_estimators_list:
        print(f"\nTraining Random Forest with {n_estimators} estimators, {n_components} components")
        
        # Random Forest Implementation
        rf = RandomForestClassifier(n_estimators=n_estimators, random_state=0)
        rf.fit(X_train_pca, y_train)
        
        y_pred = rf.predict(X_test_pca)
        
        # Self evaluation on training set
        y_train_pred = rf.predict(X_train_pca)
        self_evaluatation_training_set(y_train, y_train_pred)
        
        # Save predictions
        save_predictions(y_pred, f'./predictions/RandomForest_Predictions_{n_components}_components_{n_estimators}_estimators.csv')


Training Random Forest with 100 estimators, 100 components
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954965269826731
F1 Score for Non-Hateful (class 0): 0.9972257488127145
Macro F1 Score: 0.9963611378976938

Training Random Forest with 200 estimators, 100 components
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954937752997786
F1 Score for Non-Hateful (class 0): 0.9972267920094007
Macro F1 Score: 0.9963602836545896

Training Random Forest with 300 estimators, 100 components
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954979015642884
F1 Score for Non-Hateful (class 0): 0.9972252269200019
Macro F1 Score: 0.9963615642421452

Training Random Forest with 100 estimators, 500 components
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954972143783867
F1 Score for Non-Hateful (class 0): 0.9972254878909005
Macro F1 Score: 0.9963613511346436

Training Random Forest with 200

In [4]:
# Gradient Boosting using XGBoost
import xgboost as xgb

xgb_model = xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

# Self evaluation on training set
y_train_pred = xgb_model.predict(X_train)
self_evaluatation_training_set(y_train, y_train_pred)

save_predictions(y_pred, './predictions/XGBoost_Predictions.csv')

Parameters: { "use_label_encoder" } are not used.



Training set accuracy: 0.7852071694599627
F1 Score for Hateful (class 1): 0.660909508497933
F1 Score for Non-Hateful (class 0): 0.8428224673167823
Macro F1 Score: 0.7518659879073577


In [20]:
# TORUN: TVSD and Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

n_components_list = [100, 500, 1000]

for n_components in n_components_list:
    print(f"\nApplying TSVD with {n_components} components")
    
    tsvd = TruncatedSVD(n_components=n_components)
    X_train_tsvd = tsvd.fit_transform(X_train)
    X_test_tsvd = tsvd.transform(X_test)
    
    print(f"\nTraining Logistic Regression with {n_components} TSVD components")
    
    lr = LogisticRegression(random_state=0, max_iter=100)
    lr.fit(X_train_tsvd, y_train)
    
    y_pred = lr.predict(X_test_tsvd)
    
    # Self evaluation on training set
    y_train_pred = lr.predict(X_train_tsvd)
    self_evaluatation_training_set(y_train, y_train_pred)
    
    save_predictions(y_pred, f'./predictions/LogisticRegression_Predictions_{n_components}_components_TSVD.csv')


Applying TSVD with 100 components


KeyboardInterrupt: 

In [24]:
# SVM
from sklearn.svm import SVC

kernels = ['sigmoid', 'linear', 'rbf']

for kernel in kernels:
    svm = SVC(kernel=kernel)
    svm.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_test_pred = svm.predict(X_test)

    #Self evaluation on training set
    y_train_pred = svm.predict(X_train)
    self_evaluatation_training_set(y_train, y_train_pred)
    
    save_predictions(y_test_pred, f"./predictions/svm_{kernel}_predictions_default.csv")

Training set accuracy: 0.7540735567970205
F1 Score for Hateful (class 1): 0.6373154823206316
F1 Score for Non-Hateful (class 0): 0.8139637260081
Macro F1 Score: 0.7256396041643658
Training set accuracy: 0.8127327746741154
F1 Score for Hateful (class 1): 0.7275651879444632
F1 Score for Non-Hateful (class 0): 0.8573328604362476
Macro F1 Score: 0.7924490241903555
Training set accuracy: 0.9362779329608939
F1 Score for Hateful (class 1): 0.9122947537044453
F1 Score for Non-Hateful (class 0): 0.94996115706256
Macro F1 Score: 0.9311279553835027


In [25]:
# SVM Tuning. Took 8 hours and no result
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def self_evaluatation_training_set(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Self-evaluation accuracy on training set: {accuracy}")

# Define parameter grid for RBF kernel
param_grid_rbf = {'C': [0.1, 1, 10, 100], 'gamma': [1e-3, 1e-2, 1e-1, 1]}

# Perform Grid Search for RBF kernel
print("Tuning parameters for RBF kernel")
svm_rbf = SVC(kernel='rbf')
grid_search_rbf = GridSearchCV(svm_rbf, param_grid_rbf, cv=5, scoring='accuracy')
grid_search_rbf.fit(X_train, y_train)

# Best parameters and model for RBF kernel
best_params_rbf = grid_search_rbf.best_params_
best_model_rbf = grid_search_rbf.best_estimator_
print(f"Best parameters for RBF kernel: {best_params_rbf}")

# Make predictions on the test set with the best model
y_test_pred_rbf = best_model_rbf.predict(X_test)

# Self evaluation on training set
y_train_pred_rbf = best_model_rbf.predict(X_train)
self_evaluatation_training_set(y_train, y_train_pred_rbf)

# Save predictions to a CSV file for RBF kernel
filename_rbf = "./predictions/svm_rbf_predictions_tuned.csv"
save_predictions(y_test_pred_rbf, filename_rbf)
print(f"Predictions saved to {filename_rbf}")

Tuning parameters for RBF kernel


In [8]:
# Random Forest Implementation, varying estimators
from sklearn.ensemble import RandomForestClassifier

n_estimators_values = [180, 220]

for n_estimators in n_estimators_values:
    print(f"Training Random Forest with n_estimators={n_estimators}")
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=0)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    y_train_pred = rf.predict(X_train)
    self_evaluatation_training_set(y_train, y_train_pred)
    
    save_predictions(y_pred, f"./predictions/RandomForest_{n_estimators}_estimators_Predictions.csv")

Training Random Forest with n_estimators=180
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954965269826731
F1 Score for Non-Hateful (class 0): 0.9972257488127145
Macro F1 Score: 0.9963611378976938
Training Random Forest with n_estimators=220
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954965269826731
F1 Score for Non-Hateful (class 0): 0.9972257488127145
Macro F1 Score: 0.9963611378976938


In [9]:
# Random Forest Implementation, varying max_depth. performs worse
from sklearn.ensemble import RandomForestClassifier

max_depth_values = [10, 50, 100, 500]

for max_depth in max_depth_values:
    print(f"Training Random Forest with max_depth={max_depth}")
    rf = RandomForestClassifier(n_estimators=200, max_depth=max_depth, random_state=0)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    y_train_pred = rf.predict(X_train)
    self_evaluatation_training_set(y_train, y_train_pred)
    
    save_predictions(y_pred, f"./predictions/RandomForest_max_depth_{max_depth}_Predictions.csv")

Training Random Forest with max_depth=10
Training set accuracy: 0.6213337988826816
F1 Score for Hateful (class 1): 0.0133434420015163
F1 Score for Non-Hateful (class 0): 0.7657077017246966
Macro F1 Score: 0.38952557186310643
Training Random Forest with max_depth=50
Training set accuracy: 0.8119762569832403
F1 Score for Hateful (class 1): 0.6755046700813498
F1 Score for Non-Hateful (class 0): 0.8676416369669412
Macro F1 Score: 0.7715731535241455
Training Random Forest with max_depth=100
Training set accuracy: 0.8948440409683427
F1 Score for Hateful (class 1): 0.8403851249889586
F1 Score for Non-Hateful (class 0): 0.9215950015186358
Macro F1 Score: 0.8809900632537973
Training Random Forest with max_depth=500
Training set accuracy: 0.9962756052141527
F1 Score for Hateful (class 1): 0.9951130116065975
F1 Score for Non-Hateful (class 0): 0.9969913501316284
Macro F1 Score: 0.996052180869113


In [10]:
# Random Forest Implementation, varying criterions. perform equal or worse
from sklearn.ensemble import RandomForestClassifier

criterions = ["entropy", "log_loss"]

for criterion in criterions:
    print(f"Training Random Forest with criterion={criterion}")
    rf = RandomForestClassifier(n_estimators=200, criterion=criterion, random_state=0)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    
    y_train_pred = rf.predict(X_train)
    self_evaluatation_training_set(y_train, y_train_pred)
    
    save_predictions(y_pred, f"./predictions/RandomForest_{criterion}_Predictions.csv")

Training Random Forest with criterion=entropy
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954965269826731
F1 Score for Non-Hateful (class 0): 0.9972257488127145
Macro F1 Score: 0.9963611378976938
Training Random Forest with criterion=log_loss
Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954965269826731
F1 Score for Non-Hateful (class 0): 0.9972257488127145
Macro F1 Score: 0.9963611378976938


In [14]:
# Gradient Boosting
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from xgboost import XGBClassifier

xgb = XGBClassifier(eval_metric='error')

f1_macro = make_scorer(f1_score, average='macro')
cv_scores = cross_val_score(xgb, X_train, y_train, cv=5, scoring=f1_macro)
print("Default XGBoost F1 Score (cross-validation):", cv_scores.mean())

xgb.fit(X_train, y_train)

final_predictions = xgb.predict(X_test)
save_predictions(final_predictions, './predictions/XGBoost.csv')

Default XGBoost F1 Score (cross-validation): 0.6667001387640876


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

rfe = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=1000, step=10)
X_train_selected = rfe.fit_transform(X_train, y_train)
X_test_selected = rfe.transform(X_test)
selected_features = X_train.columns[rfe.support_]

print(f"Number of selected features: {len(selected_features)}")

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
rf_classifier.fit(X_train_selected, y_train)

test_predictions = rf_classifier.predict(X_test_selected)

save_predictions(test_predictions, './predictions/rf_rfe_submission.csv')