In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score
import pandas as pd


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train_features, train_labels = train.iloc[:, :-1] , train.iloc[:, -1]

In [5]:
print(train_features.shape, train_labels.shape)
print(train_features.head()) 
print(train_labels.head())    

(4800, 26) (4800,)
        x_0       x_1       x_2       x_3       x_4       x_5       x_6  \
0 -0.789364 -1.434296  0.324867 -1.089570 -3.186751 -1.915492 -1.985052   
1  1.698561 -0.530175  0.756504 -0.849795  0.738859  4.321680  3.322877   
2  2.480805 -2.933747  1.407295 -0.356059  1.179147 -4.181063 -4.177118   
3  1.457755 -0.106902 -0.852411  1.175998  6.619029  2.156072 -0.777952   
4  1.427555  0.649282  0.254497 -1.064585  0.303576 -4.545240  4.577316   

        x_7       x_8        x_9  ...      x_16      x_17      x_18      x_19  \
0 -5.109614 -1.776943 -10.228432  ... -0.868734 -1.145350 -1.157258 -4.935825   
1 -4.769473 -1.148654  -0.623213  ... -2.094125  1.077191 -3.360013 -7.324134   
2 -2.854105 -0.223620  -2.034928  ... -1.735243 -3.219309 -0.026445 -8.659095   
3  5.031490  0.476906   3.009128  ...  2.225405  4.263037 -0.784668  5.115430   
4 -5.233015 -1.007334   0.511030  ... -0.103665  3.363232 -1.767302 -3.108375   

       x_20      x_21      x_22      x_23  

In [6]:
# Random Forest classifier
rf_classifier = RandomForestClassifier()

# Hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [7]:
# Grid search with F1-score as the scoring metric
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid,
                           cv=10, n_jobs=-1, verbose=2, scoring='f1_weighted')

# Perform the grid search on the training data
grid_search.fit(train_features, train_labels)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Predictions on the test set using the best model
y_pred = grid_search.predict(train_features)

# Calculate F1-score on the test set
f1 = f1_score(train_labels, y_pred, average='weighted')
print("Weighted F1-score:", f1)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Weighted F1-score: 1.0


In [8]:
print(test.head())

   ID       x_0       x_1       x_2       x_3       x_4       x_5       x_6  \
0   1 -0.230293 -3.466028  1.511166  0.740295  3.696918 -2.578689  2.263205   
1   2 -0.589310  2.695952 -0.447133  1.742419 -3.912262  7.050236 -2.624268   
2   3  2.070704 -1.921016  1.352349  1.948624 -1.549088 -0.623295 -0.013214   
3   4  0.130017  2.225700 -0.504748 -0.401777  2.244243  4.770526  1.789050   
4   5 -1.718615  0.253217 -0.539986  0.261817  0.246253 -0.502865 -1.190270   

        x_7       x_8  ...      x_16      x_17      x_18      x_19      x_20  \
0 -0.126368  1.207075  ... -0.717038 -2.280132 -4.019121  4.175089 -2.939001   
1  2.292610 -0.640342  ... -3.313892 -7.084135 -0.161589 -3.913306 -3.592095   
2  4.281549 -0.569961  ...  1.295106  0.363587  2.207610  4.304411 -1.301508   
3 -1.553924  1.116070  ...  8.556711 -5.356854  2.574727  6.959246  0.220325   
4 -1.416252 -1.735776  ... -4.632323 -1.219645  2.092873 -2.675771 -4.998719   

       x_21      x_22      x_23      x_24   

In [9]:
test_features = test.iloc[:, 1:]

In [10]:
grid_search_labels = grid_search.predict(test_features)

In [11]:
test_pred_df = pd.DataFrame({'Predicted_Labels': grid_search_labels})

# Assuming test_features has an 'ID' column for each test sample
# If not, you can generate an ID column or use any other unique identifier for each sample
test_pred_df['ID'] = test['ID']  # Assuming 'ID' is the column name for the sample ID

# Save the predictions to a CSV file
test_pred_df.to_csv('grid_search.csv', index=False)

print("Predictions saved.")

Predictions saved.
