In [7]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import json

feature_matrix = np.load("features.npy") # Defines nodes feature
labels = np.load("labels.npy") 

# Load the indices
with open("splits.json", "r") as f:
    data_splits = json.load(f)

train_indices0 = data_splits["idx_train"]
test_indices = data_splits["idx_test"]

# Assuming the length of the main data frame is `num_samples`
num_samples = len(feature_matrix)

# Create an array with the same length as the main data frame, filled with -1
new_labels_array = np.full(num_samples, -1, dtype=np.int8)

# Assign the labels to the corresponding indices in train_indices0
for i, index in enumerate(train_indices0):
    new_labels_array[index] = labels[i]

# Get the train_indices as a boolean mask
train_indices_mask = new_labels_array != -1

# Extract features and labels for the training dataset
train_features = feature_matrix[train_indices_mask]
train_labels = new_labels_array[train_indices_mask]

# Apply PCA with 85% variance coverage
pca = PCA(n_components=0.85)
pca_train_features = pca.fit_transform(train_features)
pca_feature_matrix = pca.transform(feature_matrix)

# Grid search for the best hyperparameters
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto']
}

svc = SVC()
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(pca_train_features, train_labels)

print("Best hyperparameters:", grid_search.best_params_)
print("Best average validation accuracy:", grid_search.best_score_)

# Train the SVM classifier with the best hyperparameters
best_svc = grid_search.best_estimator_

# Split the train_indices0 into train and validation sets for final evaluation
train_indices, val_indices = train_test_split(train_indices0, test_size=0.3, stratify=labels, random_state=42)

# Train and evaluate the SVM classifier
best_svc.fit(pca_feature_matrix[train_indices], new_labels_array[train_indices])
val_preds = best_svc.predict(pca_feature_matrix[val_indices])
val_accuracy = accuracy_score(new_labels_array[val_indices], val_preds)

print("Validation accuracy:", val_accuracy)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best hyperparameters: {'C': 10, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf'}
Best average validation accuracy: 0.6350505050505051
Validation accuracy: 0.6510067114093959


In [8]:
test_preds = best_svc.predict(pca_feature_matrix[test_indices])
# val_accuracy = accuracy_score(new_labels_array[test_indices], test_preds)
test_preds

array([2, 2, 1, ..., 1, 0, 2], dtype=int8)

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Grid search for the best hyperparameters
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dtc = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dtc, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(pca_train_features, train_labels)

# ... (same as your code from here)

print("Best hyperparameters:", grid_search.best_params_)
print("Best average validation accuracy:", grid_search.best_score_)

# Train the SVM classifier with the best hyperparameters
best_dtc = grid_search.best_estimator_

# Split the train_indices0 into train and validation sets for final evaluation
train_indices, val_indices = train_test_split(train_indices0, test_size=0.3, stratify=labels, random_state=42)

# Train and evaluate the SVM classifier
best_dtc.fit(pca_feature_matrix[train_indices], new_labels_array[train_indices])
val_preds = best_dtc.predict(pca_feature_matrix[val_indices])
val_accuracy = accuracy_score(new_labels_array[val_indices], val_preds)

print("Validation accuracy:", val_accuracy)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best average validation accuracy: 0.4032323232323233
Validation accuracy: 0.3825503355704698


In [10]:
test_preds = best_dtc.predict(pca_feature_matrix[test_indices])
# val_accuracy = accuracy_score(new_labels_array[test_indices], test_preds)
test_preds

array([1, 2, 6, ..., 0, 0, 6], dtype=int8)

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Grid search for the best hyperparameters
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(pca_train_features, train_labels)

# ... (same as your code from here)
print("Best hyperparameters:", grid_search.best_params_)
print("Best average validation accuracy:", grid_search.best_score_)

# Train the SVM classifier with the best hyperparameters
best_knn = grid_search.best_estimator_

# Split the train_indices0 into train and validation sets for final evaluation
train_indices, val_indices = train_test_split(train_indices0, test_size=0.3, stratify=labels, random_state=42)

# Train and evaluate the SVM classifier
best_knn.fit(pca_feature_matrix[train_indices], new_labels_array[train_indices])
val_preds = best_knn.predict(pca_feature_matrix[val_indices])
val_accuracy = accuracy_score(new_labels_array[val_indices], val_preds)

print("Validation accuracy:", val_accuracy)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best hyperparameters: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
Best average validation accuracy: 0.4111919191919192
Validation accuracy: 0.4563758389261745


In [12]:
test_preds = best_knn.predict(pca_feature_matrix[test_indices])
# val_accuracy = accuracy_score(new_labels_array[test_indices], test_preds)
test_preds

array([6, 2, 2, ..., 4, 2, 6], dtype=int8)

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Grid search for the best hyperparameters
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rfc = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(pca_train_features, train_labels)

# ... (same as your code from here)
print("Best hyperparameters:", grid_search.best_params_)
print("Best average validation accuracy:", grid_search.best_score_)

# Train the SVM classifier with the best hyperparameters
best_rfc = grid_search.best_estimator_

# Split the train_indices0 into train and validation sets for final evaluation
train_indices, val_indices = train_test_split(train_indices0, test_size=0.3, stratify=labels, random_state=42)

# Train and evaluate the SVM classifier
best_rfc.fit(pca_feature_matrix[train_indices], new_labels_array[train_indices])
val_preds = best_rfc.predict(pca_feature_matrix[val_indices])
val_accuracy = accuracy_score(new_labels_array[val_indices], val_preds)

print("Validation accuracy:", val_accuracy)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best hyperparameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Best average validation accuracy: 0.49000000000000005
Validation accuracy: 0.48322147651006714


In [6]:
test_preds = best_rfc.predict(pca_feature_matrix[test_indices])
# val_accuracy = accuracy_score(new_labels_array[test_indices], test_preds)
test_preds

array([3, 2, 2, 1, 3, 1, 1, 2, 6, 2, 2, 2, 2, 2, 2, 2, 6, 2, 4, 2, 6, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 6, 2,
       2, 2, 6, 1, 2, 2, 2, 2, 2, 3, 1, 2, 0, 2, 2, 2, 2, 4, 3, 1, 2, 2,
       2, 2, 2, 2, 3, 1, 2, 0, 2, 2, 2, 2, 6, 3, 2, 1, 2, 3, 2, 2, 2, 2,
       2, 3, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 6, 0, 3, 2, 2, 6, 2,
       0, 2, 2, 2, 1, 6, 2, 6, 6, 2, 3, 2, 3, 6, 2, 2, 2, 2, 2, 2, 2, 2,
       3, 3, 1, 6, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 0, 2, 2], dtype=int8)

In [13]:
from sklearn.linear_model import LogisticRegression

# Grid search for the best hyperparameters
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 500]
}

lr = LogisticRegression(random_state=42)
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(pca_train_features, train_labels)

# ... (same as your code from here)
print("Best hyperparameters:", grid_search.best_params_)
print("Best average validation accuracy:", grid_search.best_score_)

# Train the SVM classifier with the best hyperparameters
best_lr = grid_search.best_estimator_

# Split the train_indices0 into train and validation sets for final evaluation
train_indices, val_indices = train_test_split(train_indices0, test_size=0.3, stratify=labels, random_state=42)

# Train and evaluate the SVM classifier
best_lr.fit(pca_feature_matrix[train_indices], new_labels_array[train_indices])
val_preds = best_lr.predict(pca_feature_matrix[val_indices])
val_accuracy = accuracy_score(new_labels_array[val_indices], val_preds)

print("Validation accuracy:", val_accuracy)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits




Best hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best average validation accuracy: 0.6733737373737374
Validation accuracy: 0.6912751677852349


810 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aditya\anaconda3\envs\gnn\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aditya\anaconda3\envs\gnn\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Aditya\anaconda3\envs\gnn\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 pe

In [19]:
test_preds = best_lr.predict(pca_feature_matrix[test_indices])
# val_accuracy = accuracy_score(new_labels_array[test_indices], test_preds)
test_preds

array([2, 3, 1, ..., 1, 0, 2], dtype=int8)

In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Grid search for the best hyperparameters
param_grid = {
    'base_estimator': [DecisionTreeClassifier(max_depth=d) for d in range(3, 11)],
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.001, 0.01, 0.1, 1]
}

abc = AdaBoostClassifier(random_state=42)
grid_search = GridSearchCV(abc, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(pca_train_features, train_labels)

# ... (same as your code from here)
print("Best hyperparameters:", grid_search.best_params_)
print("Best average validation accuracy:", grid_search.best_score_)

# Train the SVM classifier with the best hyperparameters
best_abc = grid_search.best_estimator_

# Split the train_indices0 into train and validation sets for final evaluation
train_indices, val_indices = train_test_split(train_indices0, test_size=0.3, stratify=labels, random_state=42)

# Train and evaluate the SVM classifier
best_abc.fit(pca_feature_matrix[train_indices], new_labels_array[train_indices])
val_preds = best_abc.predict(pca_feature_matrix[val_indices])
val_accuracy = accuracy_score(new_labels_array[val_indices], val_preds)

print("Validation accuracy:", val_accuracy)

Fitting 5 folds for each of 96 candidates, totalling 480 fits




Best hyperparameters: {'base_estimator': DecisionTreeClassifier(max_depth=7), 'learning_rate': 0.01, 'n_estimators': 100}
Best average validation accuracy: 0.4758383838383839




Validation accuracy: 0.5100671140939598


In [17]:
test_preds = best_abc.predict(pca_feature_matrix[test_indices])
# val_accuracy = accuracy_score(new_labels_array[test_indices], test_preds)
test_preds

array([2, 2, 2, ..., 2, 0, 4], dtype=int8)

In [20]:
# Save predictions to a file
with open("submissionlr.txt", "w") as submission_file:
    for prediction in test_preds:
        submission_file.write(str(prediction) + "\n")


In [18]:
from sklearn.ensemble import GradientBoostingClassifier

# Grid search for the best hyperparameters
param_grid = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'max_depth': [2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

gbc = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(pca_train_features, train_labels)

# ... (same as your code from here)
print("Best hyperparameters:", grid_search.best_params_)
print("Best average validation accuracy:", grid_search.best_score_)

# Train the SVM classifier with the best hyperparameters
best_gbc = grid_search.best_estimator_

# Split the train_indices0 into train and validation sets for final evaluation
train_indices, val_indices = train_test_split(train_indices0, test_size=0.3, stratify=labels, random_state=42)

# Train and evaluate the SVM classifier
best_gbc.fit(pca_feature_matrix[train_indices], new_labels_array[train_indices])
val_preds = best_gbc.predict(pca_feature_matrix[val_indices])
val_accuracy = accuracy_score(new_labels_array[val_indices], val_preds)

print("Validation accuracy:", val_accuracy)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits




KeyboardInterrupt: 