In [1]:
import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.decomposition import PCA

from time import time

def all_same(items):
  return len(set(items)) == 1

# Load training data from csv file
data = pd.read_csv("train.csv")

# Extract feature columns
feature_cols = list(data.columns[1:])

# Extract target column 'label'
target_col = data.columns[0]

# Separate the data into feature data and target data (X and y, respectively)
X = data[feature_cols]
y = data[target_col]

# Apply PCA by fitting the data with only 60 dimensions
pca = PCA(n_components=60).fit(X)
# Transform the data using the PCA fit above
X = pca.transform(X)
y = y.values


# Shuffle and split the dataset into the number of training and testing points above
sss = cross_validation.StratifiedShuffleSplit(y, 3, test_size=0.4, random_state=42)
for train_index, test_index in sss:
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

# Fit a KNN classifier on the training set
knn_clf = KNeighborsClassifier(n_neighbors=3, p=2)
knn_clf.fit(X_train, y_train)

# Initialize the array of predicted labels
y_pred = np.empty(len(y_test), dtype=np.int)

start = time()

# Find the nearest neighbors indices for each sample in the test set
kneighbors = knn_clf.kneighbors(X_test, return_distance=False)

# For each set of neighbors indices
for idx, indices in enumerate(kneighbors):
  # Find the actual training samples & their labels
  neighbors = [X_train[i] for i in indices]
  neighbors_labels = [y_train[i] for i in indices]

  # if all labels are the same, use it as the prediction
  if all_same(neighbors_labels):
    y_pred[idx] = neighbors_labels[0]
  else:
    # else fit a SVM classifier using the neighbors, and label the test samples
    svm_clf = svm.SVC(C=0.5, kernel='rbf', decision_function_shape='ovo', random_state=42)
    svm_clf.fit(neighbors, neighbors_labels)
    label = svm_clf.predict(X_test[idx].reshape(1, -1))

    y_pred[idx] = label
end = time()

print(accuracy_score(y_test, y_pred))
print("Made predictions in {:.4f} seconds.".format(end - start))

ImportError: cannot import name 'cross_validation' from 'sklearn' (/usr/local/lib/python3.10/dist-packages/sklearn/__init__.py)

In [2]:
import numpy as np
import pandas as pd

from sklearn import model_selection # Import model_selection instead of cross_validation
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.decomposition import PCA

from time import time

def all_same(items):
  return len(set(items)) == 1

# Load training data from csv file
data = pd.read_csv("train.csv")

# Extract feature columns
feature_cols = list(data.columns[1:])

# Extract target column 'label'
target_col = data.columns[0]

# Separate the data into feature data and target data (X and y, respectively)
X = data[feature_cols]
y = data[target_col]

# Apply PCA by fitting the data with only 60 dimensions
pca = PCA(n_components=60).fit(X)
# Transform the data using the PCA fit above
X = pca.transform(X)
y = y.values


# Shuffle and split the dataset into the number of training and testing points above
# Use model_selection.StratifiedShuffleSplit instead of cross_validation.StratifiedShuffleSplit
sss = model_selection.StratifiedShuffleSplit(n_splits=3, test_size=0.4, random_state=42)
for train_index, test_index in sss.split(X, y): # Use split(X, y) to generate indices
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

# Fit a KNN classifier on the training set
knn_clf = KNeighborsClassifier(n_neighbors=3, p=2)
knn_clf.fit(X_train, y_train)

# Initialize the array of predicted labels
y_pred = np.empty(len(y_test), dtype=np.int)

start = time()

# Find the nearest neighbors indices for each sample in the test set
kneighbors = knn_clf.kneighbors(X_test, return_distance=False)

# For each set of neighbors indices
for idx, indices in enumerate(kneighbors):
  # Find the actual training samples & their labels
  neighbors = [X_train[i] for i in indices]
  neighbors_labels = [y_train[i] for i in indices]

  # if all labels are the same, use it as the prediction
  if all_same(neighbors_labels):
    y_pred[idx] = neighbors_labels[0]
  else:
    # else fit a SVM classifier using the neighbors, and label the test samples
    svm_clf = svm.SVC(C=0.5, kernel='rbf', decision_function_shape='ovo', random_state=42)
    svm_clf.fit(neighbors, neighbors_labels)
    label = svm_clf.predict(X_test[idx].reshape(1, -1))

    y_pred[idx] = label
end = time()

print(accuracy_score(y_test, y_pred))
print("Made predictions in {:.4f} seconds.".format(end - start))

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [3]:
import numpy as np
import pandas as pd

from sklearn import model_selection # Import model_selection instead of cross_validation
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer # Import SimpleImputer to handle missing values

from time import time

def all_same(items):
  return len(set(items)) == 1

# Load training data from csv file
data = pd.read_csv("train.csv")

# Extract feature columns
feature_cols = list(data.columns[1:])

# Extract target column 'label'
target_col = data.columns[0]

# Separate the data into feature data and target data (X and y, respectively)
X = data[feature_cols]
y = data[target_col]

# Impute missing values using the mean strategy
imputer = SimpleImputer(strategy='mean') # Create an imputer object
X = imputer.fit_transform(X) # Impute missing values in X

# Apply PCA by fitting the data with only 60 dimensions
pca = PCA(n_components=60).fit(X)
# Transform the data using the PCA fit above
X = pca.transform(X)
y = y.values


# Shuffle and split the dataset into the number of training and testing points above
# Use model_selection.StratifiedShuffleSplit instead of cross_validation.StratifiedShuffleSplit
sss = model_selection.StratifiedShuffleSplit(n_splits=3, test_size=0.4, random_state=42)
for train_index, test_index in sss.split(X, y): # Use split(X, y) to generate indices
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

# Fit a KNN classifier on the training set
knn_clf = KNeighborsClassifier(n_neighbors=3, p=2)
knn_clf.fit(X_train, y_train)

# Initialize the array of predicted labels
y_pred = np.empty(len(y_test), dtype=np.int)

start = time()

# Find the nearest neighbors indices for each sample in the test set
kneighbors = knn_clf.kneighbors(X_test, return_distance=False)

# For each set of neighbors indices
for idx, indices in enumerate(kneighbors):
  # Find the actual training samples & their labels
  neighbors = [X_train[i] for i in indices]
  neighbors_labels = [y_train[i] for i in indices]

  # if all labels are the same, use it as the prediction
  if all_same(neighbors_labels):
    y_pred[idx] = neighbors_labels[0]
  else:
    # else fit a SVM classifier using the neighbors, and label the test samples
    svm_clf = svm.SVC(C=0.5, kernel='rbf', decision_function_shape='ovo', random_state=42)
    svm_clf.fit(neighbors, neighbors_labels)
    label = svm_clf.predict(X_test[idx].reshape(1, -1))

    y_pred[idx] = label
end = time()

print(accuracy_score(y_test, y_pred))
print("Made predictions in {:.4f} seconds.".format(end - start))

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [5]:
import numpy as np
import pandas as pd

from sklearn import model_selection # Import model_selection instead of cross_validation
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer # Import SimpleImputer to handle missing values

from time import time

def all_same(items):
  return len(set(items)) == 1

# Load training data from csv file
data = pd.read_csv("train.csv")

# Extract feature columns
feature_cols = list(data.columns[1:])

# Extract target column 'label'
target_col = data.columns[0]

# Separate the data into feature data and target data (X and y, respectively)
X = data[feature_cols]
y = data[target_col]

# Impute missing values using the mean strategy
imputer = SimpleImputer(strategy='mean') # Create an imputer object
X = imputer.fit_transform(X) # Impute missing values in X

# Apply PCA by fitting the data with only 60 dimensions
pca = PCA(n_components=60).fit(X)
# Transform the data using the PCA fit above
X = pca.transform(X)
y = y.values


# Shuffle and split the dataset into the number of training and testing points above
# Use model_selection.StratifiedShuffleSplit instead of cross_validation.StratifiedShuffleSplit
sss = model_selection.StratifiedShuffleSplit(n_splits=3, test_size=0.4, random_state=42)
for train_index, test_index in sss.split(X, y): # Use split(X, y) to generate indices
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

# Fit a KNN classifier on the training set
knn_clf = KNeighborsClassifier(n_neighbors=3, p=2)
knn_clf.fit(X_train, y_train)

# Initialize the array of predicted labels
y_pred = np.empty(len(y_test), dtype=int) # Use int instead of np.int

start = time()

# Find the nearest neighbors indices for each sample in the test set
kneighbors = knn_clf.kneighbors(X_test, return_distance=False)

# For each set of neighbors indices
for idx, indices in enumerate(kneighbors):
  # Find the actual training samples & their labels
  neighbors = [X_train[i] for i in indices]
  neighbors_labels = [y_train[i] for i in indices]

  # if all labels are the same, use it as the prediction
  if all_same(neighbors_labels):
    y_pred[idx] = neighbors_labels[0]
  else:
    # else fit a SVM classifier using the neighbors, and label the test samples
    svm_clf = svm.SVC(C=0.5, kernel='rbf', decision_function_shape='ovo', random_state=42)
    svm_clf.fit(neighbors, neighbors_labels)
    label = svm_clf.predict(X_test[idx].reshape(1, -1))

    y_pred[idx] = label
end = time()

print(accuracy_score(y_test, y_pred))
print("Made predictions in {:.4f} seconds.".format(end - start))

  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[i

0.964018326069952
Made predictions in 2.6969 seconds.


  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[idx] = label
  y_pred[i