In [4]:
import pandas as pd
import numpy as np
import time
import math
import matplotlib.pyplot as plt

# splitting the data into train and test sets
from sklearn.model_selection import train_test_split

# preprocessing and model pipelines
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.neighbors import (
    KNeighborsClassifier,
    RadiusNeighborsClassifier,
    NeighborhoodComponentsAnalysis,
)
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet

In [5]:
# Load the diabetes dataset from a CSV file and display it.
# NOTE(review): this is an absolute path on the original author's machine;
# adjust it (or make it relative to the repository) before running elsewhere.
csv_path = r"/home/ahmed/Ml-algorithms-from-scratch-and-scikit-learn-in-python-/DataSets/diabetes.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
# Predictor columns used as model inputs.
feature_columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
features = df[feature_columns]

# Target labels to predict.
tests = df["Outcome"]

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    tests,
    test_size=0.3,
    random_state=0,
)

# Logistic Regression Model


In [7]:
# Two-step pipeline: standardize the features, then fit logistic regression.
scaler = StandardScaler()
model = LogisticRegression()
log_reg = Pipeline(
    steps=[
        ("StandardScaler", scaler),
        ("LogisticRegression", model),
    ]
)

In [8]:
# Fit the pipeline on the training split and predict the held-out rows.
log_reg.fit(x_train, y_train)
prediction = log_reg.predict(X=x_test)

# Pipeline.score() delegates to the classifier's score (mean accuracy);
# multiply by 100 to report it as a percentage.
test_accuracy = log_reg.score(X=x_test, y=y_test) * 100
print("x_test score = ", test_accuracy)
train_accuracy = log_reg.score(X=x_train, y=y_train) * 100
print("x_train score = ", train_accuracy)

x_test score =  77.92207792207793
x_train score =  76.35009310986965


# Nearest Neighbors

`sklearn.neighbors` provides functionality for unsupervised and supervised neighbors-based learning methods. Supervised neighbors-based learning is used for classification of data with discrete labels, and for regression on data with continuous labels.

The principle behind these methods is to find a predefined number of training samples closest in distance to the new point, and to predict the label from these. The number of samples can be a user-defined constant (k-nearest neighbor learning), or vary based on the local density of points (radius-based neighbor learning).


# Nearest Neighbors Classification

Neighbors-based classification does not attempt to construct a general internal model, but simply stores instances of the training data. Classification is computed from a simple majority vote of the nearest neighbors of each point: a query point is assigned the data class which has the most representatives within the nearest neighbors of the point.

There are two different nearest neighbors classifiers: KNeighborsClassifier (KNC) implements learning based on the k nearest neighbors of each query point, where k is an integer value specified by the user. RadiusNeighborsClassifier (RNC) implements learning based on the number of neighbors within a fixed radius r of each training point, where r is a floating-point value specified by the user.

### KNC

Implements learning based on the k nearest neighbors of each query point, and then assigns the query point the label that receives the most votes among those neighbors in the training data set.

A common rule of thumb is k ≈ sqrt(n_samples), where n_samples is the number of training samples.

### RNC

implements learning based on the neighbors within a fixed radius r of the query point, where r is a floating-point value specified by the user.


# 1) KNC


In [9]:
# Compare the three neighbor-search back-ends of KNeighborsClassifier on the
# same train/test split: accuracy on both splits and fit + predict time.
algorithms = ["brute", "kd_tree", "ball_tree"]

# Rule-of-thumb neighborhood size: k ~ sqrt(number of training samples),
# bumped to the next odd number so a two-class majority vote cannot tie.
# It does not depend on the search algorithm, so compute it once.
k = int(math.sqrt(len(x_train)))
if k % 2 == 0:
    k += 1

for algo in algorithms:
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # short intervals; time.time() is wall-clock time and can be adjusted.
    start_time = time.perf_counter()
    model = KNeighborsClassifier(n_neighbors=k, algorithm=algo)
    scaler = StandardScaler()
    KNC = Pipeline([("StandardScaler", scaler), ("KNeighborsClassifier", model)])
    KNC.fit(x_train, y_train)
    prediction = KNC.predict(X=x_test)
    end_time = time.perf_counter()
    # Scoring happens after the timer stops, so it is not part of "time taken".
    print("k = ", k)
    print("algorithm is ", algo)
    print("x_test score = ", KNC.score(X=x_test, y=y_test) * 100)
    print("x_train score = ", KNC.score(X=x_train, y=y_train) * 100)
    print("time taken = ", end_time - start_time)
    print()
    print()

k =  23
algorithm is  brute
x_test score =  74.45887445887446
x_train score =  76.35009310986965
time taken =  0.030486106872558594


k =  23
algorithm is  kd_tree
x_test score =  74.45887445887446
x_train score =  76.35009310986965
time taken =  0.011073827743530273


k =  23
algorithm is  ball_tree
x_test score =  74.45887445887446
x_train score =  76.35009310986965
time taken =  0.0093231201171875




# 2) RNC


In [10]:
# Compare the three neighbor-search back-ends of RadiusNeighborsClassifier on
# the same train/test split: accuracy on both splits and fit + predict time.
algorithms = ["brute", "kd_tree", "ball_tree"]

for algo in algorithms:
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # short intervals; time.time() is wall-clock time and can be adjusted.
    start_time = time.perf_counter()
    # The scaler runs before the classifier, so the radius is measured on the
    # standardized features.
    # NOTE(review): with the default outlier_label, scikit-learn raises if a
    # query point has no training neighbor inside the radius - confirm that
    # radius=4 is large enough for any new data.
    model = RadiusNeighborsClassifier(radius=4, algorithm=algo)
    scaler = StandardScaler()
    # Name the step after the estimator it actually holds.
    RNC = Pipeline([("StandardScaler", scaler), ("RadiusNeighborsClassifier", model)])
    RNC.fit(x_train, y_train)
    prediction = RNC.predict(X=x_test)
    end_time = time.perf_counter()
    # Scoring happens after the timer stops, so it is not part of "time taken".
    print("algorithm is ", algo)
    print("x_test score = ", RNC.score(X=x_test, y=y_test) * 100)
    print("x_train score = ", RNC.score(X=x_train, y=y_train) * 100)
    print("time taken = ", end_time - start_time)
    print()
    print()

algorithm is  brute
x_test score =  72.72727272727273
x_train score =  68.52886405959032
time taken =  0.005442619323730469


algorithm is  kd_tree
x_test score =  72.72727272727273
x_train score =  68.52886405959032
time taken =  0.0059680938720703125


algorithm is  ball_tree
x_test score =  72.72727272727273
x_train score =  68.52886405959032
time taken =  0.0057218074798583984




# Difference between algorithms

### 1) Brute Force

Applies the brute-force technique: for each query point, it computes the distance to every point in the data set and keeps the k nearest neighbors.

time (n = number of features, q = number of query points, m = number of training samples)

    O(n * q * m)

Efficient brute-force neighbors searches can be very competitive for small data samples

### 2) K-D Tree

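Developed to improve on the brute-force approach: the training points are stored in a binary-tree data structure (a K-D tree, a generalization of the binary search tree to k dimensions), and each query searches the tree, which is faster than a linear scan.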

time

    O(n * q * log(m))

The KD tree approach is very fast for low-dimensional (n < 20) neighbor searches,
but it becomes slow as n grows.

### 3) Ball Tree

To address the inefficiencies of KD Trees in higher dimensions, the ball tree data structure was developed. Where KD trees partition data along Cartesian axes, ball trees partition data in a series of nesting hyper-spheres. This makes tree construction more costly than that of the KD tree, but results in a data structure which can be very efficient on highly structured data, even in very high dimensions.

A ball tree recursively divides the data into nodes defined by a centroid C and radius r, such that each point in the node lies within the hyper-sphere defined by r and C . The number of candidate points for a neighbor search is reduced through use of the triangle inequality.

time

    O(n * q * log(m))

but it works efficiently with large number of features

#### In general, choosing the algorithm is important because it controls the execution time


# Neighborhood Components Analysis

NCA is used for classification because it can naturally handle multi-class problems without any increase in the model size, and does not introduce additional parameters that require fine-tuning by the user.

NCA classification works well in practice for data sets of varying size and difficulty.

space complexity

    n_samples^2

time (for each iteration of the optimizer):

    O(n_components x n_samples x min(n_samples, n_features))


# KNC with NCA


In [11]:
# Same KNeighborsClassifier comparison as before, but with a Neighborhood
# Components Analysis step between the scaler and the classifier.
algorithms = ["brute", "kd_tree", "ball_tree"]

# Rule-of-thumb neighborhood size: k ~ sqrt(number of training samples),
# bumped to the next odd number so a two-class majority vote cannot tie.
# It does not depend on the search algorithm, so compute it once.
k = int(math.sqrt(len(x_train)))
if k % 2 == 0:
    k += 1

for algo in algorithms:
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # short intervals; time.time() is wall-clock time and can be adjusted.
    start_time = time.perf_counter()
    model = KNeighborsClassifier(n_neighbors=k, algorithm=algo)
    scaler = StandardScaler()
    NCA = NeighborhoodComponentsAnalysis()
    KNC = Pipeline(
        [
            ("StandardScaler", scaler),
            ("NeighborhoodComponentsAnalysis", NCA),
            ("KNeighborsClassifier", model),
        ]
    )
    # Fitting the pipeline also fits the NCA transform, so the measured time
    # includes the NCA fit and is not a pure neighbor-search timing.
    KNC.fit(x_train, y_train)
    prediction = KNC.predict(X=x_test)
    end_time = time.perf_counter()
    # Scoring happens after the timer stops, so it is not part of "time taken".
    print("k = ", k)
    print("algorithm is ", algo)
    print("x_test score = ", KNC.score(X=x_test, y=y_test) * 100)
    print("x_train score = ", KNC.score(X=x_train, y=y_train) * 100)
    print("time taken = ", end_time - start_time)
    print()
    print()

k =  23
algorithm is  brute
x_test score =  74.45887445887446
x_train score =  77.28119180633148
time taken =  0.9841864109039307


k =  23
algorithm is  kd_tree
x_test score =  74.45887445887446
x_train score =  77.28119180633148
time taken =  0.9245028495788574


k =  23
algorithm is  ball_tree
x_test score =  74.45887445887446
x_train score =  77.28119180633148
time taken =  0.9991514682769775


