In [45]:
import os

import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree

In [80]:
# Toy sanity check: three 2-D points lying on the diagonal y = x.
XEXAMPLE = np.array([
    [1, 1],
    [2, 2],
    [3, 3],
])
# metric='l1' selects Manhattan distance (sum of absolute coordinate differences).
treeEXAMPLE = KDTree(XEXAMPLE, metric='l1')
# Ask for the k=2 stored points closest to the single query point (1.25, 1.35).
distEXAMPLE, indEXAMPLE = treeEXAMPLE.query([[1.25, 1.35]], k=2)
print(indEXAMPLE)   # row indices of the 2 nearest points
print(distEXAMPLE)  # their distances, e.g. |1.25-1| + |1.35-1| = 0.6 for row 0

[[0 1]]
[[0.6 1.4]]


In [8]:
# The raw file has no header row, so the column names are supplied here.
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Location of the raw data. Set the CARS_CSV environment variable to point at
# your own copy of the file; the fallback is the original author's local path,
# kept so that existing setups continue to work unchanged.
CARS_CSV = os.environ.get(
    "CARS_CSV", "C:/Users/btspanswick/Desktop/PYproject/data/cars.csv"
)

# Read the CSV; "?" is parsed as NaN.
df = pd.read_csv(CARS_CSV, header=None, names=headers, na_values="?")
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [9]:
# Show each column's dtype; the text-valued columns are reported as `object`.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [14]:
# Numeric columns are kept as they are ...
other_df = df.select_dtypes(include=["float64", "int64"]).copy()
# ... while the text-valued (object) columns are set aside for one-hot encoding.
obj_df = df.select_dtypes(include=["object"]).copy()
obj_df.head()    # evaluated but not rendered: only the cell's last expression is shown
other_df.head()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [37]:
# One-hot encode the text columns: each distinct value becomes its own
# indicator column named <column>_<value>.
dummy_df = pd.get_dummies(data=obj_df)

In [39]:
# Put the indicator columns back next to the numeric columns, matching rows by index.
cars_dummy = other_df.join(dummy_df, how="left")
cars_dummy.head()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,num_cylinders_twelve,num_cylinders_two,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
1,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
2,1,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0,0,0,0,0,0,0,1,0,0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,...,0,0,0,0,0,0,0,1,0,0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,...,0,0,0,0,0,0,0,1,0,0


In [40]:
# Preview the first ten encoded rows rather than dumping the entire frame into
# the notebook output.
cars_dummy.head(10)

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,num_cylinders_twelve,num_cylinders_two,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
1,3,,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
2,1,,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0,0,0,0,0,0,0,1,0,0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
5,2,,99.8,177.3,66.3,53.1,2507,136,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
6,1,158.0,105.8,192.7,71.4,55.7,2844,136,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
7,1,,105.8,192.7,71.4,55.7,2954,136,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
8,1,158.0,105.8,192.7,71.4,55.9,3086,131,3.13,3.40,...,0,0,0,0,0,0,0,1,0,0
9,0,,99.5,178.2,67.9,52.0,3053,131,3.13,3.40,...,0,0,0,0,0,0,0,1,0,0


In [41]:
# Use the car at row position 4 of the encoded frame as the query car.
test_car = cars_dummy.iloc[4, :]

test_car

symboling                    2.00
normalized_losses          164.00
wheel_base                  99.40
length                     176.60
width                       66.40
height                      54.30
curb_weight               2824.00
engine_size                136.00
bore                         3.19
stroke                       3.40
compression_ratio            8.00
horsepower                 115.00
peak_rpm                  5500.00
city_mpg                    18.00
highway_mpg                 22.00
price                    17450.00
make_alfa-romero             0.00
make_audi                    1.00
make_bmw                     0.00
make_chevrolet               0.00
make_dodge                   0.00
make_honda                   0.00
make_isuzu                   0.00
make_jaguar                  0.00
make_mazda                   0.00
make_mercedes-benz           0.00
make_mercury                 0.00
make_mitsubishi              0.00
make_nissan                  0.00
make_peugot   

In [57]:
# NumPy form of the data for KDTree: the full matrix of encoded cars ...
dummy = cars_dummy.values
# ... and the query car reshaped from a 1-D vector into a single-row 2-D array.
car = test_car.values.reshape(1, -1)
# NOTE(review): cars_dummy still contains NaN (e.g. in normalized_losses) at this
# point; confirm how KDTree treats those rows.

dummy   # evaluated but not rendered: only the cell's last expression is shown
car

array([[2.000e+00, 1.640e+02, 9.940e+01, 1.766e+02, 6.640e+01, 5.430e+01,
        2.824e+03, 1.360e+02, 3.190e+00, 3.400e+00, 8.000e+00, 1.150e+02,
        5.500e+03, 1.800e+01, 2.200e+01, 1.745e+04, 0.000e+00, 1.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00,
        1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00]])

In [70]:
# List the metric names KDTree reports as valid for its `metric` argument.
KDTree.valid_metrics

['euclidean',
 'l2',
 'minkowski',
 'p',
 'manhattan',
 'cityblock',
 'l1',
 'chebyshev',
 'infinity']

In [66]:
# 5-nearest-neighbour lookup for the query car under the L1 metric.
treel1 = KDTree(dummy, metric='l1')
distl1, indl1 = treel1.query(car, k=5, return_distance=True)
print(indl1)   # row positions of the 5 closest cars
print(distl1)  # L1 distances to those 5 cars

[[  4   6 136 200 104]]
[[  0.   327.4  823.88 955.94 956.47]]


In [None]:
# Show the original (un-encoded) records of the L1 neighbours. The rows come
# straight from the query result instead of a hand-copied index list, so this
# cell cannot drift out of sync with the search. The tree was built from a
# plain array, so its indices are row positions -- hence .iloc.
print(df.iloc[indl1[0]])

In [71]:
# Same 5-nearest-neighbour lookup, this time with Euclidean distance.
treeE = KDTree(dummy, metric='euclidean')
distE, indE = treeE.query(car, k=5, return_distance=True)
print(indE)   # row positions of the 5 closest cars
print(distE)  # Euclidean distances to those 5 cars

[[  4   6 104 200 136]]
[[  0.         261.53657488 468.08963298 630.49605915 702.02462307]]


In [83]:
# Show the original (un-encoded) records of the Euclidean neighbours. The rows
# come straight from the query result instead of a hand-copied index list, so
# this cell cannot drift out of sync with the search. The tree was built from a
# plain array, so its indices are row positions -- hence .iloc.
print(df.iloc[indE[0]])

     symboling  normalized_losses    make fuel_type aspiration num_doors  \
4            2              164.0    audi       gas        std      four   
6            1              158.0    audi       gas        std      four   
104          3              194.0  nissan       gas        std       two   
200         -1               95.0   volvo       gas        std      four   
136          3              150.0    saab       gas      turbo       two   

    body_style drive_wheels engine_location  wheel_base   ...     engine_size  \
4        sedan          4wd           front        99.4   ...             136   
6        sedan          fwd           front       105.8   ...             136   
104  hatchback          rwd           front        91.3   ...             181   
200      sedan          rwd           front       109.1   ...             141   
136  hatchback          fwd           front        99.1   ...             121   

     fuel_system  bore  stroke compression_ratio horsepo

In [78]:
# 5-nearest-neighbour lookup intended to demonstrate the Manhattan metric.
# NOTE(review): this passes metric='l1', exactly like the treel1 cell, so it
# repeats that search; 'manhattan' also appears in KDTree.valid_metrics and is
# presumably the spelling that was meant here -- confirm.
treeMan = KDTree(dummy, metric='l1')
distMan, indMan = treeMan.query(car, k=5, return_distance=True)
print(indMan)   # row positions of the 5 closest cars
print(distMan)  # L1 distances to those 5 cars

[[  4   6 136 200 104]]
[[  0.   327.4  823.88 955.94 956.47]]


In [79]:
# Show the original (un-encoded) records of the treeMan neighbours. The rows
# come straight from the query result instead of a hand-copied index list, so
# this cell cannot drift out of sync with the search. The tree was built from a
# plain array, so its indices are row positions -- hence .iloc.
print(df.iloc[indMan[0]])


     symboling  normalized_losses    make fuel_type aspiration num_doors  \
4            2              164.0    audi       gas        std      four   
6            1              158.0    audi       gas        std      four   
136          3              150.0    saab       gas      turbo       two   
200         -1               95.0   volvo       gas        std      four   
104          3              194.0  nissan       gas        std       two   

    body_style drive_wheels engine_location  wheel_base   ...     engine_size  \
4        sedan          4wd           front        99.4   ...             136   
6        sedan          fwd           front       105.8   ...             136   
136  hatchback          fwd           front        99.1   ...             121   
200      sedan          rwd           front       109.1   ...             141   
104  hatchback          rwd           front        91.3   ...             181   

     fuel_system  bore  stroke compression_ratio horsepo