In [1]:
import os.path
import re

import pandas as pd
import numpy as np

from scipy.cluster.vq import kmeans
from sklearn.cluster import KMeans

Load the data and pre-process it.

In [2]:
# Read the scraped power-to-weight-ratio table (tab-separated) from disk.
FILEPATH = os.path.abspath("../data/2023-03-25_22hr_29min_ptwr_data.tsv")
df = pd.read_csv(FILEPATH, sep="\t")

# Column names used throughout the notebook.
MAKE = "make"
MODEL = "model"
SPECS = "specs"
SPECS1 = "specs1"

# The specs text ends with "<SEPARATOR><number>".
SEPARATOR = ":"

# Fragments that are stripped out of each text column.
CLEAN_MAKE = re.compile(r"((/make/)|(\-power\-to\-weight\-ratio\-stats))")
CLEAN_MODEL = re.compile(r"(/model/)")
CLEAN_SPECS = re.compile(r"(\\t|\\n)")

# Remove the unwanted fragments, one column at a time.
df[MAKE] = df[MAKE].map(lambda raw: CLEAN_MAKE.sub("", raw))
df[MODEL] = df[MODEL].map(lambda raw: CLEAN_MODEL.sub("", raw))
df[SPECS] = df[SPECS].map(lambda raw: CLEAN_SPECS.sub("", raw))

# Split every specs string once at its LAST separator: the tail becomes the
# numeric SPECS1 column, the head stays behind as the descriptive SPECS text.
_specs_parts = [text.rpartition(SEPARATOR) for text in df[SPECS]]
df[SPECS1] = [float(tail) for _head, _sep, tail in _specs_parts]
df[SPECS] = [head for head, _sep, _tail in _specs_parts]

# Summary statistics followed by the first rows, as plain text.
print(df.describe(), df.head(), sep="\n")

             specs1
count  43219.000000
mean       0.064918
std        0.029648
min        0.017000
25%        0.050000
50%        0.059000
75%        0.070000
max        0.636000
    make     model                      specs  specs1
0  acura  acura-cl         1997 Acura CL 2.2    0.047
1  acura  acura-cl         1998 Acura CL 2.3    0.049
2  acura  acura-cl  1998 Acura CL 2.3 Premium   0.049
3  acura  acura-cl         1999 Acura CL 2.3    0.049
4  acura  acura-cl         1997 Acura CL 3.0    0.062


# Attempt 1: Mean

Use the mean power-to-weight ratio as a discriminant: each car is classified by whether its ratio is above or below the mean.

In [3]:
# Arithmetic mean of the numeric specs1 column, used as the threshold value.
mean = df[SPECS1].mean()
print(mean)

0.06491797589023347


# Attempt 2: Clustering

## K-Means Clustering

We cluster with k = 2 because we want exactly two groups: 'high power-to-weight ratio' and 'low power-to-weight ratio'.

In [4]:
# Number of clusters requested from the k-means implementations in this notebook.
K = 2

### Scipy

In [5]:
# With an integer k, scipy's kmeans draws its initial centroids at random from
# the observations, so an unseeded call can return different centroids (and in
# a different order) on every execution. Pinning the seed keeps the printed
# result reproducible under Restart & Run All.
# `distortion` is the second value returned by kmeans; it is not used below.
scipy_centroids, distortion = kmeans(df[SPECS1], K, seed=0)
print(scipy_centroids)

[0.10406804 0.0549868 ]


### Sklearn (kmeans++)

In [6]:
# Reshape the single numeric column into a 2-D array with one feature per row,
# which is the input layout the estimator is fitted on.
X = df[SPECS1].to_numpy().reshape(-1, 1)

# Centroid initialisation is randomised; fixing random_state makes the fitted
# centroids (and their order) reproducible from run to run.
cf = KMeans(n_clusters=K, n_init='auto', random_state=0).fit(X)
sklearn_centroids = cf.cluster_centers_
print(sklearn_centroids)

[[0.05921039]
 [0.14489163]]
