In [1]:
import os.path
import re

import pandas as pd
import numpy as np
import seaborn as sns

from scipy.cluster.vq import kmeans, kmeans2
from sklearn.cluster import KMeans

Load the data and pre-process it.

In [2]:
FILEPATH = os.path.abspath("../data/2023-03-25_22hr_29min_ptwr_data.tsv")
df = pd.read_csv(FILEPATH, sep="\t")

MAKE = "make"
MODEL = "model"
SPECS = "specs"

CLEAN_MAKE = re.compile(r"((/make/)|(\-power\-to\-weight\-ratio\-stats))")
CLEAN_MODEL = re.compile(r"(/model/)")
CLEAN_SPECS = re.compile(r"(\\t|\\n)")

df[MAKE] = df[MAKE].apply(lambda x: CLEAN_MAKE.sub("", x))
df[MODEL] = df[MODEL].apply(lambda x: CLEAN_MODEL.sub("", x))
df[SPECS] = df[SPECS].apply(lambda x: CLEAN_SPECS.sub("", x))

SEPARATOR = ":"
SPECS1 = "specs1"
df[SPECS1] = df[SPECS].apply(lambda x: float(x.split(SEPARATOR)[-1]))
df[SPECS] = df[SPECS].apply(lambda x: SEPARATOR.join((x.split(SEPARATOR)[:-1])))

print(df.describe())
print(df.head())

             specs1
count  43219.000000
mean       0.064918
std        0.029648
min        0.017000
25%        0.050000
50%        0.059000
75%        0.070000
max        0.636000
    make     model                      specs  specs1
0  acura  acura-cl         1997 Acura CL 2.2    0.047
1  acura  acura-cl         1998 Acura CL 2.3    0.049
2  acura  acura-cl  1998 Acura CL 2.3 Premium   0.049
3  acura  acura-cl         1999 Acura CL 2.3    0.049
4  acura  acura-cl         1997 Acura CL 3.0    0.062


# Attempt 1: Mean

Create a discriminant based upon being above or below the mean.

In [3]:
mean = np.mean(df[SPECS1])
print(mean)

0.06491797589023347


# Attemp 2: Median

Create a discriminant based upon being above or below the median.

In [4]:
median = np.median(df[SPECS1])
print(median)

0.059


# Attempt 3: Clustering

## K-Means Clustering

Clustering with k=2 because we want 'high power to weight ratio' vs 'low power to weight ratio', just those two.
Create a discriminant based upon which centroid a datum is closer to.

In [5]:
K = 2

### Scipy kmeans

In [6]:
kmeans_centroids, distortion = kmeans(df[SPECS1], K)
print(kmeans_centroids)

[0.10053996 0.05440708]


### Scipy kmeans2

In [7]:
kmeans2_centroids, distortion2 = kmeans2(df[SPECS1], K)
print(kmeans2_centroids)

[0.14092449 0.05891207]


### Sklearn KMeans

In [8]:
X = df[SPECS1].to_numpy().reshape(-1,1)
cf = KMeans(n_clusters=K, n_init='auto').fit(X)
sklearn_centroids = cf.cluster_centers_
print(sklearn_centroids)

[[0.05891207]
 [0.14092449]]


# Discussion: Attempts 1-3

Now that I have some models I can start to classify the data and compare the results. A classification of '1' means 'high power-to-weight ratio' and a classification of '0' means 'low power-to-weight ratio'.

# Attempt 1 Classification: Mean Discriminator

In [9]:
mean_discriminator = df.copy(deep=True)
mean_discriminator[SPECS1].apply(lambda x: 1 if x > mean else 0)
print()  # silence output from line above




# Attempt 2 Classification: Median Discriminator

In [10]:
median_discriminator = df.copy(deep=True)
median_discriminator[SPECS1].apply(lambda x: 1 if x > median else 0)
print()  # silence output from line above




# Attempt 3 Classification: K-Means Clustering Discriminator

### Scipy kmeans

In [11]:
kmeans_discriminator = df.copy(deep=True)
kmeans_discriminator[SPECS1].apply(lambda x: 0 if np.abs(x-kmeans_centroids[0]) < np.abs(x-kmeans_centroids[1]) else 1)  # 0th index represents low power-to-weight, 1st index represents high power-to-weight
print()  # silence output from line above

0        1
1        1
2        1
3        1
4        1
        ..
43214    0
43215    0
43216    0
43217    0
43218    0
Name: specs1, Length: 43219, dtype: int64

### Scipy kmeans2

In [12]:
kmeans2_discriminator = df.copy(deep=True)
kmeans2_discriminator[SPECS1].apply(lambda x: 0 if np.abs(x-kmeans2_centroids[1]) < np.abs(x-kmeans2_centroids[0]) else 1)  # 0th index represents high power-to-weight, 1st index represents low power-to-weight
print()  # silence output from line above

0        0
1        0
2        0
3        0
4        0
        ..
43214    0
43215    0
43216    0
43217    0
43218    0
Name: specs1, Length: 43219, dtype: int64

### Sklearn KMeans

In [16]:
sklearn_discriminator = df.copy(deep=True)
sklearn_discriminator[SPECS1] = cf.predict(sklearn_discriminator[SPECS1].array.reshape(-1,1))
print()  # silence output from line above