In [12]:
import pandas as pd
import numpy as np

df = pd.read_csv('usa_mercedes_benz_prices.csv')

# remove rows with missing values
df = df.dropna()

# remove rows with Not Priced values
df = df[df['price'] != 'Not Priced']

# take only the 2 last columns (review_count and price)
df = df.iloc[:, -2:]
print(df.head())

# normalize the price column
df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)

# normalize the review_count column
df['review_count'] = df['review_count'].str.replace(' reviews', '').str.replace(' review', '').astype(int)

   review_count     price
0        1800.0   $30,900
1        1239.0  $139,999
2        1239.0  $132,999
3         752.0   $58,587
4        1502.0   $95,990


AttributeError: Can only use .str accessor with string values!

In [None]:
# normalisation par la moyenne et l'écart type
df['price'] = (df['price'] - df['price'].mean()) / df['price'].std()
df['review_count'] = (df['review_count'] - df['review_count'].mean()) / df['review_count'].std()

print(df.head())

# save the cleaned data
df.to_csv('usa_mercedes_benz_prices_cleaned.csv', index=False)

In [None]:
# creation de l'algorithme kmeans classique
def kmeans(X, k, max_iters=100):
    # on choisit k points aléatoires dans l'ensemble des données
    centers = X[np.random.choice(range(X.shape[0]), k, replace=False)]
    for _ in range(max_iters):
        # on calcule la distance entre chaque point et chaque centre
        distances = np.array([np.linalg.norm(X - center, axis=1) for center in centers])
        # on affecte chaque point au cluster du centre le plus proche
        clusters = np.argmin(distances, axis=0)
        # 3: Calculer les nouveaux centres des clusters
        new_centers = np.array([X[clusters == i].mean(axis=0) for i in range(k)])
        # 4: Répéter 3 et 5 jusqu’à ce que les objets se stabilisent dans leurs clusters
        if np.all(centers == new_centers):
            break
        centers = new_centers
    return clusters

In [None]:
# on charge les données nettoyées
df = pd.read_csv('usa_mercedes_benz_prices_cleaned.csv')
X = df.values

# on applique l'algorithme kmeans
clusters = kmeans(X, 3)
print(clusters)