In [None]:
#import library used

import numpy as np
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw listings; the path is relative to the current working directory.
df = pd.read_csv('used_cars.csv')

### Data Exploration

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame
df.shape

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# List the numeric columns through the public API instead of the private
# DataFrame._get_numeric_data(), which may change between pandas releases.
df.select_dtypes(include='number').columns

In [None]:
# Correlation matrix of the numeric columns.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# string columns in pandas >= 2.0.
corrmat = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corrmat, annot=True, square=True, ax=ax);

In [None]:
# Missing values per column: absolute count and fraction of all rows,
# both sorted from most to least affected.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing_data

In [None]:
# Check for outliers with one boxplot per numeric column.
# The figure size is passed to plt.subplots directly: a separate
# plt.figure(figsize=...) call only creates an additional, empty figure.
boxplot_cols = ["price", "odometer", "year", "lat", "long"]
f, axes = plt.subplots(1, len(boxplot_cols), figsize=(15, 5))
for col, ax in zip(boxplot_cols, axes):
    sns.boxplot(y=df[col], ax=ax)
plt.subplots_adjust(wspace=1)

#### Number of occurrences of each value in the categorical columns

In [None]:
# Number of listings per 'condition' value.
sns.countplot(data=df, x='condition')

In [None]:
# Number of listings per 'fuel' value.
sns.countplot(data=df, x='fuel')

In [None]:
# Number of listings per 'cylinders' value.
sns.countplot(data=df, x='cylinders')

In [None]:
# Number of listings per 'type' value.
sns.countplot(data=df, x='type')

In [None]:
# Number of listings per 'title_status' value.
sns.countplot(data=df, x='title_status')

In [None]:
# Number of listings per 'transmission' value.
sns.countplot(data=df, x='transmission')

In [None]:
# Number of listings per 'drive' value.
sns.countplot(data=df, x='drive')

In [None]:
# Number of listings per 'size' value.
sns.countplot(data=df, x='size')

In [None]:
# Number of listings per 'state' value.
sns.countplot(data=df, x='state')

In [None]:
# Number of listings per 'region' value.
sns.countplot(data=df, x='region')

In [None]:
# Number of listings per 'paint_color' value.
sns.countplot(data=df, x='paint_color')

#### Check data distribution

In [None]:
# Histogram of 'year' (missing values dropped).
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
sns.histplot(df['year'].dropna(), bins=10);

In [None]:
# Histogram of 'odometer' (missing values dropped).
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
sns.histplot(df['odometer'].dropna(), bins=10);

In [None]:
# Histogram of 'price' (missing values dropped).
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
sns.histplot(df['price'].dropna(), bins=10);

In [None]:
# Relationship between odometer and price.
# sns.pairplot builds its own figure, so no plt.figure(...) call precedes it;
# such a call would only leave an empty 100x100 inch figure behind.
df_nona = df[df['odometer'].notnull()]
g = sns.pairplot(df_nona[['price', 'odometer']])

In [None]:
# Relationship between price and year.
# sns.pairplot builds its own figure, so no plt.figure(...) call precedes it;
# such a call would only leave an empty 100x100 inch figure behind.
df_nona = df[df['year'].notnull()]
g = sns.pairplot(df_nona[['price', 'year']])

### Data Preparation

In [None]:
# 'county' has no values and the 'Unnamed: 0' column is not needed.
df = df.drop(['Unnamed: 0', 'county'], axis=1)

In [None]:
# Columns remaining after the drop above.
df.columns

#### Dealing with Missing Values

In [None]:
# Treat zeros as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(0, np.nan)

# Fill missing numeric values with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on
# df[col]: that is chained assignment, which warns in pandas 2.x and does not
# modify df once Copy-on-Write is enabled.
mean_filled_cols = ['odometer', 'price', 'year', 'lat', 'long']
df[mean_filled_cols] = df[mean_filled_cols].fillna(df[mean_filled_cols].mean())

In [None]:
# Fill missing 'cylinders' values with 'other'.
# Assigned back rather than fillna(inplace=True) on the column, which is
# chained assignment and does not modify df under pandas Copy-on-Write.
df['cylinders'] = df['cylinders'].fillna('other')

In [None]:
# Remaining missing values in 'cylinders'.
df['cylinders'].isna().sum()

In [None]:
# Fill missing 'fuel' values with 'other'.
# Assigned back rather than fillna(inplace=True) on the column, which is
# chained assignment and does not modify df under pandas Copy-on-Write.
df['fuel'] = df['fuel'].fillna('other')

In [None]:
# Remaining missing values in 'fuel'.
df['fuel'].isna().sum()

In [None]:
# Fill missing 'title_status' values with the most frequent value.
# Assigned back rather than fillna(inplace=True) on the column, which is
# chained assignment and does not modify df under pandas Copy-on-Write.
most_common_title_status = df['title_status'].value_counts().idxmax()
df['title_status'] = df['title_status'].fillna(most_common_title_status)

In [None]:
# Remaining missing values in 'title_status'.
df['title_status'].isna().sum()

In [None]:
# Fill missing 'transmission' values with 'other'.
# Assigned back rather than fillna(inplace=True) on the column, which is
# chained assignment and does not modify df under pandas Copy-on-Write.
df['transmission'] = df['transmission'].fillna('other')

In [None]:
# Remaining missing values in 'transmission'.
df['transmission'].isna().sum()

In [None]:
# Fill missing 'drive' values with 'other'.
# Assigned back rather than fillna(inplace=True) on the column, which is
# chained assignment and does not modify df under pandas Copy-on-Write.
df['drive'] = df['drive'].fillna('other')

In [None]:
# Fill missing 'type' values with 'other'.
# Assigned back rather than fillna(inplace=True) on the column, which is
# chained assignment and does not modify df under pandas Copy-on-Write.
df['type'] = df['type'].fillna('other')

In [None]:
# Drop the columns that are not used in the rest of the notebook.
unused_cols = ['vin', 'paint_color', 'size', 'id', 'url', 'region_url', 'image_url', 'description']
df = df.drop(unused_cols, axis=1)

In [None]:
# Remove every row that still contains a missing value.
df = df.dropna()

#### Dealing with Outliers

In [None]:
from scipy import stats

# Keep only the rows whose numeric features all have an absolute z-score below 3.
z = np.abs(stats.zscore(df[['odometer','price','year','lat','long']]))
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df = df[(z < 3).all(axis=1)].copy()

In [None]:
# (rows, columns) left after the outlier filter.
df.shape

### Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column with its own freshly created LabelEncoder.
for column in ('cylinders', 'title_status', 'manufacturer', 'model', 'region'):
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])

In [None]:
# Integer-encode 'condition': fit the encoder first, then apply it.
labelencoder = LabelEncoder().fit(df['condition'])
df['condition'] = labelencoder.transform(df['condition'])

In [None]:
# One-hot encode the remaining categorical columns and append the indicator
# columns to df (the original columns are kept).
categorical_cols = ['drive', 'fuel', 'state', 'transmission', 'type']
enc = pd.get_dummies(df[categorical_cols])
df = df.join(enc)
df.head()

In [None]:
# Rescale the numeric and label-encoded columns with MinMaxScaler.
scaled_cols = ['price', 'year', 'odometer', 'cylinders', 'title_status',
               'manufacturer', 'model', 'region', 'lat', 'long']
sc = MinMaxScaler()
df[scaled_cols] = sc.fit_transform(df[scaled_cols])
df.head()

In [None]:
# Columns of the prepared frame, including the one-hot indicator columns.
df.columns

In [None]:
# Persist the prepared dataset used for clustering.
# NOTE(review): the index is written as well, so reloading this file yields an
# extra unnamed first column - confirm whether index=False is wanted.
df.to_csv('used_car_clustering.csv')

### Clustering

In [None]:
#calculate euclidian distance
def euclidian(u, v):
    """Return the Euclidean distance between the coordinate sequences u and v.

    Coordinates are paired with zip, so any surplus coordinates of the longer
    sequence are ignored.
    """
    squared_total = 0
    for a, b in zip(u, v):
        squared_total += (a - b) ** 2
    return squared_total ** 0.5

In [None]:
def kmeans(n_neighbour, n_feat, centroids, data=None, max_iter=None):
    """Run k-means and write the cluster label of every row in place.

    Parameters
    ----------
    n_neighbour : int
        Number of clusters (k).
    n_feat : int
        Number of leading values of each row used as features. The label,
        1-based, is written to position ``n_feat`` of the row.
    centroids : sequence of sequences
        Initial centroids; the first ``n_neighbour`` entries are used, each
        with ``n_feat`` coordinates. The argument itself is not modified.
    data : list of mutable rows, optional
        Rows to cluster. When omitted, the notebook-level variable ``X`` is
        used, which keeps existing calls ``kmeans(k, n_feat, centroids)``
        working unchanged.
    max_iter : int, optional
        Upper bound on the number of iterations. ``None`` (default) iterates
        until the centroids stop changing.

    Returns
    -------
    list of list of float
        The final centroids, one per cluster.

    Notes
    -----
    A cluster that receives no rows keeps its previous centroid. Previously
    this case (for example two identical initial centroids) raised KeyError.
    """
    rows = X if data is None else data
    centroids = [list(c) for c in centroids[:n_neighbour]]
    if len(rows) == 0:
        # Nothing to assign: the initial centroids are already final.
        return centroids

    # Feature matrix, built once; only the label slot of each row changes below.
    points = np.asarray([row[:n_feat] for row in rows], dtype=float)

    n_iter = 0
    while True:
        centers = np.asarray(centroids, dtype=float)
        # Squared distance from every point to every centroid; the square root
        # is skipped because it does not change which centroid is nearest.
        sq_dist = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
        nearest = sq_dist.argmin(axis=1)

        # Store the 1-based cluster label next to the features of each row.
        for row, idx in zip(rows, nearest):
            row[n_feat] = int(idx) + 1

        # Move each centroid to the mean of its members.
        new_centroids = []
        for c in range(n_neighbour):
            members = points[nearest == c]
            if len(members):
                new_centroids.append(members.mean(axis=0).tolist())
            else:
                # Empty cluster: keep the old centroid instead of failing.
                new_centroids.append(centroids[c])

        n_iter += 1
        converged = new_centroids == centroids
        if converged or (max_iter is not None and n_iter >= max_iter):
            return new_centroids

        centroids = new_centroids

### Experiment 1

In [None]:
kn = [2,3,4,5,7]
sil = []
# Fixed seed so the initial centroids, and therefore the scores, are reproducible.
rng = np.random.default_rng(42)

# Silhouette score of the k-means clustering on (manufacturer, price) for each k.
for k in kn:
    # assign() returns a new frame; writing X['cluster'] into a slice of df
    # triggers SettingWithCopyWarning.
    X = df[['manufacturer','price']].assign(cluster=0)
    X = X.values.tolist()
    n_feat = 2
    # Pick k distinct rows as initial centroids. The previous
    # np.random.randint(0, len(X)-1) could never select the last row and could
    # draw the same row twice.
    centroids = [X[i][:n_feat] for i in rng.choice(len(X), size=k, replace=False)]
    km = kmeans(k, n_feat, centroids)
    xy = pd.DataFrame(data=X)
    # Named sil_score so the fitted scaler stored in `sc` is not overwritten.
    sil_score = silhouette_score(xy.iloc[:,[0,1]], xy[2], metric = 'euclidean')
    sil.append(sil_score)

In [None]:
# Silhouette score as a function of k, drawn on an explicitly created axes.
fig, ax = plt.subplots()
ax.plot(kn, sil, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Silhoutte score')
ax.set_title('Silhoutte score for each K')
plt.show()

In [None]:
# Cluster (manufacturer, price) with k = 4.
# assign() returns a new frame; writing X['cluster'] into a slice of df
# triggers SettingWithCopyWarning.
X = df[['manufacturer','price']].assign(cluster=0)
X = X.values.tolist()
k = 4
n_feat = 2
# Fixed seed so the initial centroids are reproducible.
rng = np.random.default_rng(42)
# Pick k distinct rows as initial centroids. The previous
# np.random.randint(0, len(X)-1) could never select the last row and could
# draw the same row twice.
centroids = [X[i][:n_feat] for i in rng.choice(len(X), size=k, replace=False)]
km = kmeans(k, n_feat, centroids)

In [None]:
# Scatter plot of the clustered points, coloured by cluster label.
X = np.array(X)
xy = pd.DataFrame(data=X)
output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1, )
# Plot the final centroids returned by kmeans(). `centroids` still holds the
# random initial picks, because kmeans() never modifies the list it is given.
centers = np.array(km)
plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker='o');
plt.title('Hasil Klustering K-Means')
plt.colorbar (output)
plt.show()

In [None]:
# Silhouette score of the final clustering.
# The first n_feat columns are the features; [0, n_feat-1] selected all of
# them only when n_feat == 2.
score = silhouette_score(xy.iloc[:, :n_feat], xy[n_feat], metric = 'euclidean')
score

### Experiment 2

In [None]:
kn = [2,3,4,5,7]
sil = []
# Fixed seed so the initial centroids, and therefore the scores, are reproducible.
rng = np.random.default_rng(42)

# Silhouette score of the k-means clustering on (lat, long) for each k.
for k in kn:
    # assign() returns a new frame; writing X['cluster'] into a slice of df
    # triggers SettingWithCopyWarning.
    X = df[['lat','long']].assign(cluster=0)
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X = X.tolist()
    n_feat = 2
    # Pick k distinct rows as initial centroids. The previous
    # np.random.randint(0, len(X)-1) could never select the last row and could
    # draw the same row twice.
    centroids = [X[i][:n_feat] for i in rng.choice(len(X), size=k, replace=False)]
    km = kmeans(k, n_feat, centroids)
    xy = pd.DataFrame(data=X)
    # Named sil_score so the fitted scaler stored in `sc` is not overwritten.
    sil_score = silhouette_score(xy.iloc[:,[0,1]], xy[2], metric = 'euclidean')
    sil.append(sil_score)

In [None]:
# Silhouette score as a function of k, drawn on an explicitly created axes.
fig, ax = plt.subplots()
ax.plot(kn, sil, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Silhoutte score')
ax.set_title('Silhoutte score for each K')
plt.show()

In [None]:
# Cluster (lat, long) with k = 2.
# assign() returns a new frame; writing X['cluster'] into a slice of df
# triggers SettingWithCopyWarning. The former mid-cell X.head() is removed:
# it was not the last expression of the cell, so it displayed nothing.
X = df[['lat','long']].assign(cluster=0)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = X.tolist()
k = 2
n_feat = 2
# Fixed seed so the initial centroids are reproducible.
rng = np.random.default_rng(42)
# Pick k distinct rows as initial centroids. The previous
# np.random.randint(0, len(X)-1) could never select the last row and could
# draw the same row twice.
centroids = [X[i][:n_feat] for i in rng.choice(len(X), size=k, replace=False)]
km = kmeans(k, n_feat, centroids)

In [None]:
# Scatter plot of the clustered points, coloured by cluster label.
X = np.array(X)
xy = pd.DataFrame(data=X)
output = plt.scatter(X[:,0], X[:,1], s = 100, c = xy[2], marker = 'o', alpha = 1, )
# Plot the final centroids returned by kmeans(). `centroids` still holds the
# random initial picks, because kmeans() never modifies the list it is given.
centers = np.array(km)
plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker='o');
plt.title('Hasil Klustering K-Means')
plt.colorbar (output)
plt.show()

In [None]:
# Silhouette score of the final clustering.
# The first n_feat columns are the features; [0, n_feat-1] selected all of
# them only when n_feat == 2.
score = silhouette_score(xy.iloc[:, :n_feat], xy[n_feat], metric = 'euclidean')
score