In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.datasets import load_boston, load_wine
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score



In [42]:
loc = r'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
auto = pd.read_csv(loc, delim_whitespace = True)
auto = auto.replace("?", np.NaN)
auto.head()


Unnamed: 0,18.0,8,307.0,130.0,3504.,12.0,70,1,chevrolet chevelle malibu
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500


In [30]:
auto.columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year",
             "origin", "car_name"]
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500


In [31]:
df_auto = auto[["mpg", "displacement", "horsepower", "weight", "acceleration"]]
df_auto.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration
0,15.0,350.0,165.0,3693.0,11.5
1,18.0,318.0,150.0,3436.0,11.0
2,16.0,304.0,150.0,3433.0,12.0
3,17.0,302.0,140.0,3449.0,10.5
4,15.0,429.0,198.0,4341.0,10.0


In [37]:
impute = SimpleImputer(missing_values = np.NaN, strategy='mean')
impute = impute.fit(df_auto)
df_auto = impute.transform(df_auto)
df_auto = pd.DataFrame(df_auto)
df_auto.columns = ["mpg", "displacement", "horsepower", "weight", "acceleration"]
cls = auto[["origin"]]

In [38]:
model = AgglomerativeClustering(n_clusters=3, linkage='average', affinity='euclidean')
f_model = model.fit(df_auto)
labels = f_model.labels_
origin = []

In [40]:

for idx,row in auto.iterrows():
    origin.append(row["origin"])

origin = np.array(origin)
df_auto["origin"] = origin
df_auto["cluster_id"] = labels
both = zip(labels, origin)
group_by_origin = df_auto[["mpg", "displacement", "horsepower", "weight", "acceleration", "origin"]].groupby(by=["origin"])
group_by_cluster = df_auto[["mpg", "displacement", "horsepower", "weight", "acceleration", "cluster_id"]].groupby(by=["cluster_id"])

In [41]:
print("\nMEAN BY CLASS ORIGIN")
print("\n")
print(group_by_origin.mean())
print("\n")
print("\nMEAN BY CLUSTER ID")
print("\n")
print(group_by_cluster.mean())


MEAN BY CLASS ORIGIN


              mpg  displacement  horsepower       weight  acceleration
origin                                                                
1       20.091935    245.655242  118.768614  3361.358871     15.045968
2       27.891429    109.142857   81.240117  2423.300000     16.787143
3       30.450633    102.708861   79.835443  2221.227848     16.172152



MEAN BY CLUSTER ID


                  mpg  displacement  horsepower       weight  acceleration
cluster_id                                                                
0           27.365414    131.934211   84.298589  2459.511278     16.298120
1           13.889062    358.093750  167.046875  4398.593750     13.025000
2           17.502985    278.567164  124.388060  3626.641791     15.152239


#### We can see that when each function is grouped by Origin and ID, the mean does not provide much insight, but we can see that the mean acceleration between clusters and origin groups, for example, is very similar to each other when compared to the other features, and we can consider it as noise. 

In [43]:
print("VARIANCE BY CLASS ORIGIN")
print(grouped_by_origin.var())
print("\nVARIANCE BY CLUSTER ID")
print(grouped_by_cluster.var())

VARIANCE BY CLASS ORIGIN


NameError: name 'grouped_by_origin' is not defined