In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import joblib


from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler, Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans, DBSCAN
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, jaccard_score, log_loss, mean_squared_error


In [69]:
df = pd.read_csv('/content/drive/MyDrive/Sertifikasi DSBIZ/Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [70]:
df.info()
df.describe()
df.select_dtypes('object').describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


Unnamed: 0,Species
count,150
unique,3
top,Iris-setosa
freq,50


In [71]:
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(df['Species'])
df['Species']=y
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 7.2 KB


In [72]:
df.isna().any()

Id               False
SepalLengthCm    False
SepalWidthCm     False
PetalLengthCm    False
PetalWidthCm     False
Species          False
dtype: bool

In [76]:
scaler = StandardScaler()
SepalLengthCm = scaler.fit_transform(pd.DataFrame(df['SepalLengthCm']))
SepalWidthCm = scaler.fit_transform(pd.DataFrame(df['SepalWidthCm']))
PetalLengthCm = scaler.fit_transform(pd.DataFrame(df['PetalLengthCm']))
PetalWidthCm = scaler.fit_transform(pd.DataFrame(df['PetalWidthCm']))

df['SepalLengthCm'] = SepalLengthCm
df['SepalWidthCm'] = SepalWidthCm
df['PetalLengthCm'] = PetalLengthCm
df['PetalWidthCm'] = PetalWidthCm
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,-0.900681,1.032057,-1.341272,-1.312977,0
1,2,-1.143017,-0.124958,-1.341272,-1.312977,0
2,3,-1.385353,0.337848,-1.398138,-1.312977,0
3,4,-1.506521,0.106445,-1.284407,-1.312977,0
4,5,-1.021849,1.263460,-1.341272,-1.312977,0
...,...,...,...,...,...,...
145,146,1.038005,-0.124958,0.819624,1.447956,2
146,147,0.553333,-1.281972,0.705893,0.922064,2
147,148,0.795669,-0.124958,0.819624,1.053537,2
148,149,0.432165,0.800654,0.933356,1.447956,2


In [77]:
X = df.drop('Species', axis=1)
y = df['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(120, 5) (120,) (30, 5) (30,)


In [84]:
classifiers = [
    KMeans(n_clusters=3, random_state=0),
    DBSCAN(eps=3, min_samples=2),]
model_performance = pd.DataFrame(columns=['Method', 'jaccard', 'mse'])

In [85]:
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.fit_predict(X_test)
    method = str(type(classifier)).split('.')[-1][:-2]
    mse = mean_squared_error(y_test, y_pred)
    akurasi = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    jass = jaccard_score(y_test, y_pred, average='macro')
    # logloss = log_loss(y_test, y_pred,eps=1e-15, normalize=True, sample_weight=None, labels=None)precision_score, recall_score, f1_score
    model_performance = model_performance.append({
        'Method':method,
        'jaccard':jass,
        'akurasi':akurasi,
        'precision': precision,
        # 'logloss':logloss,
        'mse':mse}, ignore_index=True)
model_performance.sort_values([ 'mse', 'jaccard'], ascending=False).reset_index(drop=True)

Unnamed: 0,Method,jaccard,mse,akurasi,precision
0,DBSCAN,0.046914,6.8,0.133333,0.133333
1,KMeans,0.333333,0.666667,0.333333,0.333333


In [53]:
model_performance.sort_values([ 'mse', 'jaccard'], ascending=False).reset_index(drop=True)

Unnamed: 0,Method,jaccard,mse,akurasi,precision
0,DBSCAN,0.046914,6.8,0.133333,0.222222
1,KMeans,0.175439,0.666667,0.333333,0.175439
