# DBSCAN  - Clustering Algorithm

Import packages for analysis of data

In [None]:
import numpy as np 
import pandas as pd

Url for dataset that is to be used

In [None]:
# Location of the CSV file that is read into a pandas DataFrame further down
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

Assign the dataset column names and load the dataset into a pandas dataframe

In [None]:
# Column labels handed to the reader: four measurements followed by the class label
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'Class',
]

# Load the CSV behind `url` into a DataFrame whose columns carry those labels
dataset = pd.read_csv(filepath_or_buffer=url, names=names)



Inspect dataset

In [None]:
# Preview the first five rows of the loaded DataFrame
dataset.head(n=5)

Keep only the feature values for the clustering algorithm, without the class labels

In [None]:
# Feature-only copy: everything except the 'Class' column
data = dataset.drop(columns='Class')

Check data

In [None]:
# Display the DataFrame that remains after dropping the 'Class' column
data

View data only from the first column 

In [None]:
# Select every row of the 'sepal-length' column
data.loc[:, 'sepal-length']

### Plot graphs

Import packages to plot graphs 

In [None]:
import matplotlib.pyplot as plt  

#### Plot graph for Sepal Length and Sepal Width

In [None]:
# Black scatter of sepal length against sepal width on a 12x6 inch canvas
sepal_fig, sepal_ax = plt.subplots(figsize=(12, 6))
sepal_ax.scatter(data['sepal-length'], data['sepal-width'], c='k')
sepal_ax.set_title('Sepal Comparison')
sepal_ax.set_xlabel('Sepal Length')
sepal_ax.set_ylabel('Sepal Width')
plt.show()
# Teaching note: ask the class to add the title and the colour

#### Plot graph for Petal Width and Petal Length 

In [None]:
# Magenta scatter of petal length against petal width on a 12x6 inch canvas
petal_fig, petal_ax = plt.subplots(figsize=(12, 6))
petal_ax.scatter(x=data['petal-length'], y=data['petal-width'], c='m')
petal_ax.set(
    title='Petal Comparison',
    xlabel='Petal Length',
    ylabel='Petal Width',
)
plt.show()


### Standardize the data 
The data is standardized because distance is a critical part of this machine learning algorithm

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the per-column statistics first, then rescale the same data with them
scaler = StandardScaler().fit(data)
Xs = scaler.transform(data)

## DBSCAN machine learning

Import packages to conduct machine learning 

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import accuracy_score

Create the machine learning algorithm with the selected parameters (it is fitted in the next step)

In [None]:
# eps is the neighbourhood radius; min_samples is how many points must fall
# inside that radius for a point to count as a core point
dbscan = DBSCAN(eps=0.5, min_samples=5)


Fit the algorithm and obtain the predicted cluster label of each sample

In [None]:
# Fit on the standardized features, then read the per-sample labels off the fitted model
pred = dbscan.fit(Xs).labels_

First, let's look at the predicted values to see which numeric labels were assigned

In [None]:
# Show the raw label array; DBSCAN uses -1 for points it treats as noise
pred

In [None]:
# Store each row's cluster label in a new 'prediction' column
dataset['prediction'] = pred

In [None]:
# Count how many rows received each cluster label
dataset['prediction'].value_counts()

# Draw graphs for the predictions 

In [None]:
# Rows whose predicted label is 0
dataset.loc[dataset['prediction'] == 0]

View the plot of the actual data:

Separate the data by predicted cluster and by actual `Class` for plotting

In [None]:
# Predictions: one frame per predicted label (1, 0 and -1)
pred_1 = dataset.loc[dataset['prediction'].eq(1)]
pred_0 = dataset.loc[dataset['prediction'].eq(0)]
pred_n1 = dataset.loc[dataset['prediction'].eq(-1)]

In [None]:
# Actual: one frame per value of the 'Class' column
setosa = dataset.query("Class == 'Iris-setosa'")
virginica = dataset.query("Class == 'Iris-virginica'")
versicolor = dataset.query("Class == 'Iris-versicolor'")

### Plot actual and predicted data side by side 
#### Predicted & Actual Sepal length and width

In [None]:
# Two panels side by side: DBSCAN output on the left, true classes on the right
fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(12, 6))

# Predicted sepal length and width.
# DBSCAN labels are cluster ids (with -1 for noise), not species names, so each
# series is labelled with its own cluster id rather than a species name.
ax_pred.scatter(pred_0['sepal-length'], pred_0['sepal-width'], label='cluster 0')
ax_pred.scatter(pred_1['sepal-length'], pred_1['sepal-width'], label='cluster 1')
ax_pred.scatter(pred_n1['sepal-length'], pred_n1['sepal-width'], label='noise (-1)')

ax_pred.legend()
ax_pred.set_title('Predicted Sepal Comparison')
ax_pred.set_xlabel('Sepal Length')
ax_pred.set_ylabel('Sepal Width')

# Actual sepal length and width, one series per class
ax_true.scatter(setosa['sepal-length'], setosa['sepal-width'], label='setosa')
ax_true.scatter(versicolor['sepal-length'], versicolor['sepal-width'], label='versicolor')
ax_true.scatter(virginica['sepal-length'], virginica['sepal-width'], label='virginica')

ax_true.legend()
ax_true.set_title('Actual Sepal Comparison')
ax_true.set_xlabel('Sepal Length')
ax_true.set_ylabel('Sepal Width')

plt.show()

#### Predicted & Actual Petal length and width

In [None]:
# Two panels side by side: DBSCAN output on the left, true classes on the right
fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(12, 6))

# Predicted petal length and width.
# DBSCAN labels are cluster ids (with -1 for noise), not species names, so each
# series is labelled with its own cluster id rather than a species name.
ax_pred.scatter(pred_0['petal-length'], pred_0['petal-width'], label='cluster 0')
ax_pred.scatter(pred_1['petal-length'], pred_1['petal-width'], label='cluster 1')
ax_pred.scatter(pred_n1['petal-length'], pred_n1['petal-width'], label='noise (-1)')

ax_pred.legend()
ax_pred.set_title('Predicted Petal Comparison')
ax_pred.set_xlabel('Petal Length')
ax_pred.set_ylabel('Petal Width')

# Actual petal length and width, one series per class
ax_true.scatter(setosa['petal-length'], setosa['petal-width'], label='setosa')
ax_true.scatter(versicolor['petal-length'], versicolor['petal-width'], label='versicolor')
ax_true.scatter(virginica['petal-length'], virginica['petal-width'], label='virginica')

ax_true.legend()
ax_true.set_title('Actual Petal Comparison')
ax_true.set_xlabel('Petal Length')
ax_true.set_ylabel('Petal Width')

plt.show()

In [None]:
import seaborn as sns

### Exercise 1 
Test each combination ( manual grid search )
- Change the value of eps to 0.1, 1,10 
- Change the value of min_samples to 1,5,10
- Note down the changes that you see in the figure below

In [None]:
#### replace values here 
dbscan = DBSCAN(eps=1, min_samples=5)

# Re-cluster with the parameters above and refresh the stored labels
pred = dbscan.fit_predict(Xs)
dataset['prediction'] = pred

# Create the 1x2 grid once and hand each panel to seaborn explicitly.
# Calling plt.subplot() after a plain plt.subplots() would place new axes on
# top of the unused full-figure axes that plt.subplots() already created.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: points coloured by DBSCAN label
ax[0].set_title('Predicted Petal Comparison')
sns.scatterplot(data=dataset, hue='prediction', x='petal-length', y='petal-width',
                palette='Accent', ax=ax[0])

# Right panel: points coloured by the 'Class' column
ax[1].set_title('Actual Petal Comparison')
sns.scatterplot(data=dataset, hue='Class', x='petal-length', y='petal-width',
                palette='Accent', ax=ax[1])

plt.show()

## Cool Example 

https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/