# Preprocessing

In [1]:
# Dataset used is available at https://www.kaggle.com/rakeshrau/social-network-ads

In [2]:
# import library
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics

In [3]:
# Read the Social Network Ads CSV from the working directory into a DataFrame,
# then show it as the cell output
dataframe = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')
dataframe

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
# Input features: the Age and EstimatedSalary columns as a 2-D array
X = dataframe.loc[:, ['Age', 'EstimatedSalary']].values

# Target: the Purchased column as a 1-D array
y = dataframe.loc[:, 'Purchased'].values

In [5]:
# Hold out 30% of the rows as the test set; the remaining 70% is the training set.
# random_state is fixed so the split is the same on every run.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [6]:
# Feature scaling: the scaler is fitted on the training set only, and the same
# fitted scaler is then applied to both the training and the test features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# SVM Model

In [7]:
# Train a support vector classifier on the training set
from sklearn.svm import SVC
SVM_classifier = SVC(random_state=42).fit(X_train, y_train)
SVM_classifier

SVC(random_state=42)

In [8]:
# Predicted class labels for the test set
y_pred = SVM_classifier.predict(X=X_test)

In [9]:
# Confusion matrix of the SVM predictions on the test set
# (rows are the true classes, columns are the predicted classes)
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
# Store the result under its own name: assigning it back to `confusion_matrix`
# would replace the imported function with an array, so any later call to
# confusion_matrix(...) would fail with "'numpy.ndarray' object is not callable".
svm_confusion_matrix = confusion_matrix(y_test, y_pred)
svm_confusion_matrix

array([[72,  7],
       [ 4, 37]], dtype=int64)

In [10]:
# Class balance of the target: fraction of rows for each Purchased label
dataframe.loc[:, 'Purchased'].value_counts(normalize=True)

0    0.6425
1    0.3575
Name: Purchased, dtype: float64

- The target is about 64% label 0 and 36% label 1.
- Hence it is safe to assume that the data is not unbalanced.

# Hence accuracy is selected as the evaluation metric

In [11]:
# Accuracy is the selected metric: it tells us how accurate our model is on the test set
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9083333333333333

In [12]:
# Report the test accuracy rounded to 3 decimal places
svm_accuracy = round(accuracy_score(y_test, y_pred), 3)

print('SVM Model Accuracy is : ', svm_accuracy)

SVM Model Accuracy is :  0.908


# Unsupervised learning - KMeans clustering

In [13]:
# Configure a 2-cluster KMeans model with a fixed seed (not fitted yet)
clustering = KMeans(
    n_clusters=2,
    random_state=42,
)

In [14]:
# Fit the clusters on the training features only (the labels are not passed in)
clustering.fit(X=X_train)

KMeans(n_clusters=2, random_state=42)

# The Fowlkes-Mallows score is used for evaluation
- https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation

In [15]:
# Cluster assignment predicted for each sample of the test set
labels_pred = clustering.predict(X=X_test)

In [16]:
# The true Purchased labels of the test set are the reference labelling
# that the predicted clusters are compared against
labels_true = y_test

In [17]:
# The Fowlkes-Mallows score can be used when the true labels of the samples
# are known, and we already know the true labels from our dataset

metrics.fowlkes_mallows_score(labels_true=labels_true, labels_pred=labels_pred)

0.7523217322115527

- A dataset is balanced if the number of positive samples is similar to the number of negative samples; otherwise it is unbalanced.

- We cannot apply the same evaluation metric to an unbalanced dataset. Suppose a dataset contains 5% positive and 95% negative samples: a model that predicts negative for every sample still gets 95% accuracy, which is misleading. Hence we need to use precision and recall instead.
- reference https://www.kdnuggets.com/2017/06/7-techniques-handle-imbalanced-data.html