In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [2]:
file_path = r"C:\Users\12489\Documents\Data\Social_Network_Ads.csv"

dataset = pd.read_csv(file_path)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [7]:
dataset.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [5]:
X = dataset.iloc[:, [1, 2, 3]].values #input features: Gender, Age and Salary

y = dataset.iloc[:, -1].values #dependent variable: Purchased(0, 1)

In [6]:
print(f"""Shape of the features: {X.shape}
Shape of the target: {y.shape}""")

Shape of the features: (400, 3)
Shape of the target: (400,)


-----------------

## data wrangling

*(given our initial analysis, what needs to be wrangled?*

*...it looks like we just need to encode the gender column)*

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() #instantiate the encoder

X[:,0] = le.fit_transform(X[:,0])

In [9]:
# ensure that it worked

print(X)

[[1 19 19000]
 [1 35 20000]
 [0 26 43000]
 ...
 [0 50 20000]
 [1 36 33000]
 [0 49 36000]]


----------------------

## ready for analysis

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

In [12]:
# check your progress - compare to previous data

X_train

array([[ 1.02532046e+00,  1.92295008e+00,  2.14601566e+00],
       [-9.75304830e-01,  2.02016082e+00,  3.78719297e-01],
       [-9.75304830e-01, -1.38221530e+00, -4.32498705e-01],
       [-9.75304830e-01, -1.18779381e+00, -1.01194013e+00],
       [-9.75304830e-01,  1.92295008e+00, -9.25023920e-01],
       [-9.75304830e-01,  3.67578135e-01,  2.91803083e-01],
       [-9.75304830e-01,  1.73156642e-01,  1.46942725e-01],
       [ 1.02532046e+00,  2.02016082e+00,  1.74040666e+00],
       [-9.75304830e-01,  7.56421121e-01, -8.38107706e-01],
       [-9.75304830e-01,  2.70367388e-01, -2.87638347e-01],
       [ 1.02532046e+00,  3.67578135e-01, -1.71750061e-01],
       [-9.75304830e-01, -1.18475597e-01,  2.20395980e+00],
       [-9.75304830e-01, -1.47942605e+00, -6.35303205e-01],
       [ 1.02532046e+00, -1.28500455e+00, -1.06988428e+00],
       [ 1.02532046e+00, -1.38221530e+00,  4.07691369e-01],
       [-9.75304830e-01, -1.09058306e+00,  7.55356227e-01],
       [ 1.02532046e+00, -1.47942605e+00

----------------------

## **Naive Bayes**

There are three types of Naive Bayes models: Gaussian, Multinomial, and Bernoulli.

Gaussian Naive Bayes – This is a variant of Naive Bayes which supports continuous values and has an assumption that each class is normally distributed. 

Multinomial Naive Bayes – This is another variant which is an event-based model that has features as vectors where sample(feature) represents frequencies with which certain events have occurred.

Bernoulli – This variant is also event-based where features are independent boolean which are in binary form.

In [13]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB() #creating the classifier

classifier.fit(X_train, y_train)

GaussianNB()

In [14]:
y_pred  =  classifier.predict(X_test)

-----------------------

## check for accuracy

In [17]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

In [18]:
print(cm)

[[56  2]
 [ 4 18]]


In [19]:
print(ac)

0.925


In [20]:
# MAKING A SPECIFIC PREDICTION

row = [[1, 20, 18000]]

y_pred  =  classifier.predict(row)

print(y_pred)

[1]


## Naive Bayes with Multiple Labels

In model building part, you can use wine dataset which is a very famous multi-class classification problem. "This dataset is the result of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars." (UC Irvine)

Dataset comprises of 13 features (alcohol, malic_acid, ash, alcalinity_of_ash, magnesium, total_phenols, flavanoids, nonflavanoid_phenols, proanthocyanins, color_intensity, hue, od280/od315_of_diluted_wines, proline) and type of wine cultivar. This data has three type of wine Class_0, Class_1, and Class_3. Here you can build a model to classify the type of wine.

The dataset is available in the scikit-learn library.

In [None]:
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
wine = datasets.load_wine()

In [None]:
# print the names of the 13 features
print("Features: ", wine.feature_names)

# print the label type of wine(class_0, class_1, class_2)
print("Labels: ", wine.target_names)

In [None]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3,random_state=109) # 70% training and 30% test

In [None]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = gnb.predict(X_test)

In [None]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))