In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
dataset =  pd.read_csv("bill_authentication.csv")
#Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization,
#an industrial camera usually used for print inspection was used. The final images have 400x 400 pixels. 
#Due to the object lens and distance to the investigated object gray-scale pictures
#with a resolution of about 660 dpi were gained. Wavelet Transform tool were used to extract features from images.

In [8]:
dataset.head(1000)  # we would like to perform a classification according to the variance, skewness, curtosis and entropy
# we will determine if the banknote can be authenticated or not!


Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.62160,8.66610,-2.807300,-0.446990,0
1,4.54590,8.16740,-2.458600,-1.462100,0
2,3.86600,-2.63830,1.924200,0.106450,0
3,3.45660,9.52280,-4.011200,-3.594400,0
4,0.32924,-4.45520,4.571800,-0.988800,0
5,4.36840,9.67180,-3.960600,-3.162500,0
6,3.59120,3.01290,0.728880,0.564210,0
7,2.09220,-6.81000,8.463600,-0.602160,0
8,3.20320,5.75880,-0.753450,-0.612510,0
9,1.53560,9.17720,-2.271800,-0.735350,0


In [9]:
dataset.shape

(1372, 5)

In [10]:
# Number of missing values in each column of training data
missing_val_count_by_column = (dataset.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])  

Variance    1
Skewness    6
Curtosis    8
Entropy     4
dtype: int64


In [11]:
from sklearn.impute import SimpleImputer  # Imputer is for missing values

# Fill in the lines below: imputation
my_imputer = SimpleImputer()
datasetClean = pd.DataFrame(my_imputer.fit_transform(dataset))
# Fill in the lines below: imputation removed column names; put them back
datasetClean.columns = dataset.columns
# In general you can use this imputer without worrying about the missing values.

In [12]:
# Number of missing values in each column of training data
missing_val_count_by_column = (datasetClean.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0]) 

Series([], dtype: int64)


In [13]:
cor = datasetClean.corr()   # correlation calculation, we are trying to eliminate non-relevant columns from the dataset!
cor_target = abs(cor["Class"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.0001]  # change the parameter, to see if you can get more 
#columns related to "AveragePrice" . In our case "type" columns looks not relevant therefore we are going to drop it!
relevant_features.index
# but what if we dont drop? we cannot guarantee that high correlation means good results!

Index(['Variance', 'Skewness', 'Curtosis', 'Entropy', 'Class'], dtype='object')

In [9]:
#datasetClean.drop('Entropy', axis=1, inplace=True)  #### according to the result of correlation we will drop this column. 
# What if it has still effect on the result despite of the correlation?
#Examine the training with and without this "type" column!

In [10]:
X = datasetClean.drop('Class', axis=1)
y = datasetClean['Class']

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [12]:
from sklearn.svm import SVC  # support vector machine - linear
svclassifier = SVC(kernel='linear')
fitted = svclassifier.fit(X_train, y_train)

In [13]:
y_pred = svclassifier.predict(X_test)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[148   4]
 [  0 123]]
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99       152
         1.0       0.97      1.00      0.98       123

    accuracy                           0.99       275
   macro avg       0.98      0.99      0.99       275
weighted avg       0.99      0.99      0.99       275



In [15]:
from sklearn.metrics import accuracy_score
print('SVC:',accuracy_score(y_test, y_pred)*100,'%')

SVC: 98.54545454545455 %


In [16]:
# Try to predict the price, you can choose any row from data set and check if the prediction "0" or "1" is correct!
test =datasetClean.loc[[1000]]   # take Nth row to check your prediction result manually 
test = test.drop('Class', axis=1)  # drop target "class" column
prediction= fitted.predict(test)
prediction

array([1.])

In [17]:
#now here is your new perspective, you cant always rely on numbers yea?
# here is how you can really predict your ML algorithm. you show it a NEW data and it gives you a result!
#dont forget about this part of code, you will need to check your final predictions in your career(dont worry about exam)

data = [[1.364,2.1539,2.457,2.99532]]   # you can also create a dataframe by yourself and feed it into SVM prediction
# imagine you have a banknote which has values as above. See if it can pass the authentication or not
# 1 for the pass. 0 for not-pass
  
# Create the pandas DataFrame 
df = pd.DataFrame(data, columns = ['Variance', 'Skewness','Curtosis','Entropy']) 
df

Unnamed: 0,Variance,Skewness,Curtosis,Entropy
0,1.364,2.1539,2.457,2.99532


In [18]:
prediction= fitted.predict(df)  # you can adjust the values and see what will your model generate.
# this is a binary classification so result will be either 1 or 0 as yes or no.
prediction

array([0.])