In [22]:
import pandas as pd


from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

In [23]:
# Loading Dataset from our csv file
df = pd.read_csv('PII_Detection_With_Classifier.csv')
print('Dataframe shape: ', df.shape)
type(df['Data'])

Dataframe shape:  (100, 2)


pandas.core.series.Series

In [24]:
df.head()

Unnamed: 0,Data,PII_STATUS
0,128-075-9247,1
1,188-319-3089,1
2,Columbia,0
3,299-291-7118,1
4,245-091-9288,1


In [25]:
print(df)

            Data  PII_STATUS
0   128-075-9247           1
1   188-319-3089           1
2       Columbia           0
3   299-291-7118           1
4   245-091-9288           1
..           ...         ...
95  129-506-9728           1
96  548-789-7429           1
97  458-031-1446           1
98  568-420-9605           1
99  608-680-7676           1

[100 rows x 2 columns]


In [26]:
# One-Hot Encoding 
df['Data'] = df['Data'].astype('category')
df = pd.get_dummies(df, columns=['Data'])

In [27]:
# X = features & y = Target class
X = df.drop(['PII_STATUS'], axis=1)
y = df['PII_STATUS']

In [28]:
# Splitting dataset into training and testing split with 75-25% ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [29]:
# K-fold splits ( K = 10 here )
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)

In [30]:
# Building our model with K-fold( K=10 here) cross validation and GridSearch to find the best hyper-parameters

# Defining all the parameters
params = {
    'C': [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 10], 
    'gamma' : [0.001,0.001, 0.01, 0.1, 0.5, 1, 2, 3, 5, 10]
}

# Building SVC model
svc = SVC(kernel='rbf', probability=True) 

# Hyper-Parameter optimisation using GridSearch
grid = GridSearchCV(svc, param_grid=params, scoring='accuracy', n_jobs =-1, cv=cv, verbose=1)

# Fitting the model
grid.fit(X_train, y_train)

# Fitting the model
grid.fit(X_train, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [31]:
print('Values of best hyper-parameters:', grid.best_params_)

Values of best hyper-parameters: {'C': 0.001, 'gamma': 0.001}


In [32]:
# Using the best hyper-parameters from previous step and predicting on test feature dataset(X_test)
logreg_grid = grid.best_estimator_
y_pred = logreg_grid.predict(X_test)

results = y_test.to_frame()
results['prediction'] = y_pred

print(results)

    PII_STATUS  prediction
83           1           1
53           0           1
70           1           1
45           1           1
44           1           1
39           0           1
22           1           1
80           0           1
10           1           1
0            1           1
18           1           1
30           1           1
73           0           1
33           1           1
90           1           1
4            1           1
76           1           1
77           0           1
12           1           1
31           1           1
55           1           1
88           1           1
26           1           1
42           0           1
69           0           1


In [33]:
# Calculating Model Accuracy
svm_grid_score = accuracy_score(y_test, y_pred)
print('Model Accuracy Score :', svm_grid_score)


Model Accuracy Score : 0.72
