## Importing Libraries

In [9]:
# importing basic libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# SVC for classification through support vector machine
from sklearn.svm import SVC

# importing train test split & GridSearchCV library
from sklearn.model_selection import train_test_split, GridSearchCV

# model evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix

# standardization library
from sklearn.preprocessing import StandardScaler
# NOTE: `sc` is instantiated again in the standardization cell further down,
# before its first use, so this particular instance is never used.
sc = StandardScaler()



# Retrieving train and test data into this notebook

In [1]:
# Restore the test DataFrame from IPython's %store database.
# The variable must have been saved earlier with `%store preprocessed_test_df`.
%store -r preprocessed_test_df

In [3]:
# Restore the train DataFrame from IPython's %store database.
# The variable must have been saved earlier with `%store preprocessed_train_df`.
%store -r preprocessed_train_df

In [5]:
# Display the restored training frame to confirm it loaded.
preprocessed_train_df

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,hoursperweek,native,Salary
0,39.0,5,9,13,4,0,1,4,1,40.0,37,0
1,50.0,4,9,13,2,3,0,4,1,32.5,37,0
2,38.0,2,11,9,0,5,1,4,1,40.0,37,0
3,53.0,2,1,7,2,5,0,2,1,40.0,37,0
4,28.0,2,9,13,2,9,5,2,0,40.0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
30156,27.0,2,7,12,2,12,5,4,0,38.0,37,0
30157,40.0,2,11,9,2,6,0,4,1,40.0,37,1
30158,58.0,2,11,9,6,0,4,4,0,40.0,37,0
30159,22.0,2,11,9,4,0,3,4,1,32.5,37,0


In [6]:
# Display the restored test frame to confirm it loaded.
preprocessed_test_df

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,hoursperweek,native,Salary
0,25,2,1,7,4,6,3,2,1,40.0,37,0
1,38,2,11,9,2,4,0,4,1,50.0,37,0
2,28,1,7,12,2,10,0,4,1,40.0,37,1
3,44,2,15,10,2,6,0,2,1,40.0,37,1
4,34,2,0,6,4,7,1,4,1,32.5,37,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,2,9,13,4,9,3,4,1,40.0,37,0
15056,39,2,9,13,0,9,1,4,0,36.0,37,0
15057,38,2,9,13,2,9,0,4,1,50.0,37,0
15058,44,2,9,13,0,0,3,1,1,40.0,37,0


## model training

**Data splitting**

In [7]:
# Separate the target column ("Salary") from the feature columns for each split.
# NOTE(review): the frame previews show an `Unnamed: 0` header. If that is a real
# column (e.g. a leftover CSV index) rather than the displayed index, it is kept
# as a feature here -- confirm and drop it upstream if so.

# training split: target first, then every remaining column as features
y_train = preprocessed_train_df["Salary"]
x_train = preprocessed_train_df.drop("Salary", axis=1)

# test split: same layout as the training split
y_test = preprocessed_test_df["Salary"]
x_test = preprocessed_test_df.drop("Salary", axis=1)

**Data Standardization**

In [10]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits so the test set does not influence the scaling.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

**Creating a baseline model with default hyperparameters**

In [11]:
# Baseline: an SVC with all-default hyperparameters, used as the reference
# point for the tuned model below. `fit` returns the estimator itself.
svc = SVC().fit(x_train, y_train)

# Score the baseline on the held-out test split.
base_pred = svc.predict(x_test)
print(classification_report(y_true=y_test, y_pred=base_pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89     11360
           1       0.70      0.50      0.58      3700

    accuracy                           0.82     15060
   macro avg       0.78      0.71      0.73     15060
weighted avg       0.81      0.82      0.81     15060



**Using grid search to find the best-performing hyperparameters**

In [12]:
# Search space: 3 regularisation strengths x 3 kernels = 9 candidate settings.
param_grid = dict(
    C=[0.1, 0.5, 0.9],
    kernel=['rbf', 'linear', 'sigmoid'],
)

# Fresh, unfitted estimator for the grid search to clone.
svc = SVC()

In [13]:
# Exhaustive search over `param_grid` with 5-fold cross-validation on the
# (already standardized) training split.
grid_model = GridSearchCV(svc, param_grid, cv=5)
grid_model.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 0.5, 0.9],
                         'kernel': ['rbf', 'linear', 'sigmoid']})

In [14]:
# Check the best-performing estimator found by the grid search
grid_model.best_estimator_

SVC(C=0.9)

In [15]:
# Check the best-performing hyperparameter values for the SVM
grid_model.best_params_

{'C': 0.9, 'kernel': 'rbf'}

In [16]:
# Predictions on the test data; GridSearchCV.predict delegates to the best
# estimator found by the search.
final_pred = grid_model.predict(x_test)

In [17]:
# Classification report for the tuned model on the test data
print(classification_report(y_test,final_pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89     11360
           1       0.70      0.49      0.58      3700

    accuracy                           0.82     15060
   macro avg       0.78      0.71      0.73     15060
weighted avg       0.81      0.82      0.81     15060



In [18]:
# Score the tuned model on the training split first, then on the test split;
# a large gap between the two numbers would indicate overfitting.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(grid_model.score(features, labels))

0.8297801797022645
0.8238379814077025


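**Train accuracy (≈ 0.830) and test accuracy (≈ 0.824) are nearly identical, so the model is not overfitting and generalizes well to unseen data.**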