In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder 
from sklearn.impute import SimpleImputer, KNNImputer

In [3]:
pd.set_option('display.max_row', 147)
pd.set_option('display.max_column', 147)

In [4]:
df = pd.read_csv("salary.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df.shape

(32561, 15)

In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [7]:
df.describe().transpose().sort_values("std")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
education-num,32561.0,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
hours-per-week,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
capital-loss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0
capital-gain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0
fnlwgt,32561.0,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0


In [8]:
df.dtypes.value_counts()

object    9
int64     6
dtype: int64

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder , LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
# from sklearn.metrics import classification_report
# from sklearn import metrics
# from sklearn.metrics import confusion_matrix


In [10]:
y_salary_target = df["salary"]
y_salary_target.shape

(32561,)

In [11]:
X_featrures = df.drop(["salary"] , axis = 1)
X_featrures.shape

(32561, 14)

In [12]:
numerical_features = make_column_selector(dtype_include = np.number)
categorical_features =  make_column_selector(dtype_exclude = np.number)

In [13]:
categorical_pipeline = make_pipeline(SimpleImputer( missing_values=None , strategy='most_frequent'),
                                     SimpleImputer(missing_values=np.nan , 
                                                            strategy='most_frequent'),OneHotEncoder()) 

In [14]:
make_column_transformer((categorical_pipeline, categorical_features))

ColumnTransformer(transformers=[('pipeline',
                                 Pipeline(steps=[('simpleimputer-1',
                                                  SimpleImputer(missing_values=None,
                                                                strategy='most_frequent')),
                                                 ('simpleimputer-2',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001FE3B7E2100>)])

In [15]:
numerical_pipeline = make_pipeline( SimpleImputer(strategy='mean'), StandardScaler())

In [16]:
preprocessor = make_column_transformer((numerical_pipeline, numerical_features) , 
                                       (categorical_pipeline, categorical_features))

In [17]:
X_scaled = preprocessor.fit_transform(X_featrures)
X_scaled # to large to be prined

<32561x108 sparse matrix of type '<class 'numpy.float64'>'
	with 455854 stored elements in Compressed Sparse Row format>

In [18]:
encoder_y_salary_target = LabelEncoder()
y_encoder_y_salary_target = encoder_y_salary_target.fit_transform(y_salary_target)
y_encoder_y_salary_target

array([0, 0, 0, ..., 0, 0, 1])

In [19]:
X_train_features, X_test_features, y_train_salary, y_test_salary = train_test_split(X_scaled, 
                                                         y_encoder_y_salary_target, 
                                                         test_size = 0.2,  random_state = 5 )

In [20]:
print('Train set:', X_train_features.shape)
print('Test set:', X_test_features.shape)

Train set: (26048, 108)
Test set: (6513, 108)


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

In [22]:
# model_salary = RandomForestClassifier()
# this algorithm ist gut but ie take to much time to be lunch on my Computer.

In [23]:
model_salary = svm.SVC()

In [24]:
cross_val_score(model_salary, X_train_features, y_train_salary, cv = 5) 
# this line code take to much time it take only 5 minute with svm but mith RDC it take to much time

array([0.85259117, 0.84548944, 0.85143954, 0.85793818, 0.86312152])

In [25]:
model_salary.fit(X_train_features , y_train_salary )

SVC()

In [26]:
model_salary.score(X_test_features, y_test_salary)

0.8542914171656687

In [27]:
y_features_pred = model_salary.predict(X_test_features)

In [28]:
# get support vectors
model_salary.support_vectors_

<8867x108 sparse matrix of type '<class 'numpy.float64'>'
	with 124138 stored elements in Compressed Sparse Row format>

In [29]:
# get indices of support vectors
model_salary.support_

array([    0,     2,     4, ..., 26017, 26023, 26029])

In [30]:
# get number of support vectors for each class
model_salary.n_support_

array([4592, 4275])

In [31]:
# the classification report
print(confusion_matrix(y_test_salary , y_features_pred))
print(classification_report(y_test_salary, y_features_pred))

[[4692  273]
 [ 676  872]]
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4965
           1       0.76      0.56      0.65      1548

    accuracy                           0.85      6513
   macro avg       0.82      0.75      0.78      6513
weighted avg       0.85      0.85      0.85      6513



In [32]:
# just to see what the encoded values are
encoder_y_salary_target.inverse_transform([0, 1])

array([' <=50K', ' >50K'], dtype=object)

# With GridSearchCV

In [59]:
param_grid = {
    "svc_C" : [1, 5, 10, 50],
    "svc_gamma" : [0.001 , 0.0005, 0.001 , 0.005]
}

In [60]:
grid_salary = GridSearchCV(svm.SVC(), param_grid, cv = 7)

In [61]:
svm.SVC().get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [62]:
svm.SVC().get_params().values()

dict_values([1.0, False, 200, None, 0.0, 'ovr', 3, 'scale', 'rbf', -1, False, None, True, 0.001, False])