# Classification

## Anomaly detection

### import required packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### load data from source

In [3]:
df = pd.read_csv('/tmp/hearing_test.csv')
df.head(5)

Unnamed: 0,age,physical_score,test_result
0,33.0,40.7,1
1,50.0,37.2,1
2,52.0,24.7,0
3,56.0,31.0,0
4,35.0,42.9,1


### Exploratory Data Analysis

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             5000 non-null   float64
 1   physical_score  5000 non-null   float64
 2   test_result     5000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 117.3 KB


In [8]:
df['test_result'].value_counts()

test_result
1    3000
0    2000
Name: count, dtype: int64

### Data Preprocessing

#### undersampling

- select only 2000 records where test_results = 1
- ignoring randome records to match with the other category

In [11]:
# filter the records where test_result = 1
df_passed = df[df['test_result'] == 1]
df_failed = df[df['test_result'] == 0]

# collect only 2000 records randomly
# as we are having 2000 records where test_result = 0
df_under = df_passed.sample(2000)

# build a new dataset using 2000 passed and 2000 failed records
df_balanced = pd.concat([df_under, df_failed], axis=0)

In [15]:
df_balanced['test_result'].value_counts()

test_result
1    2000
0    2000
Name: count, dtype: int64

In [17]:
# split the data into X and Y
X = df_balanced.drop('test_result', axis=1)
Y = df_balanced['test_result']

In [18]:
from sklearn.model_selection import train_test_split

# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=12345)

### model building

In [35]:
from sklearn.svm import SVC

model = SVC(C=2, kernel='rbf')
model.fit(x_train, y_train)

### model evaluation

In [36]:
# define y_true and y_pred
y_true = y_test
y_pred = model.predict(x_test)

In [37]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true, y_pred)
accuracy

0.936

### fine tune the model

In [30]:
from sklearn.model_selection import GridSearchCV

# decide the parameters
parameters = {
    "kernel": ('rbf', 'linear'),
    "C": range(1, 3)
}

# call GridSearchCV to find out the optimal values of hyper-parameters
gs = GridSearchCV(model, parameters)

# fit the data
gs.fit(x_train, y_train)

In [34]:
gs.cv_results_

{'mean_fit_time': array([0.02701731, 0.06577983, 0.02140722, 0.09798975]),
 'std_fit_time': array([0.00865262, 0.00248125, 0.00095082, 0.00404826]),
 'mean_score_time': array([0.0098877 , 0.00315204, 0.00856962, 0.00311956]),
 'std_score_time': array([2.01894240e-03, 7.20946561e-05, 2.49717466e-04, 8.33532084e-05]),
 'param_C': masked_array(data=[1, 1, 2, 2],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 2, 'kernel': 'rbf'},
  {'C': 2, 'kernel': 'linear'}],
 'split0_test_score': array([0.91714286, 0.90142857, 0.91428571, 0.90142857]),
 'split1_test_score': array([0.92      , 0.91428571, 0.92142857, 0.91428571]),
 'split2_test_score': array([0.93285714, 0.93      , 0.93285714, 0.93     

### cross validation

In [27]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, x_train, y_train, cv=10)
scores

array([0.92571429, 0.90857143, 0.92      , 0.92285714, 0.92571429,
       0.94285714, 0.93428571, 0.92      , 0.90857143, 0.91714286])

### build the model using imbalanced data

In [22]:
# split the data into X and Y
X = df.drop('test_result', axis=1)
Y = df['test_result']

from sklearn.model_selection import train_test_split

# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=12345)

In [23]:
from sklearn.svm import SVC

model = SVC()
model.fit(x_train, y_train)

In [24]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true, y_pred)
accuracy

0.9166666666666666