In [93]:
# import modules for working with data
import pandas as pd
import numpy as np

# import stats functions from scipy
from scipy import stats

# imports for better control of output and plots
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

# show plots in the notebook
%matplotlib inline

In [94]:
DATA_PATH = '../data/KaggleV2-May-2016.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


## Czyszczenie danych </br>
wykonano na podstawie analizy z `analyzes/analyze_dataset.ipyb`

In [95]:
df.set_index('AppointmentID', inplace=True ,verify_integrity=True)
df = df[(df.Age > -1) & (df.Age < 115)]
df.loc[:, ['ScheduledDay', 'AppointmentDay']] = df.loc[:, ['ScheduledDay', 'AppointmentDay']].apply(pd.to_datetime)
df.rename(columns={'No-show':'NoShow'}, inplace=True)
day_from_datetime = lambda dt: dt.day_name()
df['DayOfWeek'] = df.AppointmentDay.apply(day_from_datetime)
df['BinNoShow'] = (df.NoShow == "Yes").astype(int)
df['Gender'] = (df.Gender == "M").astype(int)

df.head()

Unnamed: 0_level_0,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,NoShow,DayOfWeek,BinNoShow
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5642903,29872500000000.0,0,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,No,Friday,0
5642503,558997800000000.0,1,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,No,Friday,0
5642549,4262962000000.0,0,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,No,Friday,0
5642828,867951200000.0,0,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,Friday,0
5642494,8841186000000.0,0,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,No,Friday,0


In [96]:
dayofweek = pd.get_dummies(df.DayOfWeek)
df = df.join(dayofweek)


In [97]:
df.head()

Unnamed: 0_level_0,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,...,SMS_received,NoShow,DayOfWeek,BinNoShow,Friday,Monday,Saturday,Thursday,Tuesday,Wednesday
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5642903,29872500000000.0,0,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642503,558997800000000.0,1,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642549,4262962000000.0,0,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642828,867951200000.0,0,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642494,8841186000000.0,0,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,...,0,No,Friday,0,1,0,0,0,0,0


Usuńmy dane nieznaczące na koniec dajmy zmienną zależną
Dajmy zmienną niezależną na koniec i znormalizujmy wiek

In [98]:
noshow = df.BinNoShow
df = df.drop(columns=['PatientId', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood', 'NoShow', 'DayOfWeek', 'BinNoShow'])
df = df.join(noshow)
df.Age = df.Age / max(df.Age)
df.head()

Unnamed: 0_level_0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Friday,Monday,Saturday,Thursday,Tuesday,Wednesday,BinNoShow
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5642903,0,0.607843,0,1,0,0,0,0,1,0,0,0,0,0,0
5642503,1,0.54902,0,0,0,0,0,0,1,0,0,0,0,0,0
5642549,0,0.607843,0,0,0,0,0,0,1,0,0,0,0,0,0
5642828,0,0.078431,0,0,0,0,0,0,1,0,0,0,0,0,0
5642494,0,0.54902,0,1,1,0,0,0,1,0,0,0,0,0,0


## Testowanie modeli

Przygotujmy Funkcje do testowania modeli

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def test_model(X, y, typ):
    
    # Create training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Fit the model
    model = typ
    clf = model.fit(X_train, y_train)

    # Predict class labels on a test data
    pred_labels = model.predict(X_test)

    # Use score method to get accuracy of the model
    print('--------------------------------------------------------')
    score = model.score(X_test, y_test)
    print('Accuracy Score: ', score)
    print('--------------------------------------------------------')
    
    # Look at classification report to evaluate the model
    print(classification_report(y_test, pred_labels))
    
    # Return relevant data for chart plotting
    return X_train, X_test, y_train, y_test, clf, pred_labels

zaczytanie danych z DataFrame

In [100]:
X = np.array(df.iloc[:, :-1].values)
y = np.array(df.iloc[:,-1].values)


sprawdźmy zbilandowanie datasetu

In [101]:
check_y = list(y)
print(f'Procent wizyt odwołanych: {round(check_y.count(1) / len(check_y),3)} %, procent wizyt nie odwołanych {round(check_y.count(0) / len(check_y),3)} %')
print(f'Liczba wizyt odwołanych {check_y.count(1)}')

Procent wizyt odwołanych: 0.202 %, procent wizyt nie odwołanych 0.798 %
Liczba wizyt odwołanych 22316


Dane są wyraźnie nie zbilansowane co będzie miało negatywny wpływ na predykcję. Zakładając w ciemno że pacjent przyjdzie na wizytę mamy 80% szans na to iż nasze założenie jest słuszne.

In [102]:
not_cancelled = np.where(y == 0)
not_cancelled = not_cancelled[0]
len(not_cancelled)


88205

In [103]:
# get random indexes from not cancelled
import random
not_cancelled = np.array(random.sample(list(not_cancelled), check_y.count(1)))
print(f'Wizyty nie odwołane liczba: {len(not_cancelled)}')

# get indexes for canceled visits
cancelled = np.where(y == 1)[0]
print(f'Wizyty odwołane liczba: {len(cancelled)}')

Wizyty nie odwołane liczba: 22316
Wizyty odwołane liczba: 22316


In [104]:
# join two arrays
indexes = np.concatenate((not_cancelled, cancelled))
# Balance dataset
X = X[indexes]
y = y[indexes]

Sprawdżmy ponownie zbilansowanie Datasetu

In [105]:
check_y = list(y)
print(f'Procent wizyt odwołanych: {round(check_y.count(1) / len(check_y),3)} %, procent wizyt nie odwołanych {round(check_y.count(0) / len(check_y),3)} %')
print(f'Liczba wizyt odwołanych {check_y.count(1)}')

Procent wizyt odwołanych: 0.5 %, procent wizyt nie odwołanych 0.5 %
Liczba wizyt odwołanych 22316


# Przetestujmy Naive Bayes Classifier

In [106]:
from sklearn.naive_bayes import GaussianNB
X_train, X_test, y_train, y_test, clf, y_pred, = test_model(X, y, GaussianNB())

--------------------------------------------------------
Accuracy Score:  0.563683208244651
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.55      0.64      0.59      4424
           1       0.58      0.49      0.53      4503

    accuracy                           0.56      8927
   macro avg       0.57      0.56      0.56      8927
weighted avg       0.57      0.56      0.56      8927



Klasifikator Naive bayes nie nadaje się do tego zadania gdyż mamy wektor rzadki. Rozważmy wprowadzenie PCA, gdyż wektory rzadkie mogą dawać dużo szumu w klasyfikacji

In [111]:
from sklearn.decomposition import PCA
def test_model_pca(X, y, typ):
    
    # Create training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Add PCA to the analyzes
    pca = PCA(n_components = 2)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    explained_variance = pca.explained_variance_ratio_

    # Fit the model
    model = typ
    clf = model.fit(X_train, y_train)

    # Predict class labels on a test data
    pred_labels = model.predict(X_test)

    # Use score method to get accuracy of the model
    print('--------------------------------------------------------')
    score = model.score(X_test, y_test)
    print('Accuracy Score: ', score)
    print('--------------------------------------------------------')
    
    # Look at classification report to evaluate the model
    print(classification_report(y_test, pred_labels))
    
    # Return relevant data for chart plotting
    return X_train, X_test, y_train, y_test, clf, pred_labels

In [108]:
X_train, X_test, y_train, y_test, clf, y_pred, = test_model_pca(X, y, GaussianNB())

Classes:  [0 1]
Class Priors:  [0.50110629 0.49889371]
--------------------------------------------------------
Accuracy Score:  0.5544975915761174
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.54      0.63      0.58      4424
           1       0.57      0.48      0.52      4503

    accuracy                           0.55      8927
   macro avg       0.56      0.56      0.55      8927
weighted avg       0.56      0.55      0.55      8927



Dodanie dekompozycji PCA nic nie zmieniło za wiele. Spróbujmy inny klasyfikator

## Test Support Vector Machine

In [109]:
from sklearn import svm
X_train, X_test, y_train, y_test, clf, y_pred, = test_model(X, y, svm.SVC())

--------------------------------------------------------
Accuracy Score:  0.5749971995071133
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.56      0.63      0.60      4424
           1       0.59      0.52      0.55      4503

    accuracy                           0.57      8927
   macro avg       0.58      0.58      0.57      8927
weighted avg       0.58      0.57      0.57      8927



Wyniki nie są zadowalające, spróbujmy dekompozycji PCA z SVM

In [112]:
X_train, X_test, y_train, y_test, clf, y_pred, = test_model_pca(X, y, svm.SVC())

--------------------------------------------------------
Accuracy Score:  0.5654755236921698
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.55      0.71      0.62      4424
           1       0.60      0.43      0.50      4503

    accuracy                           0.57      8927
   macro avg       0.57      0.57      0.56      8927
weighted avg       0.57      0.57      0.56      8927



Nadal klasyfikacja jest bliska losowości

## Klasyfikacja z użyciem RFC

In [114]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test, clf, y_pred, = test_model(X, y, RandomForestClassifier())

--------------------------------------------------------
Accuracy Score:  0.5534894141368881
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.55      0.57      0.56      4424
           1       0.56      0.54      0.55      4503

    accuracy                           0.55      8927
   macro avg       0.55      0.55      0.55      8927
weighted avg       0.55      0.55      0.55      8927



In [115]:
X_train, X_test, y_train, y_test, clf, y_pred, = test_model_pca(X, y, RandomForestClassifier())

--------------------------------------------------------
Accuracy Score:  0.5530413352750084
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.55      0.57      0.56      4424
           1       0.56      0.54      0.55      4503

    accuracy                           0.55      8927
   macro avg       0.55      0.55      0.55      8927
weighted avg       0.55      0.55      0.55      8927



# KNN Klasifikator

In [116]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, y_train, y_test, clf, y_pred, = test_model_pca(X, y, KNeighborsClassifier())
X_train, X_test, y_train, y_test, clf, y_pred, = test_model(X, y, KNeighborsClassifier())

--------------------------------------------------------
Accuracy Score:  0.5437436988910048
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.54      0.54      0.54      4424
           1       0.55      0.54      0.55      4503

    accuracy                           0.54      8927
   macro avg       0.54      0.54      0.54      8927
weighted avg       0.54      0.54      0.54      8927

--------------------------------------------------------
Accuracy Score:  0.5407191665733169
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.54      0.55      0.54      4424
           1       0.55      0.53      0.54      4503

    accuracy                           0.54      8927
   macro avg       0.54      0.54      0.54      8927
weighted avg       0.54      0.54      0.54      8927



## Liniowy Ridge Classifier

In [117]:
from sklearn.linear_model import RidgeClassifier
X_train, X_test, y_train, y_test, clf, y_pred, = test_model(X, y, RidgeClassifier())
X_train, X_test, y_train, y_test, clf, y_pred, = test_model_pca(X, y, RidgeClassifier())

--------------------------------------------------------
Accuracy Score:  0.5729808446286546
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.56      0.65      0.60      4424
           1       0.59      0.50      0.54      4503

    accuracy                           0.57      8927
   macro avg       0.58      0.57      0.57      8927
weighted avg       0.58      0.57      0.57      8927

--------------------------------------------------------
Accuracy Score:  0.5494567043799709
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.54      0.57      0.55      4424
           1       0.56      0.53      0.54      4503

    accuracy                           0.55      8927
   macro avg       0.55      0.55      0.55      8927
weighted avg       0.55      0.55      0.55      8927



# Wnioski
Żaden z algorytmów nie spełnia oczekiwań co do klasyfikacji, wyniki klasyfikacji są bliskie losowości i mimo analizy w pliku `analyze_dataset.ipynb` gdzie wykazano statystyczną istotność różnic parametrów, zbadane parametry nie rzutują na faktyczną klasyfikację pojawiania się pacjentów na wizytach

In [123]:
from numpy import count_nonzero
sparsity = 1.0 - count_nonzero(X) / X.size
print(f'Sparsity macierzy zmiennych niezależnych: {round(sparsity, 4)}')

Sparsity macierzy zmiennych niezależnych: 0.7792


Problem z jakim się mierzymy jest rzadkość macierzy współczynników. Sparsity wynosi 0.7792 zatem mamy do czynienia z macierzą rzadką, gdzie większość wartości stanowią wartości 0. Stąd wynikają problemy z wytrenowaniem modelu uczenia maszynowego