In [161]:
# import modules for working with data
import pandas as pd
import numpy as np

# import stats functions from scipy
from scipy import stats

# imports for better control of output and plots
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

# show plots in the notebook
%matplotlib inline

In [162]:
DATA_PATH = '../data/KaggleV2-May-2016.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


## Czyszczenie danych </br>
wykonano na podstawie analizy z `analyzes/analyze_dataset.ipyb`

In [163]:
df.set_index('AppointmentID', inplace=True ,verify_integrity=True)
df = df[(df.Age > -1) & (df.Age < 115)]
df.loc[:, ['ScheduledDay', 'AppointmentDay']] = df.loc[:, ['ScheduledDay', 'AppointmentDay']].apply(pd.to_datetime)
df.rename(columns={'No-show':'NoShow'}, inplace=True)
day_from_datetime = lambda dt: dt.day_name()
df['DayOfWeek'] = df.AppointmentDay.apply(day_from_datetime)
df['BinNoShow'] = (df.NoShow == "Yes").astype(int)
df['Gender'] = (df.Gender == "M").astype(int)

df.head()

Unnamed: 0_level_0,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,NoShow,DayOfWeek,BinNoShow
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5642903,29872500000000.0,0,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,No,Friday,0
5642503,558997800000000.0,1,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,No,Friday,0
5642549,4262962000000.0,0,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,No,Friday,0
5642828,867951200000.0,0,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,Friday,0
5642494,8841186000000.0,0,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,No,Friday,0


In [164]:
dayofweek = pd.get_dummies(df.DayOfWeek)
df = df.join(dayofweek)


In [165]:
df.head()

Unnamed: 0_level_0,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,...,SMS_received,NoShow,DayOfWeek,BinNoShow,Friday,Monday,Saturday,Thursday,Tuesday,Wednesday
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5642903,29872500000000.0,0,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642503,558997800000000.0,1,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642549,4262962000000.0,0,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642828,867951200000.0,0,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,...,0,No,Friday,0,1,0,0,0,0,0
5642494,8841186000000.0,0,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,...,0,No,Friday,0,1,0,0,0,0,0


Usuńmy dane nieznaczące na koniec dajmy zmienną zależną
Dajmy zmienną niezależną na koniec i znormalizujmy wiek

In [166]:
noshow = df.BinNoShow
df = df.drop(columns=['PatientId', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood', 'NoShow', 'DayOfWeek', 'BinNoShow'])
df = df.join(noshow)
df.Age = df.Age / max(df.Age)
df.head()

Unnamed: 0_level_0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Friday,Monday,Saturday,Thursday,Tuesday,Wednesday,BinNoShow
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5642903,0,0.607843,0,1,0,0,0,0,1,0,0,0,0,0,0
5642503,1,0.54902,0,0,0,0,0,0,1,0,0,0,0,0,0
5642549,0,0.607843,0,0,0,0,0,0,1,0,0,0,0,0,0
5642828,0,0.078431,0,0,0,0,0,0,1,0,0,0,0,0,0
5642494,0,0.54902,0,1,1,0,0,0,1,0,0,0,0,0,0


## Testowanie modeli

Przygotujmy Funkcje do testowania modeli

In [167]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def test_model(X, y, typ):
    
    # Create training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Fit the model
    model = typ
    clf = model.fit(X_train, y_train)

    # Predict class labels on a test data
    pred_labels = model.predict(X_test)

    # Print model attributes 
    print('Classes: ', clf.classes_) # class labels known to the classifier
    if str(typ)=='GaussianNB()':
        print('Class Priors: ',clf.class_prior_) # prior probability of each class.
    else: 
        print('Class Log Priors: ',clf.class_log_prior_) # log prior probability of each class.
        
    # Use score method to get accuracy of the model
    print('--------------------------------------------------------')
    score = model.score(X_test, y_test)
    print('Accuracy Score: ', score)
    print('--------------------------------------------------------')
    
    # Look at classification report to evaluate the model
    print(classification_report(y_test, pred_labels))
    
    # Return relevant data for chart plotting
    return X_train, X_test, y_train, y_test, clf, pred_labels

zaczytanie danych z DataFrame

In [168]:
X = np.array(df.iloc[:, :-1].values)
y = np.array(df.iloc[:,-1].values)


sprawdźmy zbilandowanie datasetu

In [169]:
check_y = list(y)
print(f'Procent wizyt odwołanych: {round(check_y.count(1) / len(check_y),3)} %, procent wizyt nie odwołanych {round(check_y.count(0) / len(check_y),3)} %')
print(f'Liczba wizyt odwołanych {check_y.count(1)}')

Procent wizyt odwołanych: 0.202 %, procent wizyt nie odwołanych 0.798 %
Liczba wizyt odwołanych 22316


Dane są wyraźnie nie zbilansowane co będzie miało negatywny wpływ na predykcję. Zakładając w ciemno że pacjent przyjdzie na wizytę mamy 80% szans na to iż nasze założenie jest słuszne.

In [170]:
limit = check_y.count(1)
counter = 0
check = 0
for i, data in enumerate(y):
    if data == 0:
        counter += 1
        if counter >= limit:
            X = np.delete(X, i, 0)
            y = np.delete(y, i, 0)



IndexError: index 74179 is out of bounds for axis 0 with size 74178

# Przetestujmy Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
X_train, X_test, y_train, y_test, clf, y_pred, = test_model(X, y, GaussianNB())

Classes:  [0 1]
Class Priors:  [0.79768368 0.20231632]
--------------------------------------------------------
Accuracy Score:  0.790409409635829
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     17677
           1       0.32      0.04      0.07      4428

    accuracy                           0.79     22105
   macro avg       0.56      0.51      0.48     22105
weighted avg       0.71      0.79      0.72     22105



1