In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
from sklearn import tree
from sklearn.utils import shuffle 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_iris
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

data = pd.read_csv("data.csv", header=None) #read the data
data.sample(5)  #see what the data look like

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
1604,11,64,5.8181,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
1596,33,33,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
1175,77,77,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
1395,205,200,0.9756,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
2065,24,120,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.


In [2]:
## replace labels. AD = 1; nonAD = 0
data.replace('ad.', 1, inplace=True)
data.replace('nonad.', 0, inplace=True)

In [3]:
##  check missing values
def missing_percentage(df):
    """This function takes a DataFrame(df) as input and returns two columns:
        * total missing values
        * total missing values percentage"""
    total = df.isnull().sum().sort_values(ascending = False)
    percent = round(df.isnull().sum().sort_values(ascending = False)/len(df)*100,2)
    return pd.concat([total, percent], axis=1, keys=['Total','Percent'])

# apply the function to the data
missing_percentage(data)

Unnamed: 0,Total,Percent
1558,0,0.0
534,0,0.0
512,0,0.0
513,0,0.0
514,0,0.0
...,...,...
1041,0,0.0
1042,0,0.0
1043,0,0.0
1044,0,0.0


In [4]:
### create train and test sets
df_label = pd.DataFrame(data, columns=[1558]) #extra the last column which is the label
df_data = pd.DataFrame.drop(data, columns = [1558]) #delete the last label column

#30% data are splitted into test data set, they are shuffled
X_train, X_test, y_train, y_test = train_test_split(df_data, df_label, test_size = 0.3, random_state = 40)

In [5]:
X_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557
count,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,...,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0,1651.0
mean,63.448819,156.073289,3.959404,0.764991,0.001211,0.0,0.006663,0.004846,0.003634,0.014537,...,0.001817,0.003634,0.001211,0.001817,0.003028,0.009085,0.012114,0.013931,0.010902,0.001211
std,54.521547,131.833762,6.348432,0.424133,0.034794,0.0,0.081377,0.069462,0.060193,0.119725,...,0.042601,0.060193,0.034794,0.042601,0.054965,0.094912,0.109427,0.11724,0.103876,0.034794
min,1.0,1.0,0.0015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.0,80.0,1.0375,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,51.0,110.0,2.0869,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80.0,184.0,5.3333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,640.0,640.0,60.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
## Feature scaling
# As we can see, the change of first three factors is very large, and the last factors are binary
# Through feature scaling, all features could be  


In [7]:
## model 1: decision tree 
dt = tree.DecisionTreeClassifier(criterion="entropy") #create decision tree
dt.fit(X_train, y_train) #fit the model

y_pred = dt.predict(X_test)

In [8]:
## evaluate the model
from sklearn.metrics import recall_score, precision_score, accuracy_score, classification_report, f1_score

dc_accuracy = accuracy_score(y_test, y_pred)
dc_precision = precision_score(y_test, y_pred)
dc_recall = recall_score(y_test, y_pred)
dc_f1 = f1_score(y_test, y_pred)
print("decision tree accuracy: %0.2f" %dc_accuracy)
print("decision tree precision: %0.2f" %dc_precision)
print("decision tree recall: %0.2f" %dc_recall)
print("decision tree f1: %0.2f" %dc_f1)

decision tree accuracy: 0.95
decision tree precision: 0.86
decision tree recall: 0.85
decision tree f1: 0.86


In [10]:
from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(y_test, y_pred)

In [12]:
from sklearn.metrics import roc_curve, auc
#plt.style.use('seaborn-pastel')
y_score = dt.score(X_test, y_test)

FPR, TPR, _ = roc_curve(y_test, y_score)
ROC_AUC = auc(FPR, TPR)
print (ROC_AUC)

plt.figure(figsize =[11,9])
plt.plot(FPR, TPR, label= 'ROC curve(area = %0.2f)'%ROC_AUC, linewidth= 4)
plt.plot([0,1],[0,1], 'k--', linewidth = 4)
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel('False Positive Rate', fontsize = 18)
plt.ylabel('True Positive Rate', fontsize = 18)
plt.title('ROC for Titanic survivors', fontsize= 18)
plt.show()

TypeError: Singleton array 0.9548022598870056 cannot be considered a valid collection.