<a href="https://colab.research.google.com/github/aparnashenoy/ml/blob/master/data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the libraries used for data handling, plotting, and modelling

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing, model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from pandas.plotting import scatter_matrix

Import the dataset

In [0]:
# Load the training data from a CSV file located relative to the working directory.
dataset = pd.read_csv('traindata.csv')

In [0]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
dataset.head()

In [0]:
# Show the number of missing values per column. A bare `dataset.isnull()`
# statement here would be discarded, because only the last expression of a
# cell is displayed.
print(dataset.isnull().sum())
# Preview of the rows without any missing value. dropna() returns a new frame
# and leaves `dataset` itself unchanged.
dataset.dropna()


In [0]:
# Boolean mask indexed by column name: True where the column is stored as object dtype.
categorical = dataset.dtypes == object
# These columns are flagged as categorical regardless of their stored dtype.
for forced_column in ('Unit1', 'Unit2', 'Gender'):
  categorical[forced_column] = True

In [0]:
# Split the column names according to the categorical mask.
cat_vars = [column for column, is_cat in categorical.items() if is_cat]
cont_vars = [column for column, is_cat in categorical.items() if not is_cat]
# Remove the last continuous column from the list; the removed name is the
# cell's output. NOTE(review): presumably this is the target column - confirm.
cont_vars.pop()

In [0]:
# Swap the '?' placeholder for the numeric sentinel -9999, modifying `dataset` in place.
dataset.replace(to_replace='?', value=-9999, inplace=True)
# Print the frame's axes (row index followed by column index).
print(dataset.axes)

In [0]:
# Remove the two unwanted columns in place. `columns=` replaces the positional
# axis argument, which pandas 2.0 and later no longer accept.
dataset.drop(columns=['Unnamed: 0', 'X'], inplace=True)

In [0]:
# Report the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))

In [0]:
# List the column labels that remain in the frame at this point.
print(dataset.columns)

In [0]:
# Draw one histogram per column that DataFrame.hist can plot.
dataset.hist(figsize=(20, 20))
# Keep a handle on the histogram figure. plt.show() returns None, so binding
# its result would leave `f` empty.
f = plt.gcf()
plt.show()


In [0]:
# Redraw the histograms in this cell before saving. plt.savefig() writes the
# *current* figure, and a figure already shown in an earlier notebook cell is
# typically no longer current, so saving on its own can produce a blank image.
dataset.hist(figsize=(20, 20))
plt.savefig("abc.png")
plt.close()  # avoid displaying the same histograms a second time

In [0]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for the columns of the frame, on a 39x39 inch canvas.
scatter_matrix(frame=dataset, figsize=(39, 39))
plt.show()

In [0]:
# Feature matrix: every column except the target. `columns=` replaces the
# positional axis argument, which pandas 2.0 and later no longer accept.
# NOTE(review): np.array() copies the values, so the fillna()/replace() calls
# applied to `dataset` in later cells do not reach X - confirm that is intended.
X = np.array(dataset.drop(columns=['SepsisLabel']))
# Target vector.
y = np.array(dataset['SepsisLabel'])

In [0]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [0]:
# Settings for the cross-validation cell: RNG seed and scoring metric name.
seed, scoring = 123, 'accuracy'

In [0]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
models = [
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('SVM', SVC()),
]

In [0]:
# Score every candidate model with 10-fold cross-validation on the training split.
# shuffle=True is needed for random_state to take effect; recent scikit-learn
# releases raise ValueError when random_state is set while shuffle is False.
# The splitter is built once because its settings do not change between models.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
results = []
names = []
for name, model in models:
  cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  # One line per model: name, mean fold score, standard deviation across folds.
  msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)

In [0]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transform to the training split, the test split and the full matrix.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
X_standardized = sc_X.transform(X)
# Summary statistics of the standardized full feature matrix.
data = pd.DataFrame(X_standardized)
data.describe()


In [0]:
# Class balance of the target. X_standardized is a NumPy array of features only
# (the label column was dropped when X was built), so it cannot be indexed by
# column name; the counts are taken from the label column of `dataset` instead.
dataset['SepsisLabel'].value_counts()

In [0]:
from sklearn.linear_model import LogisticRegression
# Build and fit in one step; fit() returns the estimator itself.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
# Display the fitted estimator as the cell output.
classifier

In [0]:
# Display the training labels.
y_train

In [0]:
# True if at least one cell of the frame is NaN (reduction over both axes).
np.isnan(dataset).any(axis=None)

In [0]:
# True only if every cell of the frame is finite (neither NaN nor +/-inf).
np.isfinite(dataset).all(axis=None)

In [0]:
# Fill every missing value with the sentinel 999, modifying `dataset` in place.
# NOTE(review): arrays extracted from `dataset` in earlier cells were copied
# before this fill - confirm whether they should be rebuilt afterwards.
dataset.fillna(value=999, inplace=True)

In [0]:
# Turn +/-inf into NaN, then fill those NaNs. Without the second step this cell
# would leave fresh NaNs in the frame, because no later cell fills them.
# 999 is the same sentinel this notebook uses for missing values elsewhere.
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
dataset.fillna(999, inplace=True)

In [0]:
# Predict labels for the held-out test features with the fitted classifier.
y_pred = classifier.predict(X_test)

In [0]:
# Display the predicted test labels.
y_pred

In [0]:
from sklearn.metrics import confusion_matrix
# Rows correspond to true labels, columns to predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [0]:
# Display the confusion matrix computed for the test split.
cm

In [0]:
from matplotlib.colors import ListedColormap

In [0]:
# Mean accuracy of the fitted classifier on the test split, printed to two decimals.
test_accuracy = classifier.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [0]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the test predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [0]:
from sklearn.decomposition import PCA
# Positional slices: columns 2..38 are the features, column 40 is the target.
# NOTE(review): these positions depend on the column order left after earlier
# cells dropped columns - confirm that column 40 is the label column.
X = dataset.iloc[:,2:39]
y = dataset.iloc[:,40]
# `pca` holds the 2-D projected data (the fit_transform output), not the estimator.
# NOTE(review): the projection is fitted on all rows before the train/test split.
pca = PCA(n_components=2).fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(pca, y, random_state=0)

plt.figure(dpi=120)
# Each legend entry states the label value it shows, so the legend cannot be
# misread as the opposite class.
for label_value, colour in ((0, 'navy'), (1, 'darkorange')):
  mask = y.values == label_value
  plt.scatter(pca[mask, 0], pca[mask, 1], alpha=0.5,
              label='%s = %d' % (y.name, label_value), s=2, color=colour)
plt.legend()
plt.title('Sepsis Data Set\nFirst Two Principal Components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.gca().set_aspect('equal')
plt.show()

In [0]:
def plot_bank(X, y, fitted_model, mesh_step_size=0.01, max_grid_side=1000):
  """Plot a fitted two-feature classifier's decision boundary and probabilities.

  A new figure with two side-by-side panels is created. The first panel shades
  the plane by `fitted_model.predict`; the second shades it by the second
  column of `fitted_model.predict_proba`. The points in X are drawn on top of
  each panel, coloured by label. If the probabilities cannot be computed, the
  second panel shows a 'Probabilities Unavailable' message instead.

  Args:
    X: 2-D NumPy array; only columns 0 and 1 are used.
    y: labels aligned with the rows of X (values 0 and 1 are plotted). May be
      a pandas Series, a NumPy array, or a plain sequence.
    fitted_model: fitted estimator providing `predict` and `score`, and
      optionally `predict_proba`.
    mesh_step_size: spacing of the evaluation grid in data units.
    max_grid_side: approximate upper bound on grid points per axis. The step
      is coarsened when the data range would otherwise exceed it, which keeps
      memory bounded for wide-ranging inputs.
  """
  labels = np.asarray(y)  # accept Series, arrays and lists alike
  x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
  y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
  # One shared step for both axes keeps grid cells square.
  step = max(mesh_step_size, (x_max - x_min) / max_grid_side, (y_max - y_min) / max_grid_side)
  xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))
  grid_points = np.c_[xx.ravel(), yy.ravel()]
  accuracy_text = str(np.round(fitted_model.score(X, y), 5))

  plt.figure(figsize=(9.8,5), dpi=100)
  for i, plot_type in enumerate(['Decision Boundary', 'Decision Probabilities']):
    plt.subplot(1,2,i+1)
    if i == 0:
      Z = fitted_model.predict(grid_points)
    else:
      try:
        Z = fitted_model.predict_proba(grid_points)[:,1]
      except Exception:
        # Best effort: models without usable probabilities get a placeholder panel.
        plt.text(0.4, 0.5, 'Probabilities Unavailable', horizontalalignment='center',
                 verticalalignment='center', transform = plt.gca().transAxes, fontsize=12)
        plt.axis('off')
        break
    # Drawing happens for every panel whose surface was computed. It must sit
    # at loop level, not inside the except branch after `break`.
    Z = Z.reshape(xx.shape)
    plt.scatter(X[labels==0,0], X[labels==0,1], alpha=0.8, label='Class 0', s=5, color='navy')
    plt.scatter(X[labels==1,0], X[labels==1,1], alpha=0.8, label='Class 1', s=5, color='darkorange')
    plt.imshow(Z, interpolation='nearest', cmap='RdYlBu_r', alpha=0.15,
               extent=(x_min, x_max, y_min, y_max), origin='lower')
    plt.title(plot_type + '\n' +
              str(fitted_model).split('(')[0]+ ' Test Accuracy: ' + accuracy_text)
# Fit a logistic-regression model on the training split and plot its decision
# surfaces on the test split.
model = LogisticRegression()
model.fit(X_train,y_train)
plot_bank(X_test, y_test, model)
# Layout tweaks run after plot_bank() so that they act on the figure it
# created rather than on an empty implicit figure.
plt.gca().set_aspect('equal');
plt.tight_layout()
plt.legend()
plt.subplots_adjust(top=0.9, bottom=0.08, wspace=0.02)
plt.show()