In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from itertools import cycle
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.cluster import DBSCAN
from sklearn.metrics import mean_squared_error


def main(cgm,insulin):
    ## Read CGM Data
    cgmdata=pd.read_csv(cgm)
  
    ## Read Insulind data 
    insulindata=pd.read_csv(insulin)
   
   
    ##Subset CGM
    insulin=insulindata[["Index","Date","Time","BWZ Carb Input (grams)"]]
    
    #Merge date time
    insulin['dt']=pd.to_datetime(insulin['Date'] + ' ' + insulin['Time'])
    #Rename column
    insulin = insulin.rename(columns={'BWZ Carb Input (grams)': 'carbinput'})
    #Subset carb input >0
    insulin_new=insulin[insulin["carbinput"] >0]
    insulin_new['tm2'] = pd.DatetimeIndex(insulin_new['dt'])
    #subset columns
    insulin_join=insulin_new[["dt","carbinput","tm2"]]
    #sort by date time
    insulin_sort=insulin_join.sort_values(by='dt')
    # Choose start time
    insulin_sort['lagtime'] = (insulin_sort['dt'].shift(-1))
    insulin_sort['diff']=insulin_sort['lagtime']-insulin_sort['dt']
    insulin_sort=insulin_sort[insulin_sort['diff'] >='02:00:00']

    ##Read,Subset and sort CGM
    cgm=cgmdata[["Index","Date","Time","Sensor Glucose (mg/dL)"]]
    cgm['dt']=pd.to_datetime(cgm['Date'] + ' ' + cgm['Time'])
    cgm_sort=cgm.sort_values(by='dt')
    
    #Merge cgm_sort and insulin_sort
    new=pd.merge_asof(
    cgm_sort,
    insulin_sort,
   # by="component",
    right_on="dt",
    left_on="dt",
    direction='nearest',
    tolerance=pd.Timedelta('2.5 minute'))
    
    new_fin=new[new["carbinput"]>0]
    new_fin['tm']=pd.to_datetime(new_fin['Date'] + ' ' + new_fin['Time'])
    new_fin['tm2meal'] = pd.DatetimeIndex(new_fin['tm']) + timedelta(hours=2,minutes=0)
    new_fin['tm30meal'] = pd.DatetimeIndex(new_fin['tm']) - timedelta(hours=0,minutes=30)
    new_fin['tm2nomeal'] = pd.DatetimeIndex(new_fin['tm2meal']) + timedelta(hours=2,minutes=0)
    new_fin=new_fin[["dt","tm","tm2meal","tm30meal","tm2nomeal"]]
    
    #merge new with meal dates
    newfinal=pd.merge_asof(
    new,
    new_fin,
   # by="component",
    right_on="dt",
    left_on="dt",
    direction='nearest',
    tolerance=pd.Timedelta('1 minute'))
    
    ####### MEAL DATA EXTRACTION ##########
    newsubset=newfinal[newfinal["carbinput"]>0]
    mealtime=newsubset[["tm30meal"]]
    mealtime["dt"]=mealtime["tm30meal"]
    
    newfinal2=pd.merge_asof(
    newfinal,
    mealtime,
   # by="component",
    right_on="dt",
    left_on="dt",
    tolerance=pd.Timedelta('30 minute'))
    
    mealtime2=newsubset[["tm","tm2meal"]]
    mealtime2["dt"]=mealtime2["tm"]
    
    newfinal3=pd.merge_asof(
    newfinal2,
    mealtime2,
   # by="component",
    right_on="dt",
    left_on="dt",
    tolerance=pd.Timedelta('120 minute'))
    
    Mergedfinal=newfinal3[(newfinal3["tm30meal_y"].notna()) | (newfinal3["tm2meal_y"].notna())]
    mealdata=Mergedfinal[["Sensor Glucose (mg/dL)"]]
    mealdata = mealdata.reset_index()
    mealdata=mealdata[["Sensor Glucose (mg/dL)"]]
    #Imput missing observations with mean value
    mealdata['Sensor Glucose (mg/dL)'].fillna(mealdata['Sensor Glucose (mg/dL)'].mean(), inplace = True)
    
    
    seq= cycle([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])
    mealdata['x'] = [next(seq) for count in range(mealdata.shape[0])]
    mealdata['y']=0
    mealdata['y'] = (mealdata["x"] %30==0).shift(1).cumsum() + 1
    mealdata['y'].fillna(value=1, inplace = True)
    
    meal_data=mealdata.pivot(index='y',columns='x', values='Sensor Glucose (mg/dL)').add_prefix('X').reset_index()
    
    ####### NO MEAL DATA EXTRACTION ##########
    mealtime3=newsubset[["tm","tm2nomeal"]]
    mealtime3["dt"]=mealtime3["tm2nomeal"]
    
    newfinal4=pd.merge_asof(
    newfinal2,
    mealtime3,
   # by="component",
    right_on="dt",
    left_on="dt",
    tolerance=pd.Timedelta('120 minute'))
    
    Mergedfinal2=newfinal4[(newfinal4["tm2nomeal_y"].notna())]
    nomealdata=Mergedfinal2[["Sensor Glucose (mg/dL)"]]
    nomealdata = nomealdata.reset_index()
    nomealdata=nomealdata[["Sensor Glucose (mg/dL)"]]

    #Imput missing observations with mean value
    nomealdata['Sensor Glucose (mg/dL)'].fillna(nomealdata['Sensor Glucose (mg/dL)'].mean(), inplace = True)
    seq= cycle([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])
    nomealdata['x'] = [next(seq) for count in range(nomealdata.shape[0])]
    nomealdata['y']=0
    nomealdata['y'] = (nomealdata["x"] %24==0).shift(1).cumsum() + 1
    nomealdata['y'].fillna(value=1, inplace = True)
    nomeal_data=nomealdata.pivot(index='y',columns='x', values='Sensor Glucose (mg/dL)').add_prefix('X').reset_index()
    
    
    ##Drop x and y
    meal_data.drop(['y'], axis=1, inplace=True)
    nomeal_data.drop(['y'], axis=1, inplace=True)
    
    
   ##Min max value training
    meal_data['max_value'] = meal_data.max(axis=1)
    meal_data['min_value'] = meal_data.min(axis=1)
    meal_data['std'] = meal_data.std(axis=1)
    meal_data['dev']=(meal_data['max_value']-meal_data['min_value'])/(meal_data['min_value'])
    
    nomeal_data['max_value'] = nomeal_data.max(axis=1)
    nomeal_data['min_value'] = nomeal_data.min(axis=1)
    nomeal_data['std'] = nomeal_data.std(axis=1)
    nomeal_data['dev']=(nomeal_data['max_value']-nomeal_data['min_value'])/(nomeal_data['min_value'])
    
    ##Set meal & no meal indicator
    meal_data['label']=1
    nomeal_data['label']=0
    
    #Subset final features and label
    meal=meal_data[["std","dev","label"]]
    nomeal=nomeal_data[["std","dev","label"]]
    
    ###Extract Ground Truth###
    min_value=newsubset['carbinput'].min()
    max_value=newsubset['carbinput'].max()
    ##Total Number of Bins
    k=round((max_value-min_value)/20)
    
    newsubset['groundtruth'] = pd.qcut(newsubset['carbinput'], q=k, labels=[0,1,2,3,4,5])
    binmatrix=newsubset[["carbinput","groundtruth"]]
    binmatrix=binmatrix.reset_index()
    binmatrix.index.name='x'
    
    ##K-mean 
    scaled_features=meal.copy()
    col_names=['std','dev']
    features=meal[col_names]
    scaler=StandardScaler().fit(features.values)
    features=scaler.transform(features.values)
    
    scaled_features[col_names]=features
    

    kmeans=KMeans(init="random",n_clusters=k,n_init=10,max_iter=300,random_state=42)
    kmeans.fit(scaled_features)
    
    P=kmeans.predict(scaled_features)
    
    scaled_features['predicted'] = pd.Series(P, index=scaled_features.index)
    mergedkmean = pd.merge(scaled_features, binmatrix, left_index=True, right_on='groundtruth')
    mergedkmean=mergedkmean.sort_index()
    
    ##K-mean SSE Calculation
    ksse=kmeans.inertia_
     
    #Create confusion matrix 
    y_test = mergedkmean['groundtruth'].tolist()
    y_pred = mergedkmean['predicted'].tolist()
    cm = confusion_matrix(y_test, y_pred)
    cmdf = pd.DataFrame(cm)
    
    #Calculate purity
    cmdf2=cmdf.copy()
    cmdf.loc["TotalC"] = cmdf.sum()
    cmdf.loc[:,'TotalR'] = cmdf.sum(numeric_only=True, axis=1)
    rowtotal=cmdf.loc['TotalC','TotalR']
    
    cmdf2['max_value'] = cmdf2.max(axis=1)
    columntotal=cmdf2['max_value'].sum()
    
    kpurity=round(cmdf2['max_value'].sum()/cmdf.loc['TotalC','TotalR'])
    
    #Calculate entropy kmean
    
    ck0=((cmdf.loc[0,4]/cmdf.loc[0,'TotalR'])*np.log(cmdf.loc[0,4]/cmdf.loc[0,'TotalR']))*(cmdf.loc[0,'TotalR']/rowtotal)
    ck1=((cmdf.loc[1,3]/cmdf.loc[1,'TotalR'])*np.log(cmdf.loc[1,3]/cmdf.loc[1,'TotalR']))*(cmdf.loc[1,'TotalR']/rowtotal)
    ck2=((cmdf.loc[2,0]/cmdf.loc[2,'TotalR'])*np.log(cmdf.loc[2,0]/cmdf.loc[2,'TotalR']))*(cmdf.loc[2,'TotalR']/rowtotal)
    ck3=((cmdf.loc[3,5]/cmdf.loc[3,'TotalR'])*np.log(cmdf.loc[3,5]/cmdf.loc[3,'TotalR']))*(cmdf.loc[3,'TotalR']/rowtotal)
    ck4=((cmdf.loc[4,3]/cmdf.loc[4,'TotalR'])*np.log(cmdf.loc[4,3]/cmdf.loc[4,'TotalR']))*(cmdf.loc[4,'TotalR']/rowtotal)
    ck5=((cmdf.loc[5,4]/cmdf.loc[5,'TotalR'])*np.log(cmdf.loc[5,4]/cmdf.loc[5,'TotalR']))*(cmdf.loc[5,'TotalR']/rowtotal)
    
    kmentropy=((ck0+ck1+ck2+ck3+ck4+ck5))
    
    ##DBSCAN Algorithm
    db=DBSCAN(eps=0.50,min_samples=3)
    db.fit(scaled_features)
    
    #dbpred=db.predict(scaled_features)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    
    scaled_features['predictdb'] = pd.Series(labels, index=scaled_features.index)
    mergeddbscan = pd.merge(scaled_features, binmatrix, left_index=True, right_on='groundtruth')
    
    #Create confusion matrix 
    y_test = mergeddbscan['groundtruth'].tolist()
    y_pred = mergeddbscan['predictdb'].tolist()
    cmdb = confusion_matrix(y_test, y_pred)
    cmdb = pd.DataFrame(cmdb)
    
    #Dbscansse
    mse=mean_squared_error(y_test, y_pred)
    counts= np.count_nonzero(y_test)
    dbsse=mse*counts
    
    ##Purity
    cmdb2=cmdb.copy()
    cmdb.loc["TotalC"] = cmdb.sum()
    cmdb.loc[:,'TotalR'] = cmdb.sum(numeric_only=True, axis=1)
    rowtotal=cmdb.loc['TotalC','TotalR']
    
    cmdb2['max_value'] = cmdb2.max(axis=1)
    columntotal=cmdb2['max_value'].sum()
    
    dbpurity=round(cmdb2['max_value'].sum()/cmdb.loc['TotalC','TotalR'])
    
    #Calculate entropy dbscan
    
    c0=((cmdb.loc[0,0]/cmdb.loc[0,'TotalR'])*np.log(cmdb.loc[0,0]/cmdb.loc[0,'TotalR']))*(cmdb.loc[0,'TotalR']/rowtotal)
    c1=((cmdb.loc[1,1]/cmdb.loc[1,'TotalR'])*np.log(cmdb.loc[1,1]/cmdb.loc[1,'TotalR']))*(cmdb.loc[1,'TotalR']/rowtotal)
    c2=((cmdb.loc[2,5]/cmdb.loc[2,'TotalR'])*np.log(cmdb.loc[2,5]/cmdb.loc[2,'TotalR']))*(cmdb.loc[2,'TotalR']/rowtotal)
    c3=((cmdb.loc[3,2]/cmdb.loc[3,'TotalR'])*np.log(cmdb.loc[3,2]/cmdb.loc[3,'TotalR']))*(cmdb.loc[3,'TotalR']/rowtotal)
    c4=((cmdb.loc[4,1]/cmdb.loc[4,'TotalR'])*np.log(cmdb.loc[4,1]/cmdb.loc[4,'TotalR']))*(cmdb.loc[4,'TotalR']/rowtotal)
    c5=((cmdb.loc[5,0]/cmdb.loc[5,'TotalR'])*np.log(cmdb.loc[5,0]/cmdb.loc[5,'TotalR']))*(cmdb.loc[5,'TotalR']/rowtotal)
    
    dbentropy=((c0+c1+c2+c3+c4+c5))
    
    data={'kmsse':[ksse],'dbsse':[dbsse],'kent':[kmentropy],'dbent':[dbentropy],'kmpure':[kpurity],'dbpure':[dbpurity]}

    finalresult = pd.DataFrame (data, columns = ['kmsse','dbsse','kent','dbent','kmpure','dbpure'])
    
    return(finalresult.to_csv('Result.csv',header=False, index=False))
    

In [14]:
main('Desktop/Project2/CGMData.csv','Desktop/Project2/InsulinData.csv')

  if self.run_code(code, result):
  if self.run_code(code, result):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in 

0.175787728026534