In [2]:
import os

# Move the working directory one level above the directory the kernel started in.
# The start directory is remembered the first time this cell runs, so re-running
# the cell is idempotent (a bare os.chdir('../') climbs one more level on every
# re-run because it is relative to wherever the previous run left the process).
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, '..'))

In [67]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pytest

from rwd_analytics.cohort import CohortBuilder
from rwd_analytics.features_selection import FeaturesSelection, time_at_risk, get_features_scores
from rwd_analytics.lookups import Descendants, ConceptInfo, ConceptRelationship, ComorbidConditions, Ingredient
from rwd_analytics.treatment_line import last_activity_date

In [68]:
# Every condition, drug exposure and visit in these fixtures happens on this day.
EVENT_DATE = pd.to_datetime('2017-12-10')


def _to_dask_by_person(frame):
    """Wrap a pandas frame as a single-partition dask frame indexed by person_id."""
    return dd.from_pandas(frame, npartitions=1).set_index('person_id')


# Five subjects; only subjects 1 and 2 have records in the other tables.
person = _to_dask_by_person(pd.DataFrame({
    'person_id': [1, 2, 3, 4, 5],
    'gender_concept_id': [8532, 8507, 8532, 8507, 8507],
    'year_of_birth': [1990, 2000, 2010, 1970, 1960],
}))

condition_occurrence = _to_dask_by_person(pd.DataFrame({
    'person_id': [1, 1, 1, 1, 2, 2],
    'condition_concept_id': [44831230, 2, 3, 4, 44831230, 2],
    'condition_start_datetime': [EVENT_DATE] * 6,
}))

observation_period = _to_dask_by_person(pd.DataFrame({
    'person_id': [1, 2],
    'observation_period_start_date': [
        pd.to_datetime(day) for day in ('2015-01-01', '2017-12-01')
    ],
    'observation_period_end_date': [
        pd.to_datetime(day) for day in ('2019-01-01', '2018-02-01')
    ],
}))

drug_exposure = _to_dask_by_person(pd.DataFrame({
    'person_id': [1, 1, 1, 1, 2, 2],
    'drug_concept_id': [10, 20, 30, 40, 10, 20],
    'drug_exposure_start_datetime': [EVENT_DATE] * 6,
}))

visit_occurrence = _to_dask_by_person(pd.DataFrame({
    'person_id': [1],
    'visit_start_datetime': [EVENT_DATE],
}))

# Measurement and procedure tables are empty placeholders (no columns, no index).
measurement = dd.from_pandas(pd.DataFrame(), npartitions=1)
procedure = dd.from_pandas(pd.DataFrame(), npartitions=1)

# Table name -> dask frame lookup handed to the rwd_analytics helpers.
omop_tables = dict(
    person=person,
    condition_occurrence=condition_occurrence,
    procedure_occurrence=procedure,
    drug_exposure=drug_exposure,
    visit_occurrence=visit_occurrence,
    observation_period=observation_period,
    measurement=measurement,
)

In [91]:


def line_generation_preprocess(cohort, ingredient_list, omop_tables):
    """Build the two inputs of the line-of-therapy generation for a cohort.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Cohort table with a ``person_id`` column; it is merged with the
        last-activity table on that column.
    ingredient_list : list
        Concept ids passed to ``Descendants()``; the drug exposures are
        restricted to whatever ids that lookup returns
        (presumably the ingredients and their descendants - confirm in
        rwd_analytics.lookups).
    omop_tables : dict
        Table name -> dask DataFrame. Only ``omop_tables['drug_exposure']``
        is read directly here (it must be indexed by ``person_id``); the whole
        dict is also forwarded to ``last_activity_date``.

    Returns
    -------
    tuple
        ``(drug_temp, cohort_enhanced)``, both single-partition dask
        DataFrames indexed by ``person_id``.
    """
    subjects = cohort.person_id.unique().tolist()
    descendants = Descendants()
    ingredients = Ingredient()

    # Filter the table that was passed in, not the notebook-level `drug_exposure`
    # variable: the function must give the same answer for any omop_tables dict.
    drug_exposure_table = omop_tables['drug_exposure']
    drug_temp = drug_exposure_table.loc[drug_exposure_table.index.isin(subjects)]
    drug_temp = drug_temp[drug_temp['drug_concept_id'].isin(descendants(ingredient_list))]

    # Ingredient() is applied to an in-memory pandas frame with person_id as a column.
    drug_temp = drug_temp.compute().reset_index()
    drug_temp = ingredients(drug_temp)
    drug_temp = dd.from_pandas(drug_temp, npartitions=1).set_index('person_id')

    last_activity = last_activity_date(cohort, omop_tables)
    cohort_enhanced = pd.merge(cohort, last_activity, how='left', on='person_id')
    cohort_enhanced = dd.from_pandas(cohort_enhanced, npartitions=1).set_index('person_id')
    return drug_temp, cohort_enhanced

In [92]:
# NOTE(review): `cohort` and `ingredient_list` are not assigned in any cell visible
# above this one - confirm they exist on a fresh Restart & Run All.
drug_temp, cohort_enhanced = line_generation_preprocess(cohort, ingredient_list, omop_tables)

In [96]:
def false_after_lot(x):
    """Close a line of therapy at the first exposure that falls outside it.

    ``x`` holds the exposures of one group with a ``time_from_start`` column
    and a boolean ``is_in_line`` column. The rows are returned sorted by
    ``time_from_start``; from the first row flagged ``False`` onwards, every
    following row is flagged ``False`` as well. When no row is ``False`` the
    sorted frame is returned unchanged.
    """
    ordered = x.sort_values(by=['time_from_start'])
    flags = ordered['is_in_line']
    # idxmin on booleans gives the label of the first False (or of the first
    # row when everything is True).
    cutoff = flags.idxmin()
    if flags[cutoff]:
        return ordered
    ordered.loc[cutoff:, 'is_in_line'] = False
    return ordered


def is_in_line(drug_code, regimen_codes):
    """
    Tell whether ``drug_code`` belongs to the line described by ``regimen_codes``.

    A drug is in line when it, or one of its interchangeable drugs, is part
    of the regimen. Interchangeable groups:

    fluorouracil = 955632  <->  capecitabine = 1337620
    leucovorin = 1388796   <->  levoleucovorin = 40168303

    Leucovorin and levoleucovorin are always considered in line, even when
    neither is part of the regimen.
    """
    fluorouracil_group = [955632, 1337620]
    leucovorin_group = [1388796, 40168303]
    interchangeable = {
        code: group
        for group in (fluorouracil_group, leucovorin_group)
        for code in group
    }

    # A drug outside the two groups can only stand for itself.
    candidates = interchangeable.get(drug_code, [drug_code])
    if any(candidate in regimen_codes for candidate in candidates):
        return True

    # Addition of leucovorin or levoleucovorin does not advance the lot
    return drug_code in leucovorin_group


class LinesOfTherapy():
    """Split each subject's drug exposures into successive lines of therapy.

    Parameters
    ----------
    drug_temp : dask DataFrame
        Drug exposures indexed by ``person_id`` with ``drug_concept_id`` and
        ``drug_exposure_start_datetime`` columns.
    cohort : dask DataFrame
        Cohort table, merged on ``person_id``; must provide ``cohort_start_date``.
    offset : int
        Exposures starting more than ``offset`` days before
        ``cohort_start_date`` are discarded.
    nb_of_lines : int
        Number of lines of therapy extracted by ``__call__`` (at least 1).

    Calling the instance returns a dask DataFrame with the columns
    ``person_id``, ``start_date``, ``regimen_codes`` (as text) and
    ``line_number``, de-duplicated.
    """

    def __init__(self, drug_temp, cohort, offset=14, nb_of_lines=3):
        if nb_of_lines < 1:
            raise ValueError('nb_of_lines must be at least 1')
        self.drug_temp = drug_temp
        self.index_date = cohort
        # Kept on the instance: __call__ must not rely on a notebook-level
        # variable that happens to be called nb_of_lines.
        self.nb_of_lines = nb_of_lines
        self.lines = self.__get_drugs(self.drug_temp, self.index_date, offset)
        self.lines = self.lines.persist()
        self.lines['line_number'] = 0

    def __get_drugs(self, df, index, offset):
        """Keep the exposures starting no earlier than ``offset`` days before the cohort start."""
        df = dd.merge(df, index, how='left', on='person_id')
        df = df[(df['drug_exposure_start_datetime'] - df['cohort_start_date']).dt.days >= -offset]
        return df[['drug_concept_id', 'drug_exposure_start_datetime']]

    def __get_lines(self, df, line_number):
        """Flag, within one partition, which exposures belong to line ``line_number``.

        For every person the line starts at the earliest remaining exposure.
        The regimen is the set of drugs seen in the first 28 days, possibly
        extended (see ``add_paclitaxel_gemcitabine``). Each exposure is then
        flagged with ``is_in_line``, and everything after the first
        out-of-line exposure is flagged out of line as well.
        """
        def add_paclitaxel_gemcitabine(x):
            """
            Adding Paclitaxel (1378382) or Gemcitabine (1314924)
            does not change the line of therapy

            ``x`` is the pair (28-day regimen codes, 90-day regimen codes).
            """
            regimen_28, regimen_90 = x[0], x[1]
            if (1378382 in regimen_28) or (1314924 in regimen_28):
                # Drugs seen within 90 days that are not part of the 28-day regimen.
                g = list(set(regimen_90).symmetric_difference(set(regimen_28)))
                if 1378382 in g:
                    return np.append(regimen_28, 1378382)
                elif 1314924 in g:
                    return np.append(regimen_28, 1314924)
            return regimen_28

        df = df.reset_index()
        start_line = df.groupby(['person_id'])['drug_exposure_start_datetime'].min().to_frame('start_date')
        df = df.merge(start_line, how='left', on='person_id')
        df['time_from_start'] = (df['drug_exposure_start_datetime'] - df['start_date']).dt.days

        # Distinct drugs seen in the first 28 and the first 90 days of the line.
        for window in (28, 90):
            regimen_codes = (
                df[df['time_from_start'] <= window]
                .groupby('person_id').drug_concept_id.unique()
                .to_frame('regimen_codes_%d' % window)
            )
            df = df.merge(regimen_codes, how='left', on='person_id')

        df['tmp'] = list(zip(df['regimen_codes_28'], df['regimen_codes_90']))
        df['regimen_codes'] = df['tmp'].map(add_paclitaxel_gemcitabine)
        df = df.sort_values(by=['person_id', 'time_from_start'])
        df['tmp'] = list(zip(df['drug_concept_id'], df['regimen_codes']))
        df['is_in_line'] = df['tmp'].map(lambda x: is_in_line(x[0], x[1]))
        del df['tmp']
        df = df.groupby('person_id').apply(false_after_lot).reset_index(drop=True)
        df['line_number'] = line_number
        df['line_number'] = df['line_number'].astype(int)
        return df

    def __call__(self):
        """Extract lines 1..``nb_of_lines`` and return them as one de-duplicated frame."""
        dfs = []
        lines = self.lines

        for line_number in range(1, self.nb_of_lines + 1):
            df = lines.map_partitions(self.__get_lines, line_number)
            # Exposures in the current line are reported...
            temp = df[df['is_in_line'] == True][['person_id', 'start_date', 'regimen_codes', 'line_number']]
            # ...the remaining ones are the input of the next line.
            lines = df[df['is_in_line'] == False][['person_id', 'drug_concept_id', 'drug_exposure_start_datetime']]
            dfs.append(temp)

        lines_f = dd.concat(dfs)
        # regimen_codes holds arrays; they are turned into text before de-duplication
        # (presumably because arrays cannot be hashed by drop_duplicates).
        lines_f['regimen_codes'] = lines_f['regimen_codes'].astype(str)
        return lines_f.drop_duplicates()


def listToString(s):
    """Join the strings of ``s`` into a single string separated by ', '."""
    return ", ".join(s)


class LineName():
    """Turn the textual ``regimen_codes`` of each line into a readable regimen name.

    Parameters
    ----------
    lot_f : pandas.DataFrame
        Lines of therapy with ``person_id``, ``line_number`` and a textual
        ``regimen_codes`` column such as ``'[955632 1337620]'``.
        The frame is copied, never modified.
    concept_infos : pandas.DataFrame
        Lookup with ``concept_id`` and ``concept_name`` columns. Ids missing
        from the lookup are kept as they are.

    Calling the instance returns the lines sorted by ``person_id`` and
    ``line_number``, where ``regimen_codes`` is replaced by ``regimen_name``:
    the concept names in alphabetical order, joined with ', '.
    Concept names containing a comma are split at the comma.
    """

    def __init__(self, lot_f, concept_infos):
        # First name wins when a concept id is listed more than once.
        names = {}
        for concept_id, concept_name in zip(concept_infos['concept_id'], concept_infos['concept_name']):
            names.setdefault(str(concept_id), concept_name)

        def translate(codes):
            # Whole-token lookup: replacing ids as substrings would corrupt any
            # id that contains another one (10 inside 100, for instance).
            tokens = codes.replace('[', ' ').replace(']', ' ').replace(',', ' ').split()
            return ','.join(names.get(token, token) for token in tokens)

        lot = lot_f.copy()
        lot['regimen_codes'] = lot['regimen_codes'].map(translate)
        self.lot = lot

    def __call__(self):
        # Work on a fresh frame so that the instance can be called more than once.
        lot = self.lot.reset_index(drop=True)
        lot['regimen_name'] = lot['regimen_codes'].map(
            lambda joined: ', '.join(sorted(
                part.strip() for part in joined.split(',') if part.strip() != '')))
        del lot['regimen_codes']
        return lot.sort_values(by=['person_id', 'line_number'])
    

In [100]:
# NOTE(review): test_line_generation is not defined in any cell visible in this
# notebook - confirm where it comes from before relying on Restart & Run All.
test_line_generation()

In [None]:
# chi-squared test with similar proportions
# NOTE: a later cell rebinds the name `chi2` to sklearn.feature_selection.chi2;
# re-run the imports below before re-running this cell out of order.
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Empty result table: one row is added per tested contingency table.
xi_results = pd.DataFrame({
    'YY': [],
    'YN': [],
    'NY': [],
    'NN': [],
    'stat': [],
    'p': [],
    'dof': [],
    'probability': [],
    'interpret test-statistic': [],
    'interpret p-value': []
})

# 2x2 contingency table, rows = [YY, YN] and [NY, NN].
table = [[16, 197], [37847, 2286732]]
stat, p, dof, expected = chi2_contingency(table)
prob = 0.95
critical = chi2.ppf(prob, dof)

# interpret test-statistic
if abs(stat) >= critical:
    test_statistic = 'Dependent (reject H0)'
else:
    test_statistic = 'Independent (fail to reject H0)'

# interpret p-value
alpha = 1.0 - prob
if p <= alpha:
    p_value = 'Dependent (reject H0)'
else:
    p_value = 'Independent (fail to reject H0)'

# contingency table
xi_add_results = pd.DataFrame({
    'YY': [table[0][0]],
    'YN': [table[0][1]],
    'NY': [table[1][0]],
    'NN': [table[1][1]],
    'stat': [stat],
    'p': [p],
    'dof': [dof],
    'probability':[prob],
    'interpret test-statistic': [test_statistic],
    'interpret p-value': [p_value]
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Empty frames are left out so that they do not influence the column dtypes.
xi_results = pd.concat(
    [frame for frame in (xi_results, xi_add_results) if not frame.empty],
    ignore_index=True,
)

In [None]:
# Display the chi-squared summary table built in the previous cell.
xi_results

In [None]:
# NOTE(review): test_comorbid_conditions is not defined in any cell visible in
# this notebook - confirm where it comes from.
test_comorbid_conditions()

In [None]:
# NOTE(review): test_concept_info is not defined in any cell visible in this
# notebook - confirm where it comes from.
test_concept_info()

In [None]:
# Open question: what time threshold should be used to define a gap?


In [None]:
# NOTE(review): in the visible source `era` is first assigned in the next cell,
# so on a fresh kernel this display cell raises NameError; move it below that cell.
era

In [None]:
# Summarise `t` per (person_id, concept_id): first/last start_date and record count.
# NOTE(review): `t` is not assigned in any cell visible here; it must provide the
# person_id, concept_id, start_date and gap columns used below.
# NOTE(review): 'cumsum' is a running total, not a reduction - confirm that agg
# accepts it here, or replace it with the intended reducer.
era = t.groupby(['person_id', 'concept_id']).agg({
    'start_date':['min', 'max', 'count'],
    'gap':['cumsum']
})
# A dict-of-lists aggregation yields two-level column labels, so the first and
# last dates are addressed as ('start_date', 'min') and ('start_date', 'max');
# era['min'] / era['max'] would raise KeyError.
era['era_duration'] = (era[('start_date', 'max')] - era[('start_date', 'min')]).dt.days
era = era.reset_index()

In [None]:
# NOTE(review): `t` is not assigned in any cell visible here - confirm where it is built.
t

In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import math

# sudo docker-compose exec --user root  notebook bash
# pip install -U scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Toy feature table (five rows): four feature columns followed by a binary target.
df = pd.DataFrame(
    data=[
        [18, 1, 1, 1, 0],
        [28, 0, 1, 1, 1],
        [8, 1, 1, 1, 1],
        [48, 0, 0, 0, 0],
        [58, 0, 0, 0, 0],
    ],
    columns=['age_at_index', 'gender = female', 'condition_1', 'condition_2', 'target'],
)

In [None]:
# At this point get_features_scores is the function imported from
# rwd_analytics.features_selection; a same-named function is defined in a later cell.
get_features_scores(df, 4)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardise the first four columns of df (the features); the last column is the target.
scaler = StandardScaler().fit(df.iloc[:, 0:4])
standardized_X = scaler.transform(df.iloc[:, 0:4])
X = standardized_X
y = df.iloc[:, -1]

# The split is computed, but the model below is fitted and evaluated on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X, y)

# Second column of predict_proba = probability of the second class, rounded to 4 decimals.
proba = pd.DataFrame({'probability': clf.predict_proba(X)[:, 1]}).round(4)

# Show the original table next to the predicted probability.
pd.concat([df, proba], axis=1)

In [None]:
# Standardized feature matrix assigned in the previous cell.
X

In [None]:
# Score of clf on X, y - the same data it was fitted on above, i.e. a training score.
clf.score(X, y)

In [None]:
import pandas as pd
import dask.dataframe as dd

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from rwd_analytics.lookups import Descendants, ComorbidConditions

def get_features_scores(df, n_features):
    """Rank the first ``n_features`` columns of ``df`` by their chi-squared score.

    The first ``n_features`` columns are the features and the last column of
    ``df`` is the target. Returns a DataFrame with the columns ``Specs``
    (feature name) and ``Score`` (chi-squared score rounded to 2 decimals),
    holding the ``n_features`` highest scores in decreasing order.
    """
    features = df.iloc[:, 0:n_features]
    target = df.iloc[:, -1]

    # k equals the number of features, so every feature receives a score.
    selector = SelectKBest(score_func=chi2, k=n_features).fit(features, target)

    feature_scores = pd.DataFrame({
        'Specs': features.columns,
        'Score': selector.scores_,
    })
    feature_scores['Score'] = feature_scores['Score'].round(2)
    return feature_scores.nlargest(n_features, 'Score')

In [None]:
# Uses the get_features_scores defined in the cell above, which replaces the
# function imported from rwd_analytics.features_selection.
get_features_scores(df, 4)

In [None]:
# NOTE(review): in the visible source PredictionModels is defined in the next
# cell, so on a fresh kernel this cell raises NameError; run it after that cell.
PredictionModels(df)()

In [None]:
class PredictionModels():
    """Fit a fixed panel of scikit-learn classifiers and report their scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with a binary ``target`` column; every other column is
        used as a feature. 25% of the rows are held out as the test set
        (``random_state=1``).

    Calling the instance fits each classifier on the training set, prints its
    scores, confusion matrix and classification report, and returns a
    DataFrame with one row per classifier and the columns ``Classifier``
    (class name), ``Model Score`` (score on the training set) and
    ``Accuracy Score`` (accuracy on the test set).
    """

    def __init__(self, df):
        positives = df[df['target']==1]
        negatives = df[df['target']==0]
        print('Subjects in target=1: '+str(len(positives)))
        print('Subjects in target=0: '+str(len(negatives)))

        X = df.drop('target', axis=1)
        y = df['target']
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.25)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        # TODO: class imbalance is reported above but not corrected; resampling
        # of the training set is not implemented.

    def __call__(self):
        classifiers = [
            DummyClassifier(strategy='most_frequent', random_state=0),
            KNeighborsClassifier(3),
            SVC(kernel="rbf", C=0.025, probability=True),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            AdaBoostClassifier(),
            GradientBoostingClassifier()
            ]

        # One record per classifier; the frame is built once at the end.
        # (The result of DataFrame.append was previously discarded, so the
        # returned table was always empty.)
        records = []
        for classifier in classifiers:
            model = classifier.fit(self.X_train, self.y_train)
            y_pred = model.predict(self.X_test)
            train_score = model.score(self.X_train, self.y_train)
            test_score = accuracy_score(self.y_test, y_pred)
            records.append({
                'Classifier': type(classifier).__name__,
                'Model Score': train_score,
                'Accuracy Score': test_score
            })
            print(classifier)
            print("Training score: %.3f" % train_score)
            print("Test score: %.3f" % test_score)
            print('*** Confusion matrix ***')
            print(confusion_matrix(self.y_test, y_pred))
            print('*** Classification report ***')
            print(classification_report(self.y_test, y_pred))
            print('************************************')

        return pd.DataFrame(records, columns=['Classifier', 'Model Score', 'Accuracy Score'])