In [1]:
import os
# Move the working directory up one level — presumably so that the
# `rwd_analytics` package imported in a later cell resolves from the
# repository root (confirm).
# NOTE(review): a relative chdir is not idempotent; re-running this cell
# moves the kernel one more directory up each time.
os.chdir('../')

In [2]:
#https://pennchime.herokuapp.com/
#https://seaborn.pydata.org/examples/index.html

In [3]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import pytest

from rwd_analytics.cohort import CohortBuilder
from rwd_analytics.features_selection import FeaturesSelection, time_at_risk, get_features_scores
from rwd_analytics.lookups import Descendants, Concept, ConceptRelationship, ComorbidConditions, Ingredient
from rwd_analytics.treatment_line import last_activity_date, agg_lot_by_patient, line_generation_preprocess, LinesOfTherapy
from rwd_analytics.predictions import get_matching_pairs

In [4]:
# Hand-written fixture tables, bundled into `omop_tables` at the end of the cell.
# Every dated clinical event below shares this single timestamp.
_event_time = pd.to_datetime('2017-12-10')


def _dask_by_person(columns):
    """Wrap a column dict in a single-partition dask frame indexed by person_id."""
    return dd.from_pandas(pd.DataFrame(columns), npartitions=1).set_index('person_id')


person = _dask_by_person({
    'person_id': [1, 2, 3, 4, 5],
    'gender_concept_id': [8532, 8507, 8532, 8507, 8507],
    'year_of_birth': [1990, 2000, 2010, 1970, 1960],
})

condition_occurrence = _dask_by_person({
    'person_id': [1, 1, 1, 1, 2, 2],
    'condition_concept_id': [44831230, 2, 3, 4, 44831230, 2],
    'condition_start_datetime': [_event_time] * 6,
})

observation_period = _dask_by_person({
    'person_id': [1, 2],
    'observation_period_start_date': [
        pd.to_datetime(day) for day in ('2015-01-01', '2017-12-01')
    ],
    'observation_period_end_date': [
        pd.to_datetime(day) for day in ('2019-01-01', '2018-02-01')
    ],
})

drug_exposure = _dask_by_person({
    'person_id': [1, 1, 1, 1, 2, 2],
    'drug_concept_id': [10, 20, 30, 40, 10, 20],
    'drug_exposure_start_datetime': [_event_time] * 6,
})

visit_occurrence = _dask_by_person({
    'person_id': [1],
    'visit_start_datetime': [_event_time],
})

# The two empty tables are converted to dask without setting an index,
# unlike the populated tables above.
measurement = dd.from_pandas(
    pd.DataFrame(columns=['person_id']),
    npartitions=1,
)
procedure = dd.from_pandas(
    pd.DataFrame(columns=['person_id', 'procedure_datetime', 'procedure_concept_id']),
    npartitions=1,
)

# Lookup of table name -> dask frame.
omop_tables = {
    'person': person,
    'condition_occurrence': condition_occurrence,
    'procedure_occurrence': procedure,
    'drug_exposure': drug_exposure,
    'visit_occurrence': visit_occurrence,
    'observation_period': observation_period,
    'measurement': measurement,
}

In [None]:
class CleaningLabResults():
    """Rescale lab values that sit far below their concept's mean by powers of ten.

    The input frame must provide the columns ``measurement_concept_id``,
    ``value_as_number``, ``range_high`` and ``range_low``. Calling the
    instance returns a cleaned copy; ``self.df`` itself is not modified.
    """

    def __init__(self, df):
        self.df = df
        # Number of replacements made during the current rescaling pass;
        # incremented by test_distance_from_range.
        self.i = 0

    def test_distance_from_range(self, row, average, std):
        """Pick the original or the rescaled value for a single row.

        The rescaled candidate is chosen only when the current value is below
        1% of ``average`` and the candidate lies closer to ``average``.
        ``std`` is accepted for signature compatibility but is not used.
        Each replacement increments ``self.i``.
        """
        if row['value_as_number']/average < 0.01:
            if row['distance_from_range'] > row['new_distance_from_range']:
                self.i = self.i + 1
                return row['new_value_as_number']
        return row['value_as_number']

    def _rescale_until_stable(self, df, average, std, rescale):
        """Apply ``rescale`` to qualifying rows repeatedly until a pass changes nothing."""
        self.i = 1
        while self.i != 0:
            self.i = 0
            df['distance_from_range'] = abs(df['value_as_number']-average)
            df['new_value_as_number'] = rescale(df['value_as_number'])
            df['new_distance_from_range'] = abs(df['new_value_as_number']-average)
            df['value_as_number'] = df.apply(self.test_distance_from_range, args=(average, std), axis=1)
        return df

    def __call__(self):
        """Clean every measurement concept and return the concatenated result.

        For each concept: rows whose value is 0 are dropped, ``range_high`` and
        ``range_low`` are overwritten with their most frequent non-zero value
        (rounded to 2 decimals), and values are rescaled by factors of ten.
        Concepts left with no rows after the zero filter are skipped. If no
        concept yields any rows, an empty frame with the input columns is
        returned.

        Raises:
            ValueError: if a concept has rows but no non-zero ``range_high``
                or ``range_low`` (``idxmax`` on an empty series).
        """
        helper_columns = ['distance_from_range', 'new_value_as_number', 'new_distance_from_range']
        measurement_dfs = []
        for concept in self.df['measurement_concept_id'].unique().tolist():
            # Filter first, copy second, so the column assignments below act on
            # an independent frame rather than a slice of a copy.
            keep = (self.df['measurement_concept_id'] == concept) & (self.df['value_as_number'] != 0)
            df = self.df[keep].copy()
            if df.empty:
                continue

            high = round(df[df['range_high']!=0]['range_high'].value_counts().idxmax(), 2)
            low = round(df[df['range_low']!=0]['range_low'].value_counts().idxmax(), 2)
            # Statistics come only from rows whose stored range_high equals the
            # rounded modal value; if none match exactly, both are NaN.
            reference = df[df['range_high']==high].value_as_number
            std = reference.std()
            average = reference.mean()
            df['range_high'] = high
            df['range_low'] = low
            print(std)
            print(average)
            print(concept)
            print('*******')

            # Pass 1: multiply under-scaled values by 10 until stable.
            df = self._rescale_until_stable(df, average, std, lambda values: values*10)
            # Pass 2: divide by 10 until stable.
            # NOTE(review): this pass shares the `value/average < 0.01` guard
            # with pass 1, so over-scaled values never qualify — confirm
            # whether a guard for large values was intended.
            df = self._rescale_until_stable(df, average, std, lambda values: values/10)

            measurement_dfs.append(df)

        if not measurement_dfs:
            # pd.concat raises on an empty list; return an empty frame instead.
            return self.df.iloc[0:0].copy()

        measurement = pd.concat(measurement_dfs)
        measurement = measurement.round({'value_as_number': 1, 'range_high': 1, 'range_low': 1})
        return measurement.drop(columns=helper_columns)

In [None]:
def describe_lot(lot, cohort_enhanced, line, censoring_date, min_patients=50):
    """Summarise one line of therapy per regimen.

    Args:
        lot: frame with ``person_id``, ``regimen_name``, ``line_number``,
            ``start_date`` and ``end_date`` columns.
        cohort_enhanced: frame joined on ``person_id``; must supply
            ``last_activity_date``.
        line: value of ``line_number`` to keep.
        censoring_date: cut-off parsed with format ``%Y-%m-%d``. Rows whose
            ``end_date`` is strictly after it are excluded.
        min_patients: minimum row count a regimen needs to be reported
            (default 50).

    Returns:
        Frame indexed by ``regimen_name`` with two-level columns
        ``('person_id', 'count')``, ``('time_to_next_treatment', 'median')``
        and ``('time_to_last_activity', 'median')``, sorted by median
        ``time_to_next_treatment`` in descending order.
    """
    # Parse the cut-off once rather than once per row.
    cutoff = pd.to_datetime(censoring_date, format='%Y-%m-%d')

    lot = lot.merge(cohort_enhanced, how='left', on='person_id')
    lot['time_to_last_activity'] = (lot['last_activity_date'] - lot['start_date']).dt.days
    # Computed as end_date - start_date of the same row.
    lot['time_to_next_treatment'] = (lot['end_date'] - lot['start_date']).dt.days

    # event is 1 unless end_date is strictly after the cut-off; a missing
    # end_date compares False and therefore counts as an event.
    lot['event'] = (~(lot['end_date'] > cutoff)).astype(int)
    observed = lot[(lot['event'] == 1) & (lot['line_number'] == line)]

    summary = observed.groupby('regimen_name').agg({
        'person_id': ['count'],
        'time_to_next_treatment': ['median'],
        'time_to_last_activity': ['median']
    })
    summary = summary[summary[('person_id', 'count')] >= min_patients]
    return summary.sort_values(by=[('time_to_next_treatment', 'median')], ascending=False)

In [None]:
# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Empty results table; one summary row per tested contingency table is added below.
xi_results = pd.DataFrame({
    'YY': [],
    'YN': [],
    'NY': [],
    'NN': [],
    'stat': [],
    'p': [],
    'dof': [],
    'probability': [],
    'interpret test-statistic': [],
    'interpret p-value': []
})

# 2x2 contingency table of observed counts, laid out as [[YY, YN], [NY, NN]].
table = [[16, 197], [37847, 2286732]]
stat, p, dof, expected = chi2_contingency(table)

# Critical value of the chi-squared distribution at the chosen confidence level.
prob = 0.95
critical = chi2.ppf(prob, dof)

# interpret test-statistic
if abs(stat) >= critical:
    test_statistic = 'Dependent (reject H0)'
else:
    test_statistic = 'Independent (fail to reject H0)'

# interpret p-value
alpha = 1.0 - prob
if p <= alpha:
    p_value = 'Dependent (reject H0)'
else:
    p_value = 'Independent (fail to reject H0)'

# Single summary row for this contingency table.
xi_add_results = pd.DataFrame({
    'YY': [table[0][0]],
    'YN': [table[0][1]],
    'NY': [table[1][0]],
    'NN': [table[1][1]],
    'stat': [stat],
    'p': [p],
    'dof': [dof],
    'probability': [prob],
    'interpret test-statistic': [test_statistic],
    'interpret p-value': [p_value]
})

# DataFrame.append was removed in pandas 2.0; concatenate instead. Empty
# frames are left out so the still-empty scaffold does not influence the
# resulting column dtypes.
xi_results = pd.concat(
    [frame for frame in (xi_results, xi_add_results) if not frame.empty],
    ignore_index=True,
)

In [None]:
# Display the chi-squared summary table assembled in the previous cell.
xi_results

In [None]:
# NOTE(review): `test_comorbid_conditions` is neither defined nor imported in
# any visible cell; on a fresh kernel this call raises NameError unless the
# name is provided elsewhere — confirm its source.
test_comorbid_conditions()

In [None]:
# NOTE(review): `test_concept_info` is neither defined nor imported in any
# visible cell; on a fresh kernel this call raises NameError unless the name
# is provided elsewhere — confirm its source.
test_concept_info()

In [None]:
# Open question: what length of time should define a gap?


In [None]:
# NOTE(review): `era` is first assigned in the cell that follows this one, so
# on a fresh top-to-bottom run this display raises NameError.
era

In [None]:
# Collapse `t` to one row per (person_id, concept_id): earliest and latest
# start_date, number of records, plus an aggregate of `gap`.
# NOTE(review): `t` is not assigned in any visible cell — confirm where it
# comes from.
# NOTE(review): 'cumsum' is a running (non-reducing) operation; confirm that
# .agg() accepts it here, or whether 'sum' was intended.
era = t.groupby(['person_id', 'concept_id']).agg({
    'start_date':['min', 'max', 'count'],
    'gap':['cumsum']
})
# NOTE(review): list-valued agg specs yield two-level column labels such as
# ('start_date', 'max'), so the flat lookups era['max'] / era['min'] likely
# raise KeyError — confirm.
era['era_duration'] = (era['max'] - era['min']).dt.days
era = era.reset_index()

In [None]:
# NOTE(review): `t` is not assigned in any visible cell; this display depends
# on kernel state from elsewhere — confirm.
t

In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import math

# sudo docker-compose exec --user root  notebook bash
# pip install -U scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Five-row toy feature table, written one subject per row.
# Column order matters downstream: the last column is the label.
df = pd.DataFrame(
    [
        [18, 1, 1, 1, 0],
        [28, 0, 1, 1, 1],
        [8, 1, 1, 1, 1],
        [48, 0, 0, 0, 0],
        [58, 0, 0, 0, 0],
    ],
    columns=['age_at_index', 'gender = female', 'condition_1', 'condition_2', 'target'],
)

In [None]:
# Score the first 4 columns of `df` against its last column.
# At this point `get_features_scores` is the function imported from
# rwd_analytics.features_selection in the first import cell; a notebook-local
# function of the same name is defined in a later cell.
get_features_scores(df, 4)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the first four columns of `df`; the scaler is fit on the same
# rows it then transforms.
raw_features = df.iloc[:, 0:4]
scaler = StandardScaler().fit(raw_features)
standardized_X = scaler.transform(raw_features)

X = standardized_X
y = df.iloc[:, -1]  # label: last column of df

# The split is computed and kept in the namespace, but the model below is
# fit on the full X / y rather than on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y)

# Second predict_proba column, rounded to 4 decimals per row.
proba = pd.DataFrame({'probability': clf.predict_proba(X)[:, 1]})
proba['probability'] = proba['probability'].map(lambda value: round(value, 4))

# Cell output: the original rows side by side with their fitted probability.
pd.concat([df, proba], axis=1)

In [None]:
# Display the feature matrix; `X` was rebound to the standardized array in
# the previous cell.
X

In [None]:
# Score of `clf` on the same X / y it was fit on in the cell above, i.e. a
# training-set score rather than a held-out estimate.
clf.score(X, y)

In [None]:
import pandas as pd
import dask.dataframe as dd

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from rwd_analytics.lookups import Descendants, ComorbidConditions

def get_features_scores(df, n_features):
    """Rank the leading columns of ``df`` by chi-squared score against the last column.

    Args:
        df: frame whose first ``n_features`` columns are the candidate
            features and whose last column is the target.
        n_features: number of leading columns to score; also the number of
            rows returned.

    Returns:
        Frame with columns ``Specs`` (feature name) and ``Score`` (chi-squared
        statistic rounded to 2 decimals), ordered from highest to lowest score.
    """
    predictors = df.iloc[:, :n_features]
    target = df.iloc[:, -1]

    # k equals the number of scored columns, so every column gets a score.
    selector = SelectKBest(score_func=chi2, k=n_features).fit(predictors, target)

    ranking = pd.DataFrame({
        'Specs': predictors.columns,
        'Score': selector.scores_,
    })
    ranking['Score'] = ranking['Score'].round(2)
    return ranking.nlargest(n_features, 'Score')

In [None]:
# Same call as earlier in the notebook, now resolved to the notebook-local
# `get_features_scores` defined in the previous cell.
get_features_scores(df, 4)

In [None]:
# Fit and report every classifier in the PredictionModels panel on `df`.
# NOTE(review): `PredictionModels` is defined in the cell that follows this
# one, so on a fresh top-to-bottom run this call raises NameError.
PredictionModels(df)()

In [None]:
class PredictionModels():
    """Fit a fixed panel of scikit-learn classifiers and report their scores.

    ``df`` must contain a ``target`` column; every other column is used as a
    feature. The data is split once (75% train / 25% test, ``random_state=1``)
    at construction time.
    """

    def __init__(self, df):
        # Class sizes are printed only; no resampling is applied.
        print('Subjects in target=1: '+str(len(df[df['target']==1])))
        print('Subjects in target=0: '+str(len(df[df['target']==0])))

        X = df.drop('target', axis=1)
        y = df['target']
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.25)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def __call__(self):
        """Train each classifier, print its diagnostics and return a score table.

        Returns:
            Frame with one row per classifier and the columns ``Classifier``
            (the estimator's text representation), ``Model Score`` (score on
            the training split) and ``Accuracy Score`` (accuracy on the test
            split).
        """
        classifiers = [
            DummyClassifier(strategy='most_frequent', random_state=0),
            KNeighborsClassifier(3),
            SVC(kernel="rbf", C=0.025, probability=True),
            #NuSVC(probability=True),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            AdaBoostClassifier(),
            GradientBoostingClassifier()
            ]

        # Collect plain records and build the frame once at the end. The
        # previous `feedback.append(...)` discarded its result, so the table
        # stayed empty, and DataFrame.append no longer exists in pandas 2.x.
        records = []
        for classifier in classifiers:
            model = classifier.fit(self.X_train, self.y_train)
            y_pred = model.predict(self.X_test)
            train_score = model.score(self.X_train, self.y_train)
            test_score = accuracy_score(self.y_test, y_pred)
            records.append({
                # Store the text form: ensemble estimators are sequence-like
                # and can confuse DataFrame construction when stored as objects.
                'Classifier': str(classifier),
                'Model Score': train_score,
                'Accuracy Score': test_score
            })
            print(classifier)
            print("Training score: %.3f" % train_score)
            print("Test score: %.3f" % test_score)
            print('*** Confusion matrix ***')
            print(confusion_matrix(self.y_test, y_pred))
            print('*** Classification report ***')
            print(classification_report(self.y_test, y_pred))
            print('************************************')

        feedback = pd.DataFrame(records, columns=['Classifier', 'Model Score', 'Accuracy Score'])
        return feedback