## Load libraries and data

In [1]:
import pandas as pd
import numpy as np
import sklearn
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import re
import enchant
import difflib as db
from scipy.sparse import csr_matrix
warnings.filterwarnings('ignore')

fd = pd.read_csv('data/fd 03132018.csv', encoding= "ISO-8859-1")

## Preliminary Cleaning

In [2]:
condition = fd[(fd['trackable_type']== "Condition") | (fd['trackable_type']=="Symptom")].copy()
condition['trackable_name'] = condition['trackable_name'].str.lower().str.strip()
condition['trackable_name'] = condition['trackable_name'].str.replace('[^A-Za-z\s]+', '')

In [4]:
condition['conditionCount'] = condition.groupby(["trackable_name"])['trackable_name'].transform("count")

In [6]:
condition['userCount'] = condition.groupby(['trackable_name'])['user_id'].transform('nunique')

In [7]:
condition.head()

Unnamed: 0,user_id,age,sex,country,checkin_date,trackable_id,trackable_type,trackable_name,trackable_value,conditionCount,userCount
0,QEVuQwEABlEzkh7fsBBjEe26RyIVcg==,,,,2015-11-26,1069,Condition,ulcerative colitis,0,3230,390
1,QEVuQwEAWRNGnuTRqXG2996KSkTIEw==,30.0,male,US,2015-11-26,1069,Condition,ulcerative colitis,0,3230,390
2,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3168,Condition,pain in left upper arm felt like i was getting...,4,1,1
3,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3169,Condition,hip pain when gettin up,3,1,1
4,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3170,Condition,pain in hand joints,4,1,1


In [15]:
condition = condition[(condition['conditionCount'] > 20) & (condition['userCount']> 20)]

Unnamed: 0,user_id,age,sex,country,checkin_date,trackable_id,trackable_type,trackable_name,trackable_value,conditionCount,userCount
0,QEVuQwEABlEzkh7fsBBjEe26RyIVcg==,,,,2015-11-26,1069,Condition,ulcerative colitis,0,3230,390
1,QEVuQwEAWRNGnuTRqXG2996KSkTIEw==,30.0,male,US,2015-11-26,1069,Condition,ulcerative colitis,0,3230,390
6,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,1356,Condition,headache,2,35746,3664
15,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,123,Symptom,joint stiffness,3,3848,606
22,QEVuQwEAHgM/igE3w0tBL14Jq1nEfw==,,,,2015-06-22,269,Condition,crohns disease,3,3125,608


## Subsetting to get outliers by counts 

Experimenting with grouping

In [None]:

difflib.get_close_matches('pain', fdSample.trackable_name.astype(str), n = 10, cutoff = .6)
for i in fd['trackable_name']:
    list = difflib.get_close_matches(i, fd.trackable_name.astype(str), n = 3, cutoff = .6)
    fd['new_col'] = fd['trackable_name'].map(lambda x: list if i in x else "")


In [68]:
fd

Unnamed: 0,user_id,age,sex,country,checkin_date,trackable_id,trackable_type,trackable_name,trackable_value,new_col
0,QEVuQwEABlEzkh7fsBBjEe26RyIVcg==,,,,2015-11-26,1069,Condition,ulcerative colitis,0,
1,QEVuQwEAWRNGnuTRqXG2996KSkTIEw==,30.0,male,US,2015-11-26,1069,Condition,ulcerative colitis,0,
2,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3168,Condition,pain in left upper arm felt like i was getting...,4,
3,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3169,Condition,hip pain when gettin up,3,
4,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3170,Condition,pain in hand joints,4,
5,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3171,Condition,numbness in right hand,2,
6,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,1356,Condition,headache,2,
7,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3172,Condition,pain in left ankle,1,
8,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3173,Condition,pain in left leg,1,
9,QEVuQwEA+WkNxtp/qkHvN2YmTBBDqg==,0.0,female,CA,2017-04-28,3174,Condition,pain in joints on feet,2,
