In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# load data
#data = pd.read_csv("/Users/brettpowers/Desktop/hf_new_small.csv")
data = pd.read_csv("/Users/rachelkim/Documents/CU/compassMachineLearning/Example Data/hf_new_small.csv")
data.drop("Unnamed: 0", axis = 1, inplace = True)
data.head()

Unnamed: 0,person_id,label,age,sex,diag_med
0,5537565608205382086,0,12,female,"4132855,444131,4066147,4289526,2414397,439060,..."
1,205040565946650389,0,69,female,"19079322,40229093,963359,1593744,35603432,4024..."
2,-7994356425511450986,0,54,male,"2414397,19079498,19075034,902741,379019,210198..."
3,2839027767049717315,0,87,male,"19025145,46287618,2211583,317898,2001537,24143..."
4,1464437645105651076,0,80,female,"435524,255848,442752,4070750,40231925,46287338..."


In [3]:
# split into training and test
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Split into inputs and labels 
train_icmed = train_set["diag_med"]
train_label = train_set["label"]

test_icmed = test_set["diag_med"]
test_label = test_set["label"]

# Tokenizing icmed codes (tokenizer above)
vocab_size = 10

tokenize = Tokenizer(
            num_words=vocab_size,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=",",
            char_level=False,
        )

tokenize.fit_on_texts(train_icmed)
index = tokenize.index_word

x_train = tokenize.texts_to_matrix(train_icmed)
x_test = tokenize.texts_to_matrix(test_icmed)

# Encoding with Sklearn
encoder = preprocessing.LabelBinarizer()
encoder.fit(train_label)
y_train = np.squeeze(encoder.transform(train_label))
y_test = np.squeeze(encoder.transform(test_label))


In [4]:
index

{1: '2108115',
 2: '2414397',
 3: '2414398',
 4: '35605482',
 5: '40220357',
 6: '320128',
 7: '4214956',
 8: '2514435',
 9: '19070869',
 10: '2414392',
 11: '4132855',
 12: '40162515',
 13: '2514436',
 14: '35603428',
 15: '4036803',
 16: '40231925',
 17: '2514437',
 18: '2314215',
 19: '19135374',
 20: '2213418',
 21: '254761',
 22: '40244026',
 23: '40232756',
 24: '432867',
 25: '2414396',
 26: '200219',
 27: '2314205',
 28: '77670',
 29: '2314216',
 30: '19005965',
 31: '2414393',
 32: '257011',
 33: '436096',
 34: '46287424',
 35: '42902754',
 36: '442077',
 37: '378253',
 38: '2211361',
 39: '2212946',
 40: '194133',
 41: '2212945',
 42: '46272450',
 43: '25297',
 44: '1127433',
 45: '4167217',
 46: '312437',
 47: '35603431',
 48: '46287338',
 49: '40240688',
 50: '2314213',
 51: '4144111',
 52: '46287618',
 53: '42707627',
 54: '196523',
 55: '433316',
 56: '437663',
 57: '27674',
 58: '19020053',
 59: '2212937',
 60: '40180078',
 61: '433736',
 62: '440383',
 63: '19020131',
 

In [5]:
pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
23995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23996,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
23997,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
23998,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Fit Random Forest model
forest_clf = RandomForestClassifier(n_estimators = 10, max_depth = 3, n_jobs=-1, random_state=0, verbose=1)
forest_clf.fit(x_train, y_train)
y_pred = forest_clf.predict(x_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished


In [7]:
importances = pd.DataFrame.from_dict(index, orient = 'index', columns = ['Code'])
importances = pd.concat([pd.DataFrame({'Code':'NA'}, index = [0]), importances])
importances = importances.head(vocab_size)
importances['Importance'] = forest_clf.feature_importances_
importances

Unnamed: 0,Code,Importance
0,,0.0
1,2108115.0,0.043583
2,2414397.0,0.043923
3,2414398.0,0.095587
4,35605482.0,0.024515
5,40220357.0,0.020756
6,320128.0,0.63887
7,4214956.0,0.053081
8,2514435.0,0.029554
9,19070869.0,0.05013


In [8]:
importances_sorted = importances.sort_values('Importance', ascending = False)
importances_sorted

Unnamed: 0,Code,Importance
6,320128.0,0.63887
3,2414398.0,0.095587
7,4214956.0,0.053081
9,19070869.0,0.05013
2,2414397.0,0.043923
1,2108115.0,0.043583
8,2514435.0,0.029554
4,35605482.0,0.024515
5,40220357.0,0.020756
0,,0.0
