In [None]:
import pandas as pd
from ast import literal_eval
%run utils.ipynb
import xgboost as xgb

In [None]:
# Read the full dataset from the CSV in the working directory; later cells use
# its 'text' and 'hypertension' columns.
df = pd.read_csv(filepath_or_buffer='data_full.csv')

In [None]:
# Preview the first five rows as a sanity check on the load.
df.head(n=5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV

# def get_vectorized_data(text_array, dim_reduction=True):
#     tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.95)
#     X = tfidf_vectorizer.fit_transform(text_array) #features
#     if dim_reduction:
#         return reduce_vector_dimension(X)
#     else:
#         return X.toarray()
    
# def reduce_vector_dimension(X):
#     lsa = TruncatedSVD(n_components=100, n_iter=10, random_state=3)
#     X = lsa.fit_transform(X)
#     return X

# TF-IDF feature extractor shared by the grid search and the final model.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,            # lower document-frequency cut-off
    max_df=0.95,         # upper document-frequency cut-off
)

In [None]:
# Binary target: "Yes" and "Maybe" both count as the positive class.
label_to_target = {"Yes": 1, "Maybe": 1, "No": 0}
y_mapped = df['hypertension'].map(label_to_target)

# Series.map turns every label that is not a key of the dict (typos, different
# casing, missing values) into NaN. Fail loudly here instead of silently handing
# NaN targets to the grid search and the final model.
unmapped = y_mapped.isna()
if unmapped.any():
    bad_labels = df.loc[unmapped, 'hypertension'].unique().tolist()
    raise ValueError(
        f"Unexpected 'hypertension' labels (expected one of "
        f"{sorted(label_to_target)}): {bad_labels}"
    )
y = y_mapped.values

# Sparse TF-IDF feature matrix; this also fits the vectorizer on the whole corpus.
X = vectorizer.fit_transform(df['text'])

In [None]:
# --- Candidate values for the XGBoost grid search ---------------------------
# Single-element lists pin a parameter; the others are searched exhaustively:
# 3 (n_estimators) * 3 (max_depth) * 4 (reg_lambda) * 4 (reg_alpha)
# * 3 (min_child_weight) = 432 combinations, each scored with 5-fold CV.

# Tree ensemble size and shape.
n_estimators = [100, 300, 1000]   # number of trees
max_depth = [3, 5, 6]
min_child_weight = [4, 8, 16]

# Regularization on the weights.
reg_lambda = [0, 0.5, 1, 2]       # L2 term
reg_alpha = [0, 0.5, 1, 2]        # L1 term

# Values held fixed during the search.
learning_rate = [0.05]            # the so-called `eta`
objective = ['binary:logistic']
subsample = [0.8]
colsample_bytree = [0.7]
random_state = [3]

clf = xgb.XGBClassifier()

params = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'reg_lambda': reg_lambda,
    'reg_alpha': reg_alpha,
    'objective': objective,
    'min_child_weight': min_child_weight,
    'subsample': subsample,
    'colsample_bytree': colsample_bytree,
    'random_state': random_state,
}

# NOTE(review): X was produced by fitting the TF-IDF vectorizer on the whole
# dataset, so its vocabulary/IDF weights have already seen every validation
# fold. Wrapping vectorizer + classifier in a Pipeline would give a
# leakage-free CV score.
gridsearch = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so despite its name `xgb_best_model`
# is the fitted GridSearchCV; the cells below read best_score_ / best_params_
# from it.
gridsearch.fit(X, y)
xgb_best_model = gridsearch

In [None]:
# Best mean cross-validated score found by the grid search (scoring is 'roc_auc'
# in the search configured above).
xgb_best_model.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
xgb_best_model.best_params_

Best parameters when using full text data (scoring = accuracy)
{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_weight': 8,
 'n_estimators': 300,
 'objective': 'binary:logistic',
 'random_state': 3,
 'reg_alpha': 2,
 'reg_lambda': 0,
 'subsample': 0.8}
 score = 0.7994599152272002
 
 
 Best parameters when using partial text data (scoring = roc_auc)
 {'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_weight': 4,
 'n_estimators': 100,
 'objective': 'binary:logistic',
 'random_state': 3,
 'reg_alpha': 0.5,
 'reg_lambda': 0,
 'subsample': 0.8}
 score = 0.8719259788504541
 
 Best parameters when using full text data (scoring = roc_auc)
 {'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 5,
 'min_child_weight': 8,
 'n_estimators': 300,
 'objective': 'binary:logistic',
 'random_state': 3,
 'reg_alpha': 1,
 'reg_lambda': 0,
 'subsample': 0.8}
 score = 0.9579638001758906

In [None]:
# Final classifier configuration: the best combination recorded above for the
# full-text run scored with roc_auc, hard-coded so this cell does not require
# re-running the grid search.
hypertension_xgb_params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_child_weight': 8,
    'n_estimators': 300,
    'objective': 'binary:logistic',
    'random_state': 3,
    'reg_alpha': 1,
    'reg_lambda': 0,
    'subsample': 0.8,
}
aModel = xgb.XGBClassifier(**hypertension_xgb_params)

In [None]:
# Train the final model on the complete feature matrix (no held-out split here).
aModel.fit(X=X, y=y)

In [None]:
import pickle

# Save the trained booster under the file name the reload cell expects.
# NOTE(review): recent XGBoost releases recommend a .json/.ubj file name;
# switching would also require updating the load_model call — confirm against
# the installed version.
aModel.save_model("xgb_hypertension.model")

# The model was trained on TF-IDF features, so it is only usable together with
# the fitted vectorizer (same vocabulary and IDF weights). Persist it alongside
# the model; without it, new text cannot be converted into matching feature
# columns after a kernel restart.
# Only unpickle this file from a trusted location.
with open("xgb_hypertension_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,shuffle=True,random_state=3,stratify = y,)
# aModel.fit(X_train,y_train)


In [None]:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
# y_pred = aModel.predict(X_test)
# f1_score(y_test, y_pred, average='macro')

In [None]:
import xgboost as xgb

In [None]:
# Reload the saved model as a low-level Booster configured with 2 threads.
# NOTE(review): a Booster predicts on xgb.DMatrix inputs rather than on the raw
# feature matrix — confirm before using it for inference.
booster_params = {'nthread': 2}
xgb_model = xgb.Booster(params=booster_params)
xgb_model.load_model("xgb_hypertension.model")