In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from datetime import timedelta, date, datetime
from sqlalchemy.types import NVARCHAR, DATE, FLOAT, VARCHAR, DATETIME
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score, confusion_matrix, plot_roc_curve,\
    plot_confusion_matrix, recall_score, precision_score, accuracy_score

from helpers import *

# Relative path of the data folder; the CSV reads and writes further down
# target this same '../data' location.
data_dir = "../data"

# Apply seaborn's 'whitegrid' style to every plot drawn after this point.
sns.set_style('whitegrid')



## Feature extraction

In [None]:
from sqlalchemy import create_engine

import pandas as pd
import config
from sql_queries import *

# Build a SQLAlchemy engine for MySQL (pymysql driver) from the credentials
# stored in config.sql_credentials. pool_pre_ping=True makes the pool test a
# connection before handing it out, so stale connections are replaced.
engine = create_engine(
    f"mysql+pymysql://{config.sql_credentials['user']}:{config.sql_credentials['password']}@{config.sql_credentials['host']}:{config.sql_credentials['port']}/{config.sql_credentials['db']}",
    pool_pre_ping=True)

# NOTE(review): this connection is never closed in the visible cells — confirm
# whether later cells still rely on `conn` before adding conn.close().
conn = engine.connect()
#
# Disabled one-off extraction of all five tables, kept as a record of how the
# CSV files read in the next section were produced. The *_query names are
# presumably provided by `from sql_queries import *` — confirm.
#
# print(datetime.now())
# login_data = pd.read_sql(login_query, conn)
# print(datetime.now())
# dist_data = pd.read_sql(dist_query, conn)
# print(datetime.now())
# prefs_data = pd.read_sql(prefs_query, conn)
# print(datetime.now())
# training_data = pd.read_sql(training_query, conn)
# print(datetime.now())
# worked_data = pd.read_sql(worked_query, conn)
# print(datetime.now())
#
# login_data.to_csv('../data/login_data.csv', index = False)
# dist_data.to_csv('../data/dist_data.csv', index = False)
# worked_data.to_csv('../data/worked_data.csv', index = False)
# prefs_data.to_csv('../data/prefs_data_v2.csv', index = False)
# training_data.to_csv('../data/training_data_v2.csv', index = False)


# Only the preferences table is re-extracted on each run; its CSV is overwritten.
prefs_data = pd.read_sql(prefs_query, conn)
prefs_data.to_csv('../data/prefs_data_v2.csv', index = False)

## Feature engineering

In [None]:
# Load the cached CSV extracts. Paths are built from `data_dir` (set in the
# first cell) so the data location is defined in exactly one place.
login_data = pd.read_csv(os.path.join(data_dir, 'login_data.csv'))
dist_data = pd.read_csv(os.path.join(data_dir, 'dist_data.csv'))
worked_data = pd.read_csv(os.path.join(data_dir, 'worked_data.csv'))
prefs_data = pd.read_csv(os.path.join(data_dir, 'prefs_data_v2.csv'))
training_data = pd.read_csv(os.path.join(data_dir, 'training_data_v2.csv'))


In [None]:
# Composite key shared by every extract.
merge_keys = ['placement_ad_id', 'carer_id', 'sent_at']

# Start from the training rows and left-join each feature table in turn.
# The shape is printed after every step: a left join can only add rows when
# the joined table has repeated keys, so a growing row count exposes them.
print(training_data.shape)
all_data = training_data
for feature_table in (login_data, prefs_data, dist_data, worked_data):
    all_data = all_data.merge(feature_table, how='left', on=merge_keys)
    print(all_data.shape)

# Keep the first row per key, then report the final shape.
all_data = all_data.drop_duplicates(subset=merge_keys)
print(all_data.shape)

In [None]:
# Show the column names of the merged frame.
all_data.columns

In [None]:
# Number of rows in the merged frame with a missing carer_age.
all_data['carer_age'].isna().sum()

In [None]:
# Row count per value of the target column (class balance before sampling).
all_data.loc[:, 'carer_applied_in_8hrs'].value_counts()

In [None]:
target = 'carer_applied_in_8hrs'

# Keep every row where the target is 1 and a seeded (reproducible) 1% random
# sample of the rows where it is 0.
X1 = all_data[all_data[target].eq(1)]
X0 = all_data[all_data[target].eq(0)].sample(frac=0.01, random_state=1992)

# Stack both classes row-wise into the modelling sample and display it.
df = pd.concat([X1, X0])
df

In [None]:
# NOTE(review): the dummy-encoded 'sms_type' frame built here is neither
# assigned nor displayed (only a cell's last expression is shown), so `df` is
# left unchanged — confirm whether the dummies were meant to be joined onto `df`.
pd.get_dummies(df['sms_type'],drop_first=True)
# Column names of the modelling sample.
df.columns

## Feature selection

In [None]:
# Candidate predictors for feature selection. Every column is listed exactly
# once: a repeated name makes a label-based column selection return that column
# twice, which hands the model two identical (perfectly collinear) features.
# 'min_provider_rate' was previously listed twice.
# NOTE(review): if the second entry was meant to be a different column, add it here.
predictors = ['lead_time', 'min_provider_rate',
       'ongoing', 'one_off_payments_total',
       'licence_needed', 'car_needed', 'moving_handling', 'dementia',
       'mental_health_issues', 'hoist', 'parkinsons', 'stroke', 'alzheimers',
       'stoma', 'diabetes', 'peg', 'has_wifi', 'smoking', 'has_pets',
       'has_two_crs']
assert len(predictors) == len(set(predictors)), "duplicate column name in predictors"

# split_train_test comes from the project's helpers; presumably it holds out
# test_size=0.15 of `df` as the evaluation split — confirm against helpers.
X_train, X_eval, y_train, y_eval = split_train_test(df,predictors,target, test_size=0.15)

In [None]:
# Logistic regression with scikit-learn's default hyper-parameters; this same
# estimator object is passed to feature selection and fitted as the baseline.
model = LogisticRegression()


In [None]:
# Inputs for feature selection: the training predictors and the training
# target flattened to a 1-D array.
n_features = 8
X, y = X_train, np.ravel(y_train)

# Score candidates by recall; make_scorer treats higher as better by default.
recall_scorer = make_scorer(recall_score)

# select_features comes from the project's helpers; the cells below read the
# 'cv_score' and 'features' columns of the frame it returns.
results = select_features(model, n_features, y, X, scoring=recall_scorer)

In [None]:
# Rows of `results` that reach the best cross-validated score (ties included).
best_rows = results.loc[results.cv_score == results.cv_score.max()]

# Union of their feature lists, de-duplicated in first-seen order.
# dict.fromkeys keeps insertion order, so the column order is identical on
# every kernel restart (a set of strings is not, because of hash
# randomisation), and it avoids the quadratic list concatenation of sum(..., []).
f_predictors = list(dict.fromkeys(
    feature for feature_list in best_rows['features'] for feature in feature_list
))

In [None]:
# Columns for the correlation heatmap: the selected features plus the target.
heat_list = f_predictors + [target]

# NOTE(review): `plt` is not imported in any visible cell; presumably
# matplotlib.pyplot arrives through `from helpers import *` — confirm.
fig, ax = plt.subplots(1, 2, figsize=(25, 8))

# Left panel: cross-validation score per row of `results`. The x positions are
# derived from len(results) so x and y always have the same length, whatever
# number of rows select_features returned.
sns.lineplot(x=range(1, len(results) + 1), y=results['cv_score'], ax=ax[0], linewidth=3)
ax[0].set(title='Cross-validation score per selection step', xlabel='Selection step')

# Right panel: pairwise correlations of the selected features and the target.
# NOTE(review): computed on `training_data` (the raw extract), not on the
# sampled `df` the model is trained on — confirm this is intended.
sns.heatmap(training_data[heat_list].corr(), cmap='Blues', annot=True, ax=ax[1])
ax[1].set_title('Correlation of selected features and target');

## Model performance

### Baseline

In [None]:
# Fit the baseline on the training split, restricted to the selected features;
# `y` is the flattened training target built in the feature-selection cell.
model.fit(X_train[f_predictors], y)

In [None]:
# Class predictions for the evaluation split, using only the selected features.
preds = model.predict(X_eval[f_predictors])

# Compute each metric once, then report all three on a single line.
eval_precision = precision_score(y_eval, preds)
eval_recall = recall_score(y_eval, preds)
eval_accuracy = accuracy_score(y_eval, preds)
print(f'Precision is {eval_precision}, recall is {eval_recall}, accuracy is {eval_accuracy}')

On a balanced sample (around 50% each of applied/not applied), the model does a fair job of classifying positive examples
correctly, showing an accuracy of 58.4% together with a recall of 61.2% on the evaluation set (which was not used for feature selection).

In [None]:
# Confusion matrix of the fitted model on the evaluation split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement.
# Switching also requires updating the import cell at the top.
plot_confusion_matrix(model,X_eval[f_predictors],y_eval)

Feature predictive importance is in line with expectations, with bonuses possibly reflecting location.

In [None]:
# SHAP plots from the project's helper.
# NOTE(review): this passes the full `predictors` list, whereas `model` was
# fitted on `f_predictors` only — confirm the helper refits the model, or pass
# `f_predictors` instead.
plot_shap_metrics(model,df,predictors,target)

### Including soft preferences



