In [15]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras import backend as K
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16124556624970100641
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 12216014709939542296
physical_device_desc: "device: XLA_CPU device"
]


In [16]:
train = pd.read_csv("https://raw.githubusercontent.com/benvictoria17/MachineLearning/master/dataset/Health%20Insurance%20Cross%20Sell%20Prediction%20%F0%9F%8F%A0%20%F0%9F%8F%A5/train.csv")
submit = pd.read_csv("https://raw.githubusercontent.com/benvictoria17/MachineLearning/master/dataset/Health%20Insurance%20Cross%20Sell%20Prediction%20%F0%9F%8F%A0%20%F0%9F%8F%A5/sample_submission.csv")

In [18]:
print('# of missing values in train: %s' % train.isnull().sum().sum())
print('# of missing values in submit: %s' % submit.isnull().sum().sum())
print('# of duplicate values in train: %s' % (train.id.unique().shape[0] - train.id.shape[0]))
print('# of duplicate values in submit: %s' % (submit.id.unique().shape[0] - submit.id.shape[0]))

# of missing values in train: 0
# of missing values in submit: 0
# of duplicate values in train: 0
# of duplicate values in submit: 0


In [19]:
def print_mean_response(dimension):
    response = train.groupby(dimension)[['Response']].mean()
    print(response)
    return

def plot_mean_response(dimension):
    response = train.groupby(dimension)[['Response']].mean()
    plt.figure()
    plt.plot(response, 'o')
    return

def plot_distribution(dimension):
    plt.figure()
    neg = train[train.Response==0]
    pos = train[train.Response==1]
    
    sns.distplot(neg[dimension], hist=False, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label='%s when Response=0' % dimension)
    
    sns.distplot(pos[dimension], hist=False, kde=True, 
             bins=int(180/5), color = 'red', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4}, label='%s when Response=1' % dimension)
    
def plot_hist(dimension):
    
    fig, ax = plt.subplots(1,2)
    fig.tight_layout()
                    
    neg = train[train.Response == 0][dimension].to_frame()
    pos = train[train.Response == 1][dimension].to_frame()

    pos_percentages = (pos.groupby(dimension).size() / pos.groupby(dimension).size().sum()) * 100
    pos_percentages = pos_percentages.reset_index(name='frequencies')
   
    
    neg_percentages = (neg.groupby(dimension).size() / neg.groupby(dimension).size().sum()) * 100
    neg_percentages = neg_percentages.reset_index(name='frequencies')
    
    neg_plot = sns.barplot(x=neg_percentages[dimension], y=neg_percentages['frequencies'], ax=ax[0])
    pos_plot = sns.barplot(x=pos_percentages[dimension], y=pos_percentages['frequencies'], ax=ax[1])
    
    ax[0].set(xlabel='%s (Response = 0)' % dimension, ylabel='Frequencies (%)')
    ax[1].set(xlabel='%s (Response = 1)' % dimension, ylabel='Frequencies (%)')
  
    fig.show()

    return
    
    
# plot_distribution('Age')
# plot_distribution('Vintage')
# plot_distribution('Annual_Premium')

# plot_hist('Gender')
# plot_hist('Driving_License')
# plot_hist('Previously_Insured')
# plot_hist('Vehicle_Age')
# plot_hist('Vehicle_Damage')


print(train.groupby('Response')['id'].count() / train.shape[0])

Response
0    0.877437
1    0.122563
Name: id, dtype: float64


In [20]:
def preprocess(X):
    df_dropped = X.drop(['Vintage', 'Annual_Premium'], axis=1)
 
    X = pd.get_dummies(df_dropped, 
                         columns=['Gender', 'Driving_License', 
                                  'Region_Code', 'Previously_Insured',
                                  'Vehicle_Age', 'Vehicle_Damage', 
                                  'Policy_Sales_Channel'])
    return X
    
# construct X_train




X_raw = train.iloc[:, 1:11]
train_rows = train.shape[0]
y = train[['Response']]
X_submit_raw = submit.iloc[:, 1:11]

preprocessed = preprocess(pd.concat([X_raw, X_submit_raw]))

X = preprocessed.iloc[0:(train_rows), :]
X_submit = preprocessed.iloc[train_rows:,: ]


X_train, X_dev_test, y_train, y_dev_test = train_test_split(X, y, test_size=0.05)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test, test_size=0.5)


print('Train size: %s' % X_train.shape[0])
print('Dev size: %s' % X_dev.shape[0])
print('Test size: %s' % X_test.shape[0])

Train size: 362053
Dev size: 9528
Test size: 9528


In [23]:
print('Evaluate model on test set')
model.evaluate(X_test, y_test, verbose=1)

y_test_pred = model.predict(X_test)
y_test_pred = np.greater(y_test_pred, 0.5).astype(int)

val_f1 = f1_score(y_test, y_test_pred)
val_recall = recall_score(y_test, y_test_pred)         
val_precision = precision_score(y_test, y_test_pred)

print('F1 %s recall %s precision %s' % (str(val_f1), str(val_recall), str(val_precision)))

Evaluate model on test set
F1 0.0 recall 0.0 precision 0.0


  y_test_pred = np.greater(y_test_pred, 0.5).astype(int)
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

predictions = model.predict(X_submit)
predictions = np.greater(predictions, 0.5).astype(int)

pred_df = pd.DataFrame()
pred_df['prediction'] = predictions.ravel()
ids = submit[['id']]
result = pd.concat([ids, pred_df], axis=1)
print(result.groupby('prediction').count())