<a href="https://colab.research.google.com/github/cinnData/UMDataWeek-2023/blob/main/Notebooks/deposit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [UM-01] Term deposits

### Importing the data

In [None]:
import numpy as np, pandas as pd

In [None]:
# Root folder of the course data on GitHub; file names are appended to it
path = 'https://raw.githubusercontent.com/cinnData/UMDataWeek-2023/main/Data/'
# The first column of the CSV file is used as the row index
df = pd.read_csv(f'{path}deposit.csv', index_col=0)

### Exploring the data

In [None]:
# Summary of the data frame: columns, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Preview of the first rows
df.head()

In [None]:
# Mean of the deposit column, rounded to three decimals
np.round(df['deposit'].mean(), 3)

### Target vector and feature matrix

In [None]:
# Target vector: the deposit column
y = df.loc[:, 'deposit']
y.shape

In [None]:
# Feature matrix: every column except the last one
X = df.iloc[:, 0:len(df.columns) - 1]
X.shape

### Q1. Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
# max_iter is set explicitly to 1500 iterations
# NOTE(review): presumably raised so that the solver converges on this data set - confirm
clf = LogisticRegression(max_iter=1500)

In [None]:
# Train on the complete data set; fit() returns the fitted estimator itself
clf = clf.fit(X, y)

In [None]:
# Predicted class for every row of the feature matrix
y_pred = clf.predict(X)

In [None]:
# Accuracy of the classifier on the data it was trained on, to three decimals.
# The built-in round() is used instead of the .round() method because score()
# may return a plain Python float, which has no .round() method; round() works
# for both Python and NumPy floats.
round(clf.score(X, y), 3)

In [None]:
# Confusion matrix: actual values in the rows, predicted values in the columns
conf = pd.crosstab(index=y, columns=y_pred)
conf

In [None]:
# Accuracy: proportion of rows where the prediction matches the actual value
acc = np.round((y == y_pred).mean(), 3)
acc

In [None]:
# Precision: mean of the actual values among the rows predicted as 1
prec = np.round(y[y_pred == 1].mean(), 3)
prec

In [None]:
# Recall: mean of the predicted values among the rows whose actual value is 1
rec = np.round(y_pred[y == 1].mean(), 3)
rec

### Q2a. Predictive scores

In [None]:
# Store the second column of the predicted probabilities as a new column of df
# NOTE(review): presumably this is the probability of deposit == 1 - confirm against clf.classes_
df['score'] = clf.predict_proba(X)[:, 1]

In [None]:
# Actual outcome next to the predictive score
df.loc[:, ['deposit', 'score']]

In [None]:
# Average predictive score, rounded to three decimals
np.round(df['score'].mean(), 3)

### Q2b. Distribution of the scores

In [None]:
from matplotlib import pyplot as plt
# One figure with two side-by-side panels, one per actual outcome
fig, (ax_yes, ax_no) = plt.subplots(1, 2, figsize=(12, 5))
# Left panel: scores of the rows with y == 1
ax_yes.hist(df.loc[y == 1, 'score'], color='gray', edgecolor='white')
ax_yes.set_title('Figure a. Scores (subscribers)')
ax_yes.set_xlabel('Subscription score')
# Right panel: scores of the rows with y == 0
ax_no.hist(df.loc[y == 0, 'score'], color='gray', edgecolor='white')
ax_no.set_title('Figure b. Scores (non-subscribers)')
ax_no.set_xlabel('Subscription score');

### Q3. Set a threshold

In [None]:
# Predict a subscription whenever the score is above the 0.11 threshold
y_pred = df['score'].gt(0.11)
# Confusion matrix: actual values in the rows, thresholded predictions in the columns
conf = pd.crosstab(index=y, columns=y_pred)
conf

In [None]:
# Accuracy, precision and recall of the thresholded predictions, three decimals each
acc, prec, rec = (
    (y == y_pred).mean().round(3),   # proportion of matching rows
    y[y_pred == 1].mean().round(3),  # mean actual value among predicted positives
    y_pred[y == 1].mean().round(3),  # mean prediction among actual positives
)
acc, prec, rec

### Q4. Target of 4,000 subscriptions

In [None]:
# Reorder df in place so that the highest scores come first
df.sort_values(by='score', ascending=False, inplace=True)
df.loc[:, ['deposit', 'score']]

In [None]:
# Running total of the deposit column, following the current row order of df
df['cum_subscription'] = df['deposit'].cumsum()
df.loc[:, ['deposit', 'score', 'cum_subscription']]

In [None]:
# Rows whose running total is still below 4000, plus the one row that reaches it
int((df['cum_subscription'] < 4000).sum()) + 1


### Q5. Budget of 10,000 calls

In [None]:
# Index labels of the first 10,000 rows of df in its current order
df.head(10000).index

In [None]:
# Cumulative subscriptions after exactly 10,000 calls. Positions are 0-based,
# so the 10,000th row sits at position 9,999; position 10,000 would already be
# the 10,001st call. The column is selected by name rather than by position so
# the result does not depend on cum_subscription being the last column.
df['cum_subscription'].iloc[10000 - 1]

### Q6. Validation assuming budget of 20%

In [None]:
# Remove the helper columns before the validation step.
# errors='ignore' keeps the cell re-runnable: without it, running the cell a
# second time raises a KeyError because the columns are already gone.
df = df.drop(columns=['score', 'cum_subscription'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split
# Half of the rows go to training and half to testing. The split is random, so
# a fixed random_state is passed to get the same partition, and therefore the
# same validation figures, on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [None]:
# Refit on the training half (fit() returns the estimator) and keep the second
# column of the predicted probabilities for the test half
y_score = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]

In [None]:
# Actual outcome and score of the test rows, highest scores first
df_test = (
    pd.DataFrame({'deposit': y_test, 'score': y_score}, index=X_test.index)
    .sort_values(by='score', ascending=False)
)
df_test

In [None]:
# One fifth of the test rows, rounded down
N = len(y_test) // 5
N

In [None]:
# Sum of the deposit column over the N top-scored test rows
df_test['deposit'].iloc[:N].sum()