# Term Deposit Marketing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    roc_curve, auc, precision_recall_curve, average_precision_score
)
import matplotlib.pyplot as plt

## Data Exploration

In [None]:
# Load the raw campaign data from the Kaggle input directory and preview
# the first rows.
csv_path = r'/kaggle/input/term-deposit-marketing-2020/term-deposit-marketing-2020.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Class balance of the target: absolute counts side by side with percentages.
target = data['y']
counts = target.value_counts()
pcts = target.value_counts(normalize=True).mul(100)
balance_table = pd.concat([counts, pcts], axis=1)
print(balance_table)

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics of the DataFrame's columns (pandas `describe` defaults).
data.describe()

In [None]:
# Print the DataFrame summary produced by pandas `info` (columns and dtypes).
data.info()

In [None]:
# Partition the columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical. Column order is preserved.
categorical_cols = []
numerical_cols = []
for col in data.columns:
    if data[col].dtype == 'object':
        categorical_cols.append(col)
    else:
        numerical_cols.append(col)

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

In [None]:
# Encode the target as an integer label: 'no' -> 0, 'yes' -> 1.
# NOTE(review): this cell is not re-runnable; mapping the already-encoded
# column again would presumably leave only missing values.
target_encoding = {'no': 0, 'yes': 1}
data['y'] = data['y'].map(target_encoding)

In [None]:
# List the distinct values (and how many there are) of every column that was
# classified as categorical.
print('unique values for categorical features are:')
for col_name in categorical_cols:
    distinct = data[col_name].unique()
    n_distinct = len(distinct)
    print(f'{col_name}: {distinct} -- {n_distinct} values\n')

In [None]:
# Pairwise correlations between the numerical features and the encoded
# target, drawn as an annotated heatmap on a fixed [-1, 1] colour scale.
heatmap_cols = [*numerical_cols, 'y']
corr_matrix = data[heatmap_cols].corr()

heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    linewidths=0.5,
)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_style)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Histograms of the numeric features.
#
# The target is excluded by name rather than by slicing off the last list
# entry. `numerical_cols` is built while 'y' is still an object column, so it
# does not contain the target; the positional slice `[:-1]` therefore dropped
# a genuine feature instead, and would behave differently again if the cells
# were executed in another order.
hist_cols = [col for col in numerical_cols if col != 'y']
for col in hist_cols:
    label = col.capitalize()
    plt.figure()
    data[col].hist()
    plt.title(f'Distribution of {label}')
    plt.xlabel(label)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Plot the conversion rate (mean of the 0/1 target) for each category of every
# categorical feature, plus the day of the month.
#
# The target is removed from the feature list by name. Slicing with `[:-1]`
# only works while 'y' happens to be the last entry of `categorical_cols`,
# which depends on the order in which the notebook cells were run.
rate_cols = [col for col in categorical_cols if col != 'y'] + ['day']
for col in rate_cols:
    label = col.capitalize()
    # Sorted ascending so the best-converting categories end up on the right.
    conversion_rates = data.groupby(col)['y'].mean().sort_values()
    plt.figure()
    conversion_rates.plot(kind='bar')
    plt.title(f'Conversion Rate by {label}')
    plt.xlabel(label)
    plt.ylabel('Conversion Rate')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.grid(color='grey', linestyle='--', linewidth=0.35)
    plt.show()

#### From the initial exploration we can make a few observations:
1. Distribution of **campaign**, **duration**, and **balance** is highly skewed
2. The data is not balanced, not many users have converted
3. Conversion is highly correlated with the duration of the call. This is natural, as longer calls could indicate users who are more seriously considering the offer. However, the duration is only known once the call has been made, so it won't be of much help as a guide for whom to contact.
4. Conversion rate is higher for students and retired users, which makes sense: these are the two groups for whom a term deposit account is most likely to be beneficial and financially sensible.
5. Conversion is higher for tertiary education, while secondary education doesn't make much difference. In general, conversion goes up with higher education.
6. Having a loan or mortgage makes opting for a deposit account less likely, which could be attributed to less financial freedom and a greater need for liquidity while paying off loans.
7. Conversions are significantly higher for October and March (Why?)
8. Conversion is higher when the last call is made on the first or last day of the month. The distribution of call days also shows multiple peaks: the first 5 days, the 15th to the 20th, and the last days of the month.

## Feature Engineering & First Model

**Feature Engineering:**
* For binary variables we convert them to 0 and 1
* For month, we convert to month number, 1-12
* For education we map to numbers while keeping unknown as -1
* job, contact, and marital status will be one hot encoded
* Numerical columns like balance and duration are standard scaled

**Models:**
* We test logistic regression, random forest, and xgboost
* We use cross validation and test the final model on a hold out set to get final score

In [None]:
# Separate the target from the features and carve out a stratified 20%
# hold-out set; the fixed seed keeps the split reproducible.
y = data['y']
X = data.drop(columns=['y'])

split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_hold, y_train, y_hold = train_test_split(X, y, **split_options)

In [None]:
# Feature encoding for the first modelling round. The same sequence of steps
# is applied to the training and hold-out frames independently; afterwards the
# hold-out columns are aligned to the training columns and the numeric
# features are standardised with statistics fitted on the training split only.

# Month abbreviation -> month number.
month_map = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12,
}

# Raw job label -> coarse occupational group.
# NOTE(review): labels missing from this mapping turn into NaN and end up with
# no job_group dummy set; verify the raw spelling of each job (for example
# whether the data uses 'admin' or 'admin.').
job_map = {
    'management': 'white_collar',
    'admin': 'white_collar',
    'entrepreneur': 'white_collar',
    'self-employed': 'white_collar',
    'technician': 'white_collar',
    'blue-collar': 'blue_collar',
    'services': 'blue_collar',
    'housemaid': 'blue_collar',
    'retired': 'retired',
    'unemployed': 'unemployed',
    'student': 'student',
    'unknown': 'unknown',
}


def _contact_bucket(value):
    """Collapse the contact channel into 'cellular' versus everything else."""
    if value == 'cellular':
        return 'cellular'
    return 'non_cellular'


def _encode_features(frame):
    """Return an encoded version of *frame* without touching the input.

    Steps, in order: one-hot encode the yes/no flags, map the month to its
    number, one-hot encode the grouped job, one-hot encode the cellular /
    non-cellular contact bucket, and one-hot encode marital status and
    education. The raw 'job' and 'contact' columns are left in place here.
    """
    encoded = pd.get_dummies(frame, columns=['default', 'housing', 'loan'])
    encoded['month'] = encoded['month'].map(month_map)

    encoded['job_group'] = encoded['job'].map(job_map)
    encoded = pd.get_dummies(encoded, columns=['job_group'])

    encoded['contact_type'] = encoded['contact'].apply(_contact_bucket)
    encoded = pd.get_dummies(encoded, columns=['contact_type'])

    return pd.get_dummies(encoded, columns=['marital', 'education'])


X_train = _encode_features(X_train)
X_hold = _encode_features(X_hold)

# Give the hold-out frame exactly the training columns: dummies that only
# exist in training are added as 0, hold-out-only dummies are discarded.
X_train, X_hold = X_train.align(X_hold, join='left', axis=1, fill_value=0)

# Standardise the numeric columns using the training split's statistics.
numerical_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'month']
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_hold[numerical_cols] = scaler.transform(X_hold[numerical_cols])

# Whatever is still object/category typed at this point (the raw columns that
# were replaced by engineered ones) is removed from both frames.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
X_train = X_train.drop(columns=categorical_cols)
X_hold = X_hold.drop(columns=categorical_cols)

In [None]:
# Preview the first rows of the training feature matrix.
X_train.head()

In [None]:
# Baseline classifiers, each with a fixed seed for reproducibility.
# NOTE(review): `use_label_encoder` may be deprecated or ignored by the
# installed xgboost release — confirm before relying on it.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(
        n_estimators=100,
        max_depth=3,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    ),
}

# Fit every model on the training split, print its hold-out metrics and add
# its ROC curve to a shared figure.
roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
for name, model in models.items():
    model.fit(X_train, y_train)

    # Hard labels for the confusion matrix / report, class-1 probabilities
    # for the ROC curve.
    y_pred = model.predict(X_hold)
    y_prob = model.predict_proba(X_hold)[:, 1]

    print(f"\n=== {name} ===")
    print("Confusion Matrix:")
    print(confusion_matrix(y_hold, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_hold, y_pred, digits=3))
    print(f"Accuracy: {accuracy_score(y_hold, y_pred):.3f}")

    fpr, tpr, _ = roc_curve(y_hold, y_prob)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

# Chance diagonal plus axis decoration for the ROC figure.
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Hold-out ROC Curves')
roc_ax.legend(loc='lower right')
roc_fig.tight_layout()
plt.show()

# Precision-recall curves of the already fitted models on the hold-out set.
pr_fig, pr_ax = plt.subplots(figsize=(6, 4))
for name, model in models.items():
    y_prob = model.predict_proba(X_hold)[:, 1]
    precision, recall, _ = precision_recall_curve(y_hold, y_prob)
    ap = average_precision_score(y_hold, y_prob)
    pr_ax.plot(recall, precision, lw=2, label=f'{name} (AP = {ap:.3f})')

pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curves')
pr_ax.legend(loc='lower left')
pr_fig.tight_layout()
plt.show()

In [None]:
# 1. Stratified K-fold CV setup.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'recall', 'roc_auc']

# Everything fold-related below is derived from `cv`, so changing `n_splits`
# above cannot desynchronise the table index, the plot's x-axis or the titles
# (the previous hard-coded 1..5 range would have raised on a length mismatch).
n_folds = cv.get_n_splits()
fold_numbers = range(1, n_folds + 1)
fold_labels = [f'Fold {k}' for k in fold_numbers]

# 2. Run CV for each model on the training split and report/plot the results.
for name, model in models.items():
    scores = cross_validate(
        model, X_train, y_train,
        cv=cv, scoring=scoring, return_train_score=False,
    )

    # One row per fold, one column per metric (validation-fold scores only).
    cv_df = pd.DataFrame({
        'accuracy': scores['test_accuracy'],
        'recall':   scores['test_recall'],
        'roc_auc':  scores['test_roc_auc']
    }, index=fold_labels)

    # Per-fold table followed by the mean over folds.
    print(f'\n{name} {n_folds}-Fold CV Results:')
    display(cv_df)  # `display` is provided by the notebook environment
    print('Mean scores:')
    display(cv_df.mean().to_frame(name='mean').T)

    # Per-fold scores as a line per metric, with a dashed line at each mean.
    plt.figure(figsize=(6, 4))
    markers = ['o', 's', '^']
    colors = ['C0', 'C1', 'C2']
    for metric, marker, color in zip(cv_df.columns, markers, colors):
        plt.plot(fold_numbers, cv_df[metric], marker=marker, color=color, label=metric)
        plt.hlines(cv_df[metric].mean(), 1, n_folds, colors=color, linestyles='--')
    plt.xticks(fold_numbers, cv_df.index)
    plt.ylim(0, 1)
    plt.title(f'{n_folds}-Fold CV Scores for {name}')
    plt.xlabel('Fold')
    plt.ylabel('Score')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()

In [None]:
# Class counts in the training split and the negative-to-positive ratio used
# to up-weight the minority (positive) class.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
ratio = neg / pos

# Class-weighted variants of the tree models: the random forest weights each
# class by the size of the other class, XGBoost scales positives by `ratio`.
models_weighted = {
    'RandomForest_weighted': RandomForestClassifier(
        class_weight={0: pos, 1: neg},
        n_estimators=100,
        random_state=42,
    ),
    'XGBoost_weighted': XGBClassifier(
        scale_pos_weight=ratio,
        n_estimators=100,
        max_depth=3,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    ),
}

# Stratified 5-fold CV with the same metrics as the unweighted run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'recall', 'roc_auc']

# Summary column -> key of the per-fold score array returned by cross_validate.
summary_sources = {
    'cv_accuracy_mean': 'test_accuracy',
    'cv_recall_mean': 'test_recall',
    'cv_auc_mean': 'test_roc_auc',
}

# Mean of every metric over the folds, per model.
cv_results_weighted = {}
for name, model in models_weighted.items():
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)
    cv_results_weighted[name] = {
        column: scores[source].mean()
        for column, source in summary_sources.items()
    }

# One row per model, one column per averaged metric.
results_weighted_df = pd.DataFrame(cv_results_weighted).T
print(results_weighted_df)

In [None]:
# 1. Negative-to-positive ratio in the training split, used as XGBoost's
#    positive-class weight.
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
ratio = neg / pos

# 2. Two XGBoost variants that differ only in `scale_pos_weight`; the shared
#    settings live in one place so the comparison cannot drift apart.
#    NOTE: this rebinds `models`, replacing the dict of baseline models.
xgb_common = dict(
    n_estimators=100, max_depth=3,
    use_label_encoder=False, eval_metric='logloss',
    random_state=42,
)
models = {
    'XGB_unweighted': XGBClassifier(**xgb_common),
    'XGB_weighted': XGBClassifier(scale_pos_weight=ratio, **xgb_common),
}

# 3. Stratified CV on the training split: accuracy, recall, ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'recall', 'roc_auc']

# Fold labels and the banner follow `cv`, so changing `n_splits` does not
# break the DataFrame construction (a hard-coded range(5) index would raise
# on a length mismatch).
n_folds = cv.get_n_splits()
fold_labels = [f'Fold {k}' for k in range(1, n_folds + 1)]

print(f"=== {n_folds}-FOLD CROSS-VALIDATION ===")
for name, model in models.items():
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)
    df_cv = pd.DataFrame({
        'accuracy': scores['test_accuracy'],
        'recall':   scores['test_recall'],
        'roc_auc':  scores['test_roc_auc']
    }, index=fold_labels)

    print(f"\n{name}")
    display(df_cv)  # `display` is provided by the notebook environment
    print("Mean scores:")
    display(df_cv.mean().to_frame(name='mean').T)

# 4. Hold-out evaluation: fit on the full training split, report metrics and
#    keep the curve data for the plots below.
print("\n=== HOLD-OUT EVALUATION ===")
roc_curves = {}
pr_curves = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_hold)
    y_prob = model.predict_proba(X_hold)[:, 1]

    print(f"\n{name}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_hold, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_hold, y_pred, digits=3))
    print(f"Accuracy: {accuracy_score(y_hold, y_pred):.3f} | ROC AUC: {roc_auc_score(y_hold, y_prob):.3f}")

    # Store curve points for plotting.
    fpr, tpr, _ = roc_curve(y_hold, y_prob)
    roc_curves[name] = (fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_hold, y_prob)
    pr_curves[name] = (precision, recall, average_precision_score(y_hold, y_prob))

# 5. Plot ROC curves (dashed diagonal = chance level).
plt.figure(figsize=(6, 5))
for name, (fpr, tpr) in roc_curves.items():
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC={auc(fpr,tpr):.3f})')
plt.plot([0,1],[0,1],'--',color='gray')
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

# 6. Plot Precision-Recall curves.
plt.figure(figsize=(6, 5))
for name, (precision, recall, ap) in pr_curves.items():
    plt.plot(recall, precision, lw=2, label=f'{name} (AP={ap:.3f})')
plt.title('Precision-Recall Curves')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()

In [None]:


# Second encoding variant, applied to `data` itself before splitting:
# binary flags -> 0/1, month -> month number, education -> ordinal code
# (unknown = -1), and job / contact / marital one-hot encoded with the first
# level dropped.
# NOTE: this cell mutates `data` in place and is meant to be run once; the
# mappings below expect the raw string values.
for col in ['default', 'housing', 'loan']:
    data[col] = data[col].map({'yes': 1, 'no': 0})

month_map = {'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}
data['month'] = data['month'].map(month_map)

edu_map = {'primary': 0, 'secondary': 1, 'tertiary': 2, 'unknown': -1}
data['education'] = data['education'].map(edu_map)

data = pd.get_dummies(data, columns=['job', 'contact', 'marital'], drop_first=True)

X = data.drop(columns=['y'])
y = data['y']

X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Work on explicit copies: the frames are written to column-wise below, and
# assigning into the frames returned by the split may otherwise trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_hold = X_hold.copy()

# Columns to standardise are spelled out here rather than inherited from an
# earlier cell: `numerical_cols` is defined twice above with different
# contents (only one of them includes 'month'), so relying on the leftover
# value made the result depend on which cell ran last.
numerical_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'month']

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_hold[numerical_cols] = scaler.transform(X_hold[numerical_cols])

# Models for this round (rebinds `models`); evaluation happens in the next
# cell.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=50, n_jobs=1, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=50, max_depth=3, n_jobs=1,
                                      use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [None]:
# Cross-validate each model on the training split, then refit it on the whole
# training split and score it on the hold-out set. Results are gathered into
# one table and the hold-out ROC curves are drawn on a shared figure.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'roc_auc']

cv_results, hold_results, roc_curves = {}, {}, {}

for name, model in models.items():
    # Mean accuracy / ROC AUC over the validation folds.
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)
    cv_results[name] = dict(
        cv_accuracy_mean=scores['test_accuracy'].mean(),
        cv_auc_mean=scores['test_roc_auc'].mean(),
    )

    # Refit on the full training split and evaluate on the hold-out set.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_hold)
    y_prob = model.predict_proba(X_hold)[:, 1]
    hold_results[name] = dict(
        hold_accuracy=accuracy_score(y_hold, y_pred),
        hold_auc=roc_auc_score(y_hold, y_prob),
    )

    # Keep the ROC curve points for the plot below.
    fpr, tpr, _ = roc_curve(y_hold, y_prob)
    roc_curves[name] = (fpr, tpr)

# One row per model: CV means next to the hold-out scores.
cv_table = pd.DataFrame(cv_results).T
hold_table = pd.DataFrame(hold_results).T
results_df = cv_table.join(hold_table)
print(results_df)

# Hold-out ROC curves, labelled with the hold-out AUC; dashed diagonal marks
# chance level.
fig, ax = plt.subplots(figsize=(6, 4))
for name, (fpr, tpr) in roc_curves.items():
    auc_val = hold_results[name]['hold_auc']
    ax.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {auc_val:.3f})')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Hold-out ROC Curves')
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()

1. print scores for CV
2. precision-recall curve
3. different encoding
4. feature importance