# Import and clean data
- fill unknown cells with the value from the most similar row
- ID is added for every row
- remove rows that still have unknown cells

In [None]:
import pandas as pd

# Load the bank direct-marketing campaigns CSV from a remote URL into `df`
# (needs network access); the cells below clean and encode this frame.
df = pd.read_csv('https://ali.dasmeh.ir/bank-direct-marketing-campaigns.csv')

In [None]:
# for combining conditions
from functools import reduce
import operator

# Mark 'unknown' cells as missing (None) so dropna()/isna() can detect them.
# The dict form maps to None on every pandas version; the positional form
# `replace('unknown', None)` was interpreted as a pad-fill before pandas 1.4.
df = df.replace({'unknown': None})

# Encode the target as integers. The result is assigned back to the column
# instead of calling `df['y'].replace(..., inplace=True)`: in-place mutation
# through a chained selection warns in pandas 2.x and silently does nothing
# under copy-on-write. The 1/0 entries keep the cell safe to re-run, and
# astype(int) fails loudly if an unexpected label is present.
y_encoding = {'yes': 1, 'no': 0, 1: 1, 0: 0}
df['y'] = df['y'].map(y_encoding).astype(int)

# Keep rows that have at most MAX_MISSING_PER_ROW missing cells; rows with
# more missing cells than that are dropped.
# NOTE(review): the original comment said "more than 2", but a threshold of
# `ncols - 3` keeps rows with up to 3 missing cells. Behaviour is unchanged
# here - confirm which limit was intended.
MAX_MISSING_PER_ROW = 3
# .copy() makes the filtered frame independent, so adding the `id` column
# below does not raise SettingWithCopyWarning.
df = df.dropna(thresh=df.shape[1] - MAX_MISSING_PER_ROW).copy()

# Assign a 1-based sequential id to every remaining row.
df['id'] = range(1, len(df) + 1)

def create_combined_condition(conditions_arr):
  """Combine a sequence of conditions into one with element-wise AND.

  Args:
    conditions_arr: non-empty list of operands supporting ``&`` (for example
      boolean pandas Series that share an index, or plain bools).

  Returns:
    ``conditions_arr[0] & conditions_arr[1] & ...``. A single-element list
    returns that element unchanged (previously this raised IndexError).

  Raises:
    IndexError: if ``conditions_arr`` is empty.
  """
  # Seeding the fold with the first element lets one-element lists work and
  # keeps IndexError as the failure mode for an empty list.
  return reduce(operator.and_, conditions_arr[1:], conditions_arr[0])

def search_for_the_most_similar_row(current_row, key_with_none_value):
  """Find a donor value for a missing cell from the most similar row of ``df``.

  One condition is built per entry of ``current_row``: equality with the
  row's value, or "is not missing" where the row itself has no value. All
  conditions are tried together first; while nothing matches, the condition
  for the right-most remaining column is dropped and the search repeated.
  The loosest search still uses the first four conditions.

  Args:
    current_row: Series for the row being repaired (column name -> value).
    key_with_none_value: name of the column whose missing value is wanted.

  Returns:
    The value of ``key_with_none_value`` in the first matching row of the
    module-level ``df``, or ``False`` when no row matches.
  """
  # A donor must always have a real value in the target column. This is kept
  # outside the relaxable list: otherwise popping conditions could discard
  # the target column's own "not missing" check and a missing value would be
  # returned as if it were a match.
  donor_has_value = df[key_with_none_value].notna()

  conditions = []
  for key, value in current_row.items():
    # pd.isna covers both None and NaN, whichever the column dtype uses.
    if pd.isna(value):
      conditions.append(df[key].notna())
    else:
      conditions.append(df[key] == value)

  similar_rows = df[create_combined_condition(conditions) & donor_has_value]
  conditions.pop()

  # Relax the match one column at a time, from the last column backwards.
  while similar_rows.shape[0] == 0 and len(conditions) > 3:
    similar_rows = df[create_combined_condition(conditions) & donor_has_value]
    conditions.pop()

  if similar_rows.shape[0] == 0:
    return False

  return similar_rows.iloc[0][key_with_none_value]

# Snapshot of the rows that contain at least one missing cell.
rows_with_missing = df[df.isna().any(axis=1)]
print("Number of rows with None value : " , rows_with_missing.shape)

# ids of rows for which at least one missing cell could not be filled.
must_be_removed = []
for index, row in rows_with_missing.iterrows():
  for key, value in row.items():
    # pd.isna covers both None and NaN, whichever the column dtype uses.
    if pd.isna(value):
      new_value = search_for_the_most_similar_row(row, key)
      # `False` is the "no similar row" sentinel; an identity check keeps a
      # legitimate donor value such as 0 from being mistaken for it.
      if new_value is False:
        must_be_removed.append(row['id'])
      else:
        df.at[index, key] = new_value

# Remove all unfillable rows in a single pass.
df = df[~df['id'].isin(must_be_removed)]

# Drop duplicated records. The result has to be assigned back (the bare call
# returns a new frame and changes nothing), and `id` has to be left out of
# the comparison because it is unique per row and would hide every duplicate.
# NOTE(review): this now really removes duplicate rows, so the training set
# shrinks compared with the previous no-op call.
df = df.drop_duplicates(subset=[column for column in df.columns if column != 'id'])

# Safety net: drop any row that still contains a missing cell after imputation.
df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df.reset_index().index + 1


Number of rows with None value :  (10678, 21)


# Convert categories' names to numbers
The category-to-number mappings below are specific to this dataset and must be modified for any new dataset.

In [None]:
# converting strings to numbers
def categories_to_number(df):
  """Return an encoded deep copy of ``df``; the input frame is not modified.

  Each categorical column listed below is replaced by its numeric or boolean
  code via ``Series.map``, so a value that is absent from a column's mapping
  becomes NaN. The ``poutcome`` and ``pdays`` columns are removed from the
  result; a KeyError is raised if any expected column is missing.
  """
  yes_no = {"yes": True, "no": False}
  encodings = {
    "day_of_week": {"mon": 1, "tue": 2, "wed": 3, "thu": 4, "fri": 5},
    "month": {
      "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
      "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
    },
    "contact": {"cellular": 1, "telephone": 2},
    "loan": yes_no,
    "housing": yes_no,
    "education": {
      "basic.4y": 1, "basic.6y": 2, "basic.9y": 3, "high.school": 4,
      "illiterate": 5, "professional.course": 6, "university.degree": 7,
    },
    "marital": {"married": 1, "single": 2, "divorced": 3},
    "job": {
      "admin.": 1, "blue-collar": 2, "entrepreneur": 3, "housemaid": 4,
      "management": 5, "retired": 6, "self-employed": 7, "services": 8,
      "student": 9, "technician": 10, "unemployed": 11,
    },
    "default": yes_no,
  }

  encoded = df.copy(deep=True)
  for column, encoding in encodings.items():
    encoded[column] = encoded[column].map(encoding)

  # These two columns are not used as model features.
  return encoded.drop(columns=["poutcome", "pdays"])

# Encode the cleaned frame; `modified_df` is what the model-training cell reads.
modified_df = categories_to_number(df)

# Model Training

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


# Predictors: every column except the label `y` and the synthetic row `id`.
x = modified_df.drop(columns='y', errors='ignore')
features = x.drop(columns='id', errors='ignore')

# Label column.
target = modified_df['y']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  features,
  target,
  test_size=0.2,
  random_state=42,
)

# Fit a decision tree on the training split (seeded for reproducibility).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report the evaluation metrics.
print(
  f"Accuracy: {accuracy}",
  "\nClassification Report:",
  classification_rep,
  sep="\n",
)

Accuracy: 0.8335807968324672

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      7165
           1       0.30      0.35      0.32       917

    accuracy                           0.83      8082
   macro avg       0.61      0.62      0.61      8082
weighted avg       0.85      0.83      0.84      8082



In [None]:
import joblib

# Persist the fitted classifier to disk; the prediction cell reloads this same file.
joblib.dump(model, 'dss_bank_marketing_trained_model.sav')


['dss_bank_marketing_trained_model.sav']

# Prediction

In [None]:
# Two hand-written customer records in the raw (un-encoded) schema, without
# the `y` and `id` columns. Expected labels, in order: no, yes.
sample_customers = [
  {
    "age": 56,
    "job": "housemaid",
    "marital": "married",
    "education": "basic.4y",
    "default": "no",
    "housing": "no",
    "loan": "no",
    "contact": "telephone",
    "month": "may",
    "day_of_week": "mon",
    "campaign": 1,
    "pdays": 999,
    "previous": 0,
    "poutcome": "nonexistent",
    "emp.var.rate": 1.1,
    "cons.price.idx": 93.994,
    "cons.conf.idx": -36.4,
    "euribor3m": 4.857,
    "nr.employed": 5191,
  },
  {
    "age": 44,
    "job": "technician",
    "marital": "married",
    "education": "professional.course",
    "default": "no",
    "housing": "no",
    "loan": "no",
    "contact": "cellular",
    "month": "nov",
    "day_of_week": "fri",
    "campaign": 1,
    "pdays": 999,
    "previous": 0,
    "poutcome": "nonexistent",
    "emp.var.rate": -1.1,
    "cons.price.idx": 94.76700000000001,
    "cons.conf.idx": -50.8,
    "euribor3m": 1.028,
    "nr.employed": 4963.6,
  },
]
new_predict = pd.DataFrame(sample_customers)

# Apply the same category encoding that was used on the training data.
r = categories_to_number(new_predict)

# Predict with the model reloaded from disk rather than the in-memory `model`,
# which checks that the saved file is usable on its own.
# joblib.load unpickles its input, so only load files you created yourself.
saved_model = joblib.load('dss_bank_marketing_trained_model.sav')
y_pred = saved_model.predict(r)
y_pred

array([0, 1])