<a href="https://colab.research.google.com/github/aksharat/MLProblems/blob/main/Binary/CustomerOutcomePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


### Instructions:

- Fill in the methods of the DataModeler class to produce the same printed results
  as in the comments labeled `Expected Output` in the second half of the file.
- The DataModeler should predict the 'outcome' from the columns 'amount' and 'transaction date.'
  Your model should ignore the 'customer_id' column.
- For the modeling methods `fit`, `predict` and `model_summary` you can use any appropriate method.
  Try to get 100% accuracy on both training and test, as indicated in the output.
- Please feel free to import any popular libraries of choice for your solution!
- Your solution will be judged on both correctness and code quality.
- Good luck, and have fun!



In [11]:
# required in python 3.7 or so to consider annotations as strings
from __future__ import annotations  # From 3.11, it isn't required as it is considered by default

In [126]:
#import the standard libraries
import pandas as pd
import numpy as np
from matplotlib import dates as mdates
from sklearn.impute import SimpleImputer
# import the library
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pickle

In [236]:
class DataModeler:
  def __init__(self, sample_df: pd.DataFrame):
        '''
        Initialize the DataModeler as necessary.
        '''
        # ** Your code here **
        # keep a copy of the sample_df i.e the training data. This would be used in case of no external data available
        self.train_df = sample_df.copy()
        # self.train_df.reset_index(drop=True, inplace=True)
        # Store the labels and the customer_ids for retrival
        self.labels = sample_df['outcome']
        self.ids = sample_df['customer_id']
        # self.train_df.drop(columns=['outcome','customer_id'],inplace=True)
        # Set the model placeholder
        self.model = None
  def prepare_data(self, oos_df: pd.DataFrame = None) -> pd.DataFrame:
      '''
      Prepare a dataframe so it contains only the columns to model and having suitable types.
      If the argument is None, work on the training data passed in the constructor.
      '''
      # ** Your code here **
      # Set a flag to see to check the existance of the oos_df
      flag=0
      if oos_df is None:
        oos_df = self.train_df
        flag=1

      # oos_df = oos_df.loc[:,["amount","transaction_date"]]
      # Convert the object datatype to datetime, errors are set to NaT
      oos_df["transaction_date"] = pd.to_datetime(oos_df["transaction_date"],errors='coerce')
      # Convert the datetime dtype to numeric, i.e float, errors are set to np.nan or junk value
      oos_df["transaction_date"] = pd.to_numeric(oos_df["transaction_date"],errors='coerce')
      # Replace non-positive values (due to error coerce through the pandas functions of datetime and numeric) with np.nan
      oos_df.loc[oos_df["transaction_date"] <= 0, "transaction_date"] = np.nan
      if flag==1:
        # use the data from the constructor
        self.train_df = oos_df.drop(columns=['customer_id','outcome'])
      else:
        # external data is available
        if 'outcome' in oos_df.columns:
          # Column exists,set the outcome labels
          self.labels = oos_df['outcome']
        # set the ids based on customer id
        self.ids = oos_df['customer_id']
        # return just the necessary features
        return oos_df.loc[:,["amount","transaction_date"]]

  def impute_missing(self, oos_df: pd.DataFrame = None) -> pd.DataFrame:
      '''
      Fill any missing values with the appropriate mean (average) value.
      If the argument is None, work on the training data passed in the constructor.
      '''
      # Set a flag to see to check the existance of the oos_df
      flag=0
      if oos_df is None:
        oos_df = self.train_df
        flag=1
      # Impute missing values with the mean
      imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
      imputed_df = pd.DataFrame(imp_mean.fit_transform(oos_df), columns=oos_df.columns)
      if flag==1:
        # add in the customer ids
        self.train_df = pd.concat([self.ids,imputed_df],axis=1)
      else:
        # return the concatenation of the ids for reference and the imputed df
        return pd.concat([self.ids,imputed_df],axis=1)


  def fit(self) -> None:
      '''
      Fit the model of your choice on the training data paased in the constructor, assuming it has
      been prepared by the functions prepare_data and impute_missing
      '''
      # ** Your code here **
      # self.model = DecisionTreeClassifier(min_samples_split=10,max_depth=3)
      # self.model.fit(self.train_df.drop(columns=['customer_id']), self.labels)
      self.model = GradientBoostingClassifier()
      self.model.fit(self.train_df.drop(columns=['customer_id']), self.labels)
  def model_summary(self) -> str:
      '''
      Create a short summary of the model you have fit.
      '''
      # ** Your code here **
      if self.model is None:
        return "Model has not been trained yet."

      if isinstance(self.model, DecisionTreeClassifier):
        # to get the feature importance params
        feature_importance = self.model.feature_importances_
        params = self.model.get_params()

        summary = f"Decision Tree Model Summary:\n"
        summary += f"Feature Importance: {feature_importance}\n"
        summary += f"Model Parameters: {params}\n"
        return summary

      elif isinstance(self.model, GradientBoostingClassifier):
          # to get the feature importance params
          feature_importance = self.model.feature_importances_
          params = self.model.get_params()

          summary = f"Gradient Boosting Model Summary:\n"
          summary += f"Feature Importance: {feature_importance}\n"
          summary += f"Model Parameters: {params}\n"
          return summary

      return "Model summary not available for this model type."

  def predict(self, oos_df: pd.DataFrame = None) -> pd.Series[bool]:
      '''
      Make a set of predictions with your model. Assume the data has been prepared by the
      functions prepare_data and impute_missing.
      If the argument is None, work on the training data passed in the constructor.
      '''
      # ** Your code here **
      if oos_df is None:
        oos_df = self.train_df
      return self.model.predict(oos_df.drop(columns=['customer_id']))

  def save(self, path: str) -> None:
      '''
      Save the DataModeler so it can be re-used.
      '''
      # ** Your code here **
      with open(path, "wb") as f:
        pickle.dump(self, f)

  @staticmethod
  def load(path: str) -> DataModeler:
      '''
      Reload the DataModeler from the saved state so it can be re-used.
      '''
      # ** Your code here **
      with open(path, "rb") as f:
          modeler = pickle.load(f)

      return modeler


################################################################################

In [237]:
transact_train_sample = pd.DataFrame(
    {
        "customer_id": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        "amount": [1, 3, 12, 6, 0.5, 0.2, np.nan, 5, np.nan, 3],
        "transaction_date": [
            '2022-01-01',
            '2022-08-01',
            None,
            '2022-12-01',
            '2022-02-01',
            None,
            '2022-02-01',
            '2022-01-01',
            '2022-11-01',
            '2022-01-01'
        ],
        "outcome" : [False, True, True, True, False, False, True, True, True, False]
    }
)


In [238]:
print(f"Training sample:\n{transact_train_sample}\n")

Training sample:
   customer_id  amount transaction_date  outcome
0           11     1.0       2022-01-01    False
1           12     3.0       2022-08-01     True
2           13    12.0             None     True
3           14     6.0       2022-12-01     True
4           15     0.5       2022-02-01    False
5           16     0.2             None    False
6           17     NaN       2022-02-01     True
7           18     5.0       2022-01-01     True
8           19     NaN       2022-11-01     True
9           20     3.0       2022-01-01    False



Expected Output: <br>
Training sample:

| customer_id | amount | transaction_date | outcome |
|-------------|--------|------------------|---------|
| 11          | 1.0    | 2022-01-01       | False   |
| 12          | 3.0    | 2022-08-01       | True    |
| 13          | 12.0   | None             | True    |
| 14          | 6.0    | 2022-12-01       | True    |
| 15          | 0.5    | 2022-02-01       | False   |
| 16          | 0.2    | None             | False   |
| 17          | NaN    | 2022-02-01       | True    |
| 18          | 5.0    | 2022-01-01       | True    |
| 19          | NaN    | 2022-11-01       | True    |
| 20          | 3.0    | 2022-01-01       | False   |



In [239]:
print(f"Current dtypes:\n{transact_train_sample.dtypes}\n")

Current dtypes:
customer_id           int64
amount              float64
transaction_date     object
outcome                bool
dtype: object



Expected Output:<br>
Current dtypes:

| Column Name        | Data Type |
|--------------------|-----------|
| customer_id        | int64     |
| amount             | float64   |
| transaction_date   | object    |
| outcome            | bool      |


In [240]:
transactions_modeler = DataModeler(transact_train_sample)

transactions_modeler.prepare_data()

In [241]:
print(f"Changed columns to:\n{transactions_modeler.train_df.dtypes}\n")

Changed columns to:
amount              float64
transaction_date    float64
dtype: object



Expected Output:<br>
Changed columns to:

| Column Name        | Data Type |
|--------------------|-----------|
| amount             | float64   |
| transaction_date   | float64   |

In [242]:
transactions_modeler.impute_missing()

In [243]:
print(f"Imputed missing as mean:\n{transactions_modeler.train_df}\n")

Imputed missing as mean:
   customer_id   amount  transaction_date
0           11   1.0000      1.640995e+18
1           12   3.0000      1.659312e+18
2           13  12.0000      1.650845e+18
3           14   6.0000      1.669853e+18
4           15   0.5000      1.643674e+18
5           16   0.2000      1.650845e+18
6           17   3.8375      1.643674e+18
7           18   5.0000      1.640995e+18
8           19   3.8375      1.667261e+18
9           20   3.0000      1.640995e+18



Expected Output:<br>
Imputed missing as mean:

| customer_id | amount | transaction_date |
|-------------|--------|------------------|
| 11          | 1.0000 | 1.640995e+18     |
| 12          | 3.0000 | 1.659312e+18     |
| 13          | 12.0000 | 1.650845e+18    |
| 14          | 6.0000 | 1.669853e+18     |
| 15          | 0.5000 | 1.643674e+18     |
| 16          | 0.2000 | 1.650845e+18     |
| 17          | 3.8375 | 1.643674e+18     |
| 18          | 5.0000 | 1.640995e+18     |
| 19          | 3.8375 | 1.667261e+18     |
| 20          | 3.0000 | 1.640995e+18     |


In [244]:
print("Fitting  model")
transactions_modeler.fit()

print(f"Fit model:\n{transactions_modeler.model_summary()}\n")

Fitting  model
Fit model:
Gradient Boosting Model Summary:
Feature Importance: [0.66666667 0.33333333]
Model Parameters: {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}




`Expected Output` <br>
Fitting  model <br>
Fit model:<br>
 <<< ANY SHORT SUMMARY OF THE MODEL YOU CHOSE >>>

In [245]:
in_sample_predictions = transactions_modeler.predict()
print(f"Predicted on training sample: {in_sample_predictions}\n")
print(f'Accuracy = {sum(in_sample_predictions ==  [False, True, True, True, False, False, True, True, True, False])/.1}%')

Predicted on training sample: [False  True  True  True False False  True  True  True False]

Accuracy = 100.0%



`Expected Output` <br>
Predicting on training sample:<br>
 [False  True  True  True False False True  True  True False]<br>
Accuracy = 100.0%

In [246]:
transactions_modeler.save("transact_modeler")
loaded_modeler = DataModeler.load("transact_modeler")

print(f"Loaded DataModeler sample df:\n{loaded_modeler.model_summary()}\n")


Loaded DataModeler sample df:
Gradient Boosting Model Summary:
Feature Importance: [0.66666667 0.33333333]
Model Parameters: {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}




`Expected Output`<br>
Loaded DataModeler sample df:<br>
<<< THE SUMMARY OF THE MODEL YOU CHOSE >>>

In [247]:
transact_test_sample = pd.DataFrame(
    {
        "customer_id": [21, 22, 23, 24, 25],
        "amount": [0.5, np.nan, 8, 3, 2],
        "transaction_date": [
            '2022-02-01',
            '2022-11-01',
            '2022-06-01',
            None,
            '2022-02-01'
        ]
    }
)

In [248]:
adjusted_test_sample = transactions_modeler.prepare_data(transact_test_sample)

print(f"Changed columns to:\n{adjusted_test_sample.dtypes}\n")

Changed columns to:
amount              float64
transaction_date    float64
dtype: object



Expected Output: <br>
Changed columns to:

| Column Name        | Data Type |
|--------------------|-----------|
| amount             | float64   |
| transaction_date   | float64   |

dtype: object


In [249]:
filled_test_sample = transactions_modeler.impute_missing(adjusted_test_sample)

print(f"Imputed missing as mean:\n{filled_test_sample}\n")

Imputed missing as mean:
   customer_id  amount  transaction_date
0           21   0.500      1.643674e+18
1           22   3.375      1.667261e+18
2           23   8.000      1.654042e+18
3           24   3.000      1.652162e+18
4           25   2.000      1.643674e+18



Expected Output:<br>
Imputed missing as mean:

| customer_id | amount | transaction_date |
|-------------|--------|------------------|
| 21          | 0.5000 | 1.643674e+18     |
| 22          | 3.8375 | 1.667261e+18     |
| 23          | 8.0000 | 1.654042e+18     |
| 24          | 3.0000 | 1.650845e+18     |
| 25          | 2.0000 | 1.643674e+18     |


In [250]:
oos_predictions = transactions_modeler.predict(filled_test_sample)
print(f"Predicted on out of sample data: {oos_predictions}\n")
print(f'Accuracy = {sum(oos_predictions == [False, True, True, False, False])/.05}%')

Predicted on out of sample data: [False  True  True False False]

Accuracy = 100.0%


`Expected Output`
Predicted on out of sample data: <br>
[False True True False False] ([0 1 1 0 0])
Accuracy = 100.0%