In [1]:
from __future__ import annotations
import numpy as np
import pandas as pd


class DataModeler:
    def __init__(self, sample_df: pd.DataFrame):
        '''
        Initialize the DataModeler as necessary.
        '''
        # ** Your code here **
        self.train_df = sample_df.copy()
        self.model = None
        
    def prepare_data(self, oos_df: pd.DataFrame = None) -> pd.DataFrame:
        '''
        Prepare a dataframe so it contains only the columns to model and having suitable types.
        If the argument is None, work on the training data passed in the constructor.
        '''
        # ** Your code here **
        if oos_df is None:
            df = self.train_df.copy()
        else:
            df = oos_df.copy()
        
        # Convert the 'transaction_date' column to datetime
        df['transaction_date'] = df['transaction_date'].astype('datetime64[s]')

        # Convert the 'transaction_date' column to float values with handling of NaT values
        df['transaction_date'] = df['transaction_date'].apply(lambda x: x.timestamp() if not pd.isna(x) else np.nan)

        # Convert the 'transaction_date' column to floating-point numbers without truncating decimal values
        df['transaction_date'] = pd.to_numeric(df['transaction_date'])
        
        # Define columns you want to select
        features = ['transaction_date', 'amount', 'outcome']

        # Select features that exist in the DataFrame
        selected_features = [col for col in features if col in df.columns]

        # Create a new DataFrame with the selected features
        df = df[selected_features]

        return df

    def impute_missing(self, oos_df: pd.DataFrame = None) -> pd.DataFrame:
        '''
        Fill any missing values with the appropriate mean (average) value.
        If the argument is None, work on the training data passed in the constructor.
        Hint: Watch out for data leakage in your solution.
        '''
        # ** Your code here **
        if oos_df is None:
            df = self.train_df.copy()
        else:
            df = oos_df.copy()
            
        # Apply data preparation steps from prepare_data
        df = self.prepare_data(df)
        
        #Define the mean
        train_metrics = self.prepare_data(self.train_df.copy())
        mean_amount = train_metrics['amount'].mean()
        mean_transaction = train_metrics['transaction_date'].mean()
        
        # Impute missing values in amount and transcation date with the mean of the numeric values in the training data
        df['amount'] = df['amount'].fillna(mean_amount)
        df['transaction_date'] = df['transaction_date'].fillna(mean_transaction)

        return df

    def fit(self) -> None:
        '''
        Fit the model of your choice on the training data paased in the constructor, assuming it has
        been prepared by the functions prepare_data and impute_missing
        '''
        # ** Your code here **
        # Choose a model
        # ** Your code here **
        
        # Choose a model
        from sklearn.ensemble import GradientBoostingClassifier
        
        # Apply imputation of missing values
        train_df= self.impute_missing(self.train_df.copy())
        
        # Fit the model
        self.model = GradientBoostingClassifier().fit(train_df.drop(columns=["outcome"]), train_df["outcome"])


    def model_summary(self) -> str:
        '''
        Create a short summary of the model you have fit.
        '''
        # Choose a model
        from sklearn.ensemble import GradientBoostingClassifier
        
        # ** Your code here **
        if self.model is None:
            return "Model has not been trained yet."

        ##  GradientBoostingClassifier model
        if isinstance(self.model, GradientBoostingClassifier):
            feature_importance = self.model.feature_importances_
            params = self.model.get_params()

            summary = f"Gradient Boosting Model Summary:\n"
            summary += f"Feature Importance: {feature_importance}\n"
            summary += f"Model Parameters: {params}\n"
            return summary

        return "Model summary not available for this model type."

    def predict(self, oos_df: pd.DataFrame = None) -> pd.Series[bool]:
        '''
        Make a set of predictions with your model. Assume the data has been prepared by the
        functions prepare_data and impute_missing.
        If the argument is None, work on the training data passed in the constructor.
        '''

        # Make sure the input data has the same columns as the training data
        if oos_df is None:
            df = self.impute_missing(self.train_df.copy())
            #drop target
            df.drop(columns=['outcome'], inplace=True)
        else:
            df = self.impute_missing(oos_df.copy())
            
        predictions = self.model.predict(df)

        return predictions

    def save(self, path: str) -> None:
        '''
        Save the DataModeler so it can be re-used.
        '''
        # ** Your code here **
        # Salvar o modelo, se necessário
        import pickle

        with open(path, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path: str) -> DataModeler:
        '''
        Reload the DataModeler from the saved state so it can be re-used.
        '''
        # ** Your code here **
        # Carregar um modelo previamente salvo, se necessário
        import pickle

        with open(path, "rb") as f:
            modeler = pickle.load(f)

        return modeler

In [2]:
pwd

'C:\\Users\\Bruno\\Desktop\\OfferFit'

### Data

In [3]:
# You should not have to modify the code below this point

transact_train_sample = pd.DataFrame(
    {
        "customer_id": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        "amount": [1, 3, 12, 6, 0.5, 0.2, np.nan, 5, np.nan, 3],
        "transaction_date": [
            '2022-01-01',
            '2022-08-01',
            None,
            '2022-12-01',
            '2022-02-01',
            None,
            '2022-02-01',
            '2022-01-01',
            '2022-11-01',
            '2022-01-01'
        ],
        "outcome" : [False, True, True, True, False, False, True, True, True, False]
    }
)


In [4]:
print(f"Training sample:\n{transact_train_sample}\n")

Training sample:
   customer_id  amount transaction_date  outcome
0           11     1.0       2022-01-01    False
1           12     3.0       2022-08-01     True
2           13    12.0             None     True
3           14     6.0       2022-12-01     True
4           15     0.5       2022-02-01    False
5           16     0.2             None    False
6           17     NaN       2022-02-01     True
7           18     5.0       2022-01-01     True
8           19     NaN       2022-11-01     True
9           20     3.0       2022-01-01    False



In [5]:
print(f"Current dtypes:\n{transact_train_sample.dtypes}\n")

Current dtypes:
customer_id           int64
amount              float64
transaction_date     object
outcome                bool
dtype: object



In [6]:
transactions_modeler = DataModeler(transact_train_sample)

In [7]:
transactions_modeler.prepare_data()

Unnamed: 0,transaction_date,amount,outcome
0,1640995000.0,1.0,False
1,1659312000.0,3.0,True
2,,12.0,True
3,1669853000.0,6.0,True
4,1643674000.0,0.5,False
5,,0.2,False
6,1643674000.0,,True
7,1640995000.0,5.0,True
8,1667261000.0,,True
9,1640995000.0,3.0,False


In [8]:
print(f"Changed columns to:\n{transactions_modeler.train_df.dtypes}\n")

Changed columns to:
customer_id           int64
amount              float64
transaction_date     object
outcome                bool
dtype: object



In [9]:
transactions_modeler.impute_missing()

Unnamed: 0,transaction_date,amount,outcome
0,1640995000.0,1.0,False
1,1659312000.0,3.0,True
2,1650845000.0,12.0,True
3,1669853000.0,6.0,True
4,1643674000.0,0.5,False
5,1650845000.0,0.2,False
6,1643674000.0,3.8375,True
7,1640995000.0,5.0,True
8,1667261000.0,3.8375,True
9,1640995000.0,3.0,False


In [10]:
print("Fitting  model")
transactions_modeler.fit()

Fitting  model


In [11]:
print(f"Fit model:\n{transactions_modeler.model_summary()}\n")

Fit model:
Gradient Boosting Model Summary:
Feature Importance: [0.33333333 0.66666667]
Model Parameters: {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}




In [12]:
in_sample_predictions = transactions_modeler.predict()
print(f"Predicted on training sample: {in_sample_predictions}\n")
print(f'Accuracy = {sum(in_sample_predictions ==  [False, True, True, True, False, False, True, True, True, False])/.1}%')

Predicted on training sample: [False  True  True  True False False  True  True  True False]

Accuracy = 100.0%


In [13]:
transactions_modeler.save("transact_modeler")
loaded_modeler = DataModeler.load("transact_modeler")
print(f"Loaded DataModeler sample df:\n{loaded_modeler.model_summary()}\n")


Loaded DataModeler sample df:
Gradient Boosting Model Summary:
Feature Importance: [0.33333333 0.66666667]
Model Parameters: {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}




In [14]:
transact_test_sample = pd.DataFrame(
    {
        "customer_id": [21, 22, 23, 24, 25],
        "amount": [0.5, np.nan, 8, 3, 2],
        "transaction_date": [
            '2022-02-01',
            '2022-11-01',
            '2022-06-01',
            None,
            '2022-02-01'
        ]
    }
)
adjusted_test_sample = transactions_modeler.prepare_data(transact_test_sample)

print(f"Changed columns to:\n{adjusted_test_sample.dtypes}\n")

Changed columns to:
transaction_date    float64
amount              float64
dtype: object



In [15]:
filled_test_sample = transactions_modeler.impute_missing(adjusted_test_sample)
print(f"Imputed missing as mean:\n{filled_test_sample}\n")

Imputed missing as mean:
   transaction_date  amount
0      1.643674e+09  0.5000
1      1.667261e+09  3.8375
2      1.654042e+09  8.0000
3      1.650845e+09  3.0000
4      1.643674e+09  2.0000



In [16]:
oos_predictions = transactions_modeler.predict(filled_test_sample)
print(f"Predicted on out of sample data: {oos_predictions}\n")
print(f'Accuracy = {sum(oos_predictions == [False, True, True, False, False])/.05}%')

Predicted on out of sample data: [False  True  True False False]

Accuracy = 100.0%
