In [5]:
import pandas as pd

In [1]:
def import_churn_libraries():
    """
    Load the libraries used by the churn workflow and hand them back as a tuple.

    The imports happen inside the function body, so nothing is bound at module
    level until the caller unpacks the returned tuple.

    Returns (in this exact order):
    - pd: the pandas module.
    - np: the numpy module.
    - plt: the matplotlib.pyplot module.
    - sns: the seaborn module.
    - train_test_split: function from sklearn.model_selection.
    - StandardScaler: class from sklearn.preprocessing.
    - RandomForestClassifier: class from sklearn.ensemble.
    - accuracy_score: function from sklearn.metrics.
    - confusion_matrix: function from sklearn.metrics.
    - classification_report: function from sklearn.metrics.
    """
    import matplotlib.pyplot as pyplot
    import numpy
    import pandas
    import seaborn
    from sklearn import ensemble, metrics, model_selection, preprocessing

    # Position matters: callers unpack this tuple by index.
    toolkit = (
        pandas,
        numpy,
        pyplot,
        seaborn,
        model_selection.train_test_split,
        preprocessing.StandardScaler,
        ensemble.RandomForestClassifier,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    return toolkit


In [2]:
# Bind the ten objects returned by import_churn_libraries() to module-level names;
# the left-hand side must stay in the same order as the function's return tuple.
pd, np, plt, sns, train_test_split, StandardScaler, RandomForestClassifier, accuracy_score, confusion_matrix, classification_report = import_churn_libraries()


### 0. Loading Dataset

In [6]:
def load_data(file_path):
    """
    Read a CSV file into a DataFrame.

    Parameters:
    - file_path: Location of the CSV file, passed straight to pd.read_csv.

    Returns:
    - pd.DataFrame: The parsed file contents.
    """
    # Only CSV is handled here; another format needs a different pandas reader.
    return pd.read_csv(file_path)

In [None]:
file_path = "../data/raw/new_raw.csv"

data = load_data(file_path)
# Preview only the first rows: displaying the whole frame bloats the saved
# notebook output without adding information.
data.head()

### 1. Data Preprocessing Function:
- Drop the exported index column (`Unnamed: 0`).
- Parse `txn_date` as a datetime (unparseable values become `NaT`).
- Remove `txn_amount` outliers using the 1.5 × IQR rule.

In [7]:
# Data Preprocessing Function
def preprocess_data(data):
    """
    Clean the raw transaction data before feature engineering.

    Steps:
    - Drop the 'Unnamed: 0' column when it is present.
    - Parse 'txn_date' to datetime; values that cannot be parsed become NaT.
    - Keep only rows whose 'txn_amount' lies within 1.5 * IQR of the quartiles.

    Parameters:
    - data (pd.DataFrame): Raw data with 'txn_date' and 'txn_amount' columns.
      It is not modified.

    Returns:
    - pd.DataFrame: An independent, filtered copy of the input.
    """
    # errors="ignore" keeps the function usable on files that were saved
    # without an index column; assign() builds a new frame instead of writing
    # into an existing one.
    cleaned = data.drop(columns="Unnamed: 0", errors="ignore").assign(
        txn_date=lambda frame: pd.to_datetime(frame['txn_date'], errors='coerce')
    )

    # Handling outliers in 'txn_amount' (Tukey fences)
    Q1 = cleaned['txn_amount'].quantile(0.25)
    Q3 = cleaned['txn_amount'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # between() is inclusive on both ends; rows with a missing amount compare
    # False and are dropped, exactly as with the explicit >= / <= comparison.
    within_bounds = cleaned['txn_amount'].between(lower_bound, upper_bound)

    # Return a real copy so downstream column assignments do not act on a
    # slice of another frame (SettingWithCopyWarning).
    return cleaned[within_bounds].copy()


In [None]:
# # Preprocess data
# df_preprocessed = preprocess_data(data)

# df_preprocessed

### 2. Feature Engineering Function:
- Create new features.
- Transform existing features.

In [8]:
import pandas as pd

def feature_engineering(df_preprocessed):
    """
    Derive calendar features and one-hot encode the transaction type.

    Parameters:
    - df_preprocessed (pd.DataFrame): Data with 'txn_date' and 'txn_type'
      columns. It is not modified.

    Returns:
    - pd.DataFrame: A new frame containing the original columns plus
      'txn_day', 'txn_month' and one integer 'txn_type_<value>' indicator
      column per distinct 'txn_type' value.
    """
    # Parse the dates once and reuse the result for both calendar features.
    txn_dates = pd.to_datetime(df_preprocessed['txn_date'])

    # assign() returns a new frame, so the caller's DataFrame is left alone
    # (the previous in-place column writes also warned on filtered slices).
    with_calendar = df_preprocessed.assign(
        txn_day=txn_dates.dt.day,
        txn_month=txn_dates.dt.month,
    )

    # Assuming 'txn_type' is categorical, perform one-hot encoding
    txn_type_dummies = pd.get_dummies(df_preprocessed['txn_type'], prefix='txn_type').astype(int)

    # 'txn_date' and 'txn_type' are intentionally kept in the result.
    return pd.concat([with_calendar, txn_type_dummies], axis=1)


In [12]:
# NOTE(review): this cell raises NameError on a fresh kernel. The only
# module-level assignment to df_preprocessed visible in this notebook is
# commented out, so the name is never defined before this point.
df_preprocessed

NameError: name 'df_preprocessed' is not defined

In [None]:
# # Assuming df_preprocessed is your preprocessed DataFrame
# df_engineered = feature_engineering(df_preprocessed)

# # Display the resulting DataFrame
# df_engineered

In [9]:
import pandas as pd
from datetime import timedelta

def add_churn_column(df_engineered, churn_window_days=90):
    """
    Add a 'churn' column to the DataFrame based on the calculated churn date.

    The churn date is the latest 'txn_date' in the data minus
    churn_window_days. Every row dated on or before it is labelled 1, every
    later row 0.

    Parameters:
    - df_engineered (pd.DataFrame): Input DataFrame with a 'txn_date' column.
      It is not modified.
    - churn_window_days (int): Number of days to consider for churn (default is 90).

    Returns:
    - pd.DataFrame: A new DataFrame sorted by 'txn_date', with 'txn_date'
      converted to datetime and the added 'churn' column.
    """
    # NOTE(review): the label is computed per transaction row from the date
    # alone, not per account, so any feature derived from 'txn_date' can
    # reveal it. Confirm this is the intended churn definition.

    # Convert and sort on a new frame so the caller's data is not overwritten.
    dated = df_engineered.assign(
        txn_date=pd.to_datetime(df_engineered['txn_date'])
    ).sort_values(by='txn_date')

    # Calculate the hypothetical churn date
    last_txn_date = dated['txn_date'].max()
    churn_date = last_txn_date - timedelta(days=churn_window_days)

    # Create 'churn' column based on the calculated churn date
    dated['churn'] = (dated['txn_date'] <= churn_date).astype(int)
    return dated


In [None]:
# df_churned = add_churn_column(df_engineered)
# df_churned.head()

### 3. Train-Test Split Function:
- Split the dataset into training and testing sets.

In [13]:
from sklearn.model_selection import train_test_split
#df_churned = df_engineered.copy()
# features = df_churned[['account_number', 'txn_amount', 'txn_month',
#        'txn_day', 'txn_type_add_money', 'txn_type_cash_in',
#        'txn_type_cash_out', 'txn_type_payment', 'txn_type_send_money',
#        'churn']]

# target = 'churn'

def split_data(data, target_column='churn', test_size=0.2, random_state=42):
    """
    Split a DataFrame into training and testing features and targets.

    Parameters:
    - data (pd.DataFrame): Feature columns plus the target column.
    - target_column (str): Name of the target column (default is 'churn').
    - test_size (float or int): Passed to train_test_split (default is 0.2).
    - random_state (int): Seed passed to train_test_split (default is 42).

    Returns:
    - tuple: (X_train, X_test, y_train, y_test).
    """
    # Extract features and target variable
    X = data.drop(columns=target_column)
    y = data[target_column]

    # The defaults reproduce the previously hard-coded 80/20 split and seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test


### 4. Model Selection and Training Function:
- Choose a machine learning model.
- Train the selected model.

In [14]:
from sklearn.ensemble import RandomForestClassifier

def train_model(model, X_train, y_train):
    """
    Fit a model on the training data.

    Parameters:
    - model: Any object exposing a fit(X, y) method.
    - X_train: Training features, passed to fit unchanged.
    - y_train: Training targets, passed to fit unchanged.

    Returns:
    - The same model object that was passed in, after fit has been called.
    """
    # The return value of fit() is ignored on purpose; the caller always gets
    # back the object it passed in.
    model.fit(X_train, y_train)
    fitted = model
    return fitted

### 5. Model Evaluation Function:
- Make predictions on the test set.
- Evaluate the model's performance.

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data.

    Parameters:
    - model: Object exposing a predict(X) method.
    - X_test: Test features, passed to predict.
    - y_test: True labels for the test features.

    Returns:
    - tuple: (accuracy, confusion matrix, classification report), as produced
      by accuracy_score, confusion_matrix and classification_report.
    """
    # Predict once and reuse the predictions for all three metrics.
    predictions = model.predict(X_test)

    return (
        accuracy_score(y_test, predictions),
        confusion_matrix(y_test, predictions),
        classification_report(y_test, predictions),
    )


In [None]:
# accuracy, conf_matrix, class_report = evaluate_model(trained_model, X_test, y_test)

# # Display the results
# print(f"Model Accuracy: {accuracy}")
# print("\nConfusion Matrix:")
# print(conf_matrix)
# print("\nClassification Report:")
# print(class_report)

### 6. Hyperparameter Tuning Function (Optional):
- Tune hyperparameters for better model performance.

In [None]:
# from sklearn.model_selection import GridSearchCV

# def tune_hyperparameters(base_model, param_grid, X_train, y_train):
#     # Initialize Grid Search with the base model and parameter grid
#     grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

#     # Fit the grid search to the data
#     grid_search.fit(X_train, y_train)

#     # Get the best-tuned model
#     tuned_model = grid_search.best_estimator_

#     return tuned_model

# # Example parameter grid for RandomForestClassifier
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Assuming X_train and y_train are obtained from the split_data function
# # Assuming trained_model is the initial RandomForest model trained in your previous step
# tuned_model_rf = tune_hyperparameters(trained_model, param_grid_rf, X_train, y_train)

# # Now you can use tuned_model_rf for evaluation or further steps


### Save Models

In [39]:
# save_model.py

import joblib
from datetime import datetime

def save_model(model, model_name, directory="models"):
    """
    Save a machine learning model with a timestamped filename.

    Parameters:
    - model: The trained machine learning model.
    - model_name: The name of the model (e.g., "RandomForestClassifier").
    - directory: The directory where the model will be saved. It is created
      if it does not exist yet.

    Returns:
    - str or None: Path of the written file, or None when saving failed
      (the error is printed rather than raised).
    """
    # Local import keeps this block self-contained.
    import os

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    # os.path.join avoids the doubled separator that plain string formatting
    # produced when `directory` already ended with a slash.
    file_path = os.path.join(directory, f"{model_name}_{timestamp}.pkl")

    try:
        # An empty directory means "current directory"; makedirs("") would fail.
        if directory:
            os.makedirs(directory, exist_ok=True)
        joblib.dump(model, file_path)
    except Exception as e:
        # Saving stays best-effort: report the problem and let the caller
        # detect it through the None return value.
        print(f"Error saving the model: {e}")
        return None

    print(f"Model saved successfully to {file_path}")
    return file_path


### 7. Main Function:
- Orchestrates the entire modeling process.

In [42]:
def main(file_path="../data/raw/new_raw.csv", model_dir="../models/"):
    """
    Run the churn pipeline end to end: load, preprocess, engineer features,
    label, split, train, save and evaluate, then print the metrics.

    Parameters:
    - file_path (str): CSV file with the raw transactions.
    - model_dir (str): Directory in which the trained model is saved.
    """
    pd, np, plt, sns, train_test_split, StandardScaler, RandomForestClassifier, accuracy_score, confusion_matrix, classification_report = import_churn_libraries()

    # Load data
    data = load_data(file_path)

    # Preprocess data
    preprocessed_data = preprocess_data(data)

    # Feature engineering
    engineered_data = feature_engineering(preprocessed_data)

    # Add 'churn' column
    df_churned = add_churn_column(engineered_data)

    target = 'churn'

    # NOTE(review): 'churn' is derived purely from 'txn_date' (see
    # add_churn_column) and 'txn_month' / 'txn_day' are derived from the same
    # column, so the model can largely reconstruct the label. The recorded run
    # reports accuracy 1.0, which looks like label leakage rather than real
    # predictive skill. 'account_number' is also used as a feature; confirm
    # that an identifier is meant to be a model input.
    model_columns = [
        'account_number', 'txn_amount', 'txn_month',
        'txn_day', 'txn_type_add_money', 'txn_type_cash_in',
        'txn_type_cash_out', 'txn_type_payment', 'txn_type_send_money',
        target,
    ]
    features = df_churned[model_columns]

    # Split data
    X_train, X_test, y_train, y_test = split_data(features, target_column=target)

    # Choose and train the model
    model = RandomForestClassifier(random_state=42)
    trained_model = train_model(model, X_train, y_train)

    # Save the trained model
    save_model(trained_model, model_name="RandomForestClassifier", directory=model_dir)

    # Evaluate the model
    accuracy, conf_matrix, class_report = evaluate_model(trained_model, X_test, y_test)

    # Display results
    print(f"Model Accuracy: {accuracy}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(class_report)

# Run the whole pipeline when this code is executed directly; the recorded
# output below shows the guard also passes when the cell runs in the notebook.
if __name__ == "__main__":
    main()


Model saved successfully to ../models//RandomForestClassifier_20240129011505.pkl
Model Accuracy: 1.0

Confusion Matrix:
[[37837     0]
 [    0 87459]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37837
           1       1.00      1.00      1.00     87459

    accuracy                           1.00    125296
   macro avg       1.00      1.00      1.00    125296
weighted avg       1.00      1.00      1.00    125296

