In [1]:
import pandas as pd 
import json

# function to read file and metadata
def read_data(filepath):
    df = pd.read_csv(filepath)
    return df

data = read_data('data/BankChurners.csv')

In [59]:
def read_json(filepath):
    """
    Reads a JSON file from the specified file path.

    Args:
        filepath (str): Path to the JSON file.

    Returns:
        dict or list: Parsed JSON data as a Python dictionary or list.
    """
    try:
        with open(filepath, 'r') as file:
            data = json.load(file)
        return data
    except FileNotFoundError:
        print(f"Error: The file at {filepath} was not found.")
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON. {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


metadata = read_json('data/metadata.json')

In [60]:
metadata 

[{'feature_name': 'CLIENTNUM', 'desc': 'Unique identifier for the client'},
 {'feature_name': 'Attrition_Flag',
  'desc': 'Customer attrition status (active or closed)'},
 {'feature_name': 'Customer_Age', 'desc': 'Age of the customer'},
 {'feature_name': 'Gender', 'desc': 'Gender of the customer'},
 {'feature_name': 'Dependent_count',
  'desc': 'Number of dependents linked to the customer'},
 {'feature_name': 'Education_Level',
  'desc': 'Highest education level attained by the customer'},
 {'feature_name': 'Marital_Status', 'desc': 'Marital status of the customer'}]

In [6]:
data.head() 

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45.0,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49.0,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40.0,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40.0,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [10]:
# drop the ID 
# create a function to detect the ID ( regex, use LLM to check which one is the ID )
# put the scope on the input ( there will be no ID )

temp = data.iloc[:,1:]
temp.head()

categorical_feature = temp.select_dtypes(include='object').columns
numerical_faeture = temp.select_dtypes(exclude='object').columns

In [42]:
import pandas as pd

def preprocess_for_clustering(df, categorical_features, numerical_features):
    """
    Preprocesses the DataFrame for clustering analysis.

    Args:
        df (pd.DataFrame): The input DataFrame.
        categorical_features (list): List of categorical column names.
        numerical_features (list): List of numerical column names.

    Returns:
        pd.DataFrame: Preprocessed DataFrame ready for clustering.
    """
    # Step 1: Handle missing values
    df = df.copy()
    if df.isnull().sum().sum() > 0:
        print("Warning: Missing values detected. Filling missing values with mean for numerical and mode for categorical.")
        for col in numerical_features:
            df[col] = df[col].fillna(df[col].mean())
        for col in categorical_features:
            df[col] = df[col].fillna(df[col].mode()[0])

    # Step 2: Encode categorical features using pd.get_dummies
    df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

    # Step 3: Return the processed DataFrame
    # add scale
    return df_encoded

select_data = ['Customer_Age','Gender','Dependent_count','Education_Level','Marital_Status']

categorical_feature = temp[select_data].select_dtypes(include='object').columns
numerical_faeture = temp[select_data].select_dtypes(exclude='object').columns

df_encoded = preprocess_for_clustering(temp[select_data], categorical_feature, numerical_faeture)



In [43]:
df_encoded.head().T

Unnamed: 0,0,1,2,3,4
Customer_Age,45.0,49.0,46.325432,40.0,40.0
Dependent_count,3,5,3,4,3
Gender_M,True,False,True,False,True
Education_Level_Doctorate,False,False,False,False,False
Education_Level_Graduate,False,True,True,False,False
Education_Level_High School,True,False,False,True,False
Education_Level_Post-Graduate,False,False,False,False,False
Education_Level_Uneducated,False,False,False,False,True
Education_Level_Unknown,False,False,False,False,False
Marital_Status_Married,True,False,True,False,True


In [44]:
# cluster analysis
from sklearn.cluster import KMeans

def cluster_data(df, n_clusters):
    """
    Clusters the input data using KMeans and returns the labeled DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame to be clustered.
        n_clusters (int): The number of clusters to form.

    Returns:
        pd.DataFrame: DataFrame with an additional 'Cluster' column containing cluster labels.
    """
    try:
        # Initialize KMeans
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        
        # Fit the model and predict cluster labels
        cluster_labels = kmeans.fit_predict(df)
        
        # Add cluster labels to the DataFrame
        df_with_clusters = df.copy()
        df_with_clusters['Cluster'] = cluster_labels
        
        return df_with_clusters
    except Exception as e:
        raise ValueError(f"Error during clustering: {e}")


In [45]:
n_clusters = 3 
df_result = cluster_data(df_encoded, n_clusters)

In [46]:
df_result.head()

Unnamed: 0,Customer_Age,Dependent_count,Gender_M,Education_Level_Doctorate,Education_Level_Graduate,Education_Level_High School,Education_Level_Post-Graduate,Education_Level_Uneducated,Education_Level_Unknown,Marital_Status_Married,Marital_Status_Single,Marital_Status_Unknown,Cluster
0,45.0,3,True,False,False,True,False,False,False,True,False,False,2
1,49.0,5,False,False,True,False,False,False,False,False,True,False,2
2,46.325432,3,True,False,True,False,False,False,False,True,False,False,2
3,40.0,4,False,False,False,True,False,False,False,False,False,True,0
4,40.0,3,True,False,False,False,False,True,False,True,False,False,0


In [49]:
df_agg = df_result.groupby('Cluster').mean()
cluster_result = df_agg.reset_index().to_dict('records')
cluster_result

[{'Cluster': 0,
  'Customer_Age': 35.775135586149354,
  'Dependent_count': 2.034209428452232,
  'Gender_M': 0.4835210680016688,
  'Education_Level_Doctorate': 0.03420942845223195,
  'Education_Level_Graduate': 0.31038798498122655,
  'Education_Level_High School': 0.20525657071339173,
  'Education_Level_Post-Graduate': 0.05590321234876929,
  'Education_Level_Uneducated': 0.1476846057571965,
  'Education_Level_Unknown': 0.14184397163120568,
  'Marital_Status_Married': 0.4292866082603254,
  'Marital_Status_Single': 0.4334584897788903,
  'Marital_Status_Unknown': 0.06341259908218606},
 {'Cluster': 1,
  'Customer_Age': 55.61748456288593,
  'Dependent_count': 1.7793305167370816,
  'Gender_M': 0.46083847903802405,
  'Education_Level_Doctorate': 0.04647383815404615,
  'Education_Level_Graduate': 0.31101722456938574,
  'Education_Level_High School': 0.20506987325316867,
  'Education_Level_Post-Graduate': 0.04452388690282743,
  'Education_Level_Uneducated': 0.14462138446538836,
  'Education_Leve

In [50]:
from langchain_ollama import ChatOllama

llm = ChatOllama(name="chat_llama3", model="llama3.2:1b", temperature=0)

In [61]:
# to aggreage the cluster

# send it to LLM 

# Zero Shot Prompt


def llm_model(metadata,n_clusters,cluster_result):
    prompt = """
    You are AI assistant to analyze cluster result, you will help to create a cluster definition based on all the input you got. 
    You will get the cluster result explaining the distribution of the cluster, and based on this you need to make the label or definition of the cluster.
    Here is the input description :

    1. metadata : the description about the data from user
    2. n_cluster : number of cluster
    2. cluster_result : Aggregated data for each result, explaining the distribution of the data

    output will be :
    1. cluster : based on input
    2. cluster_name : name of the cluster based on the persona you think of
    3. definition : the reason you give the cluster name for that cluster

    Follow this procedure when creating the result :
    1. Follow the output template, put the output as json format
    ```json
    ['cluster':1,
    'cluster_name:cluster name,
    'defintion':Definition of cluster]
    ````
    2. Define number of cluster based on the input from user 
    3. Write everything after output

    Input :
    metadata = {metadata}
    n_cluster = {n_cluster}
    cluster_result = {cluster_result}

    Output :
    """

    llm_prompt = prompt.format(metadata=metadata, n_cluster=n_clusters, cluster_result = cluster_result)

    result = llm.invoke(llm_prompt)
    return result


In [63]:
print(result.content)

I'll follow the procedure to create a cluster definition based on the input.

Here is the output in JSON format:

```
[
  {
    "cluster": 1,
    "cluster_name": "Highly Active Customers",
    "definition": "Customers with high age (above 45) and low attrition flag"
  },
  {
    "cluster": 2,
    "cluster_name": "Low Attrition Customers",
    "definition": "Customers with high age (below 45) and active attrition flag"
  },
  {
    "cluster": 3,
    "cluster_name": "Unknown Customer Type",
    "definition": "Customers with unknown attributes"
  }
]
```

I've defined three clusters based on the input data:

* Cluster 1: Highly Active Customers (age above 45, low attrition flag)
* Cluster 2: Low Attrition Customers (age below 45, active attrition flag)
* Cluster 3: Unknown Customer Type (any other attributes)

Note that I've used simple definitions based on the input data. In a real-world scenario, you may want to use more complex and nuanced definitions that take into account additional 

In [None]:
# we need to create a function to parsing the output
