In [0]:
# Batch-read the member-churn silver Delta table into a DataFrame and preview it.
# This is a plain snapshot read: no change-data-feed (CDC) options are set on the reader.
silver_reader = spark.read.format('delta')
member_featured_df = silver_reader.load('dbfs:/user/arundhuti/delta/member_churn/silver/')
display(member_featured_df)

In [0]:
%python
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hold out 20% of member_featured_df for scoring later in the notebook.
# The fixed seed keeps the split repeatable for an unchanged input DataFrame.
train_df, test_df = member_featured_df.randomSplit([0.8, 0.2], seed=42)

# Index the categorical Age_Bucket column, then one-hot encode the index.
# handleInvalid="keep" sends buckets that never appeared in train_df (or are
# null) to one extra index instead of raising at transform time, so scoring
# test_df or new data with the saved pipeline cannot fail on an unseen bucket.
indexer = StringIndexer(
    inputCol="Age_Bucket",
    outputCol="Age_Bucket_indexed",
    handleInvalid="keep",
)
encoder = OneHotEncoder(inputCol="Age_Bucket_indexed", outputCol="Age_Bucket_encoded")

# Columns combined into the single "features" vector consumed by the classifier.
# Everything except Age_Bucket_encoded must already exist on the input DataFrame
# (presumably prepared upstream in the silver table -- TODO confirm).
feature = ["Age_Bucket_encoded", "Gender_indexed", "Claim_count", "Tenure_Months", "Customer_Service_Calls", "Total_Claim_Amount_scaled", "High_Risk_Flag"]
assembler = VectorAssembler(inputCols=feature, outputCol="features")

# Index the Churned label. The default handleInvalid="error" is kept here on
# purpose: an unexpected label value should surface as an error, not be bucketed.
# NOTE(review): StringIndexer orders values by descending frequency by default, so
# index 1.0 is the *less frequent* Churned value -- confirm that is the churn class.
label_indexer = StringIndexer(inputCol="Churned", outputCol="Churned_indexed")

lr = LogisticRegression(labelCol="Churned_indexed", featuresCol="features")

# Stage order matters: each stage reads a column produced by an earlier one.
pipeline1 = Pipeline(stages=[indexer, encoder, assembler, label_indexer, lr])

# Fit every stage (indexers, encoder, classifier) on the training split only.
model = pipeline1.fit(train_df)


In [0]:
# Persist the fitted pipeline to DBFS, replacing any model already saved at this path.
# This goes through the Spark ML writer (model.write()), not a DataFrame/Delta writer.
model_output_path = "dbfs:/user/arundhuti/delta/member_churn/model_data"
model.write().overwrite().save(model_output_path)


In [0]:
# Load and use the saved pipeline
from pyspark.ml import PipelineModel

# Reload the persisted pipeline from DBFS rather than reusing the in-memory `model`.
saved_model_path = "dbfs:/user/arundhuti/delta/member_churn/model_data"
loaded_pipeline_model = PipelineModel.load(saved_model_path)

# Bare trailing expression so the notebook renders the list of fitted stages.
loaded_pipeline_model.stages

In [0]:
# Score the held-out split with the reloaded pipeline. The result keeps the input
# columns and adds each stage's output column (e.g. Age_Bucket_encoded, features,
# Churned_indexed) plus the classifier's prediction columns.
test_transformed_df = loaded_pipeline_model.transform(dataset=test_df)

# Preview the scored rows in the notebook.
display(test_transformed_df)

In [0]:
# from pyspark.sql.functions import col

# # Read the test data
# df = test_transformed_df

# if df is not None:
#     # Keep only members whose high-risk flag is 1.0 (this filters on the flag, not on predicted churn probability) and select their ID, feature vector and prediction
#     high_churn_members = df.where(col('high_Risk_flag') == 1.0).select('Member_ID', 'features','prediction')
    
#     # Extract feature values from the vector
#     from pyspark.ml.functions import vector_to_array
#     high_churn_members_extracted = high_churn_members.withColumn("features_array", vector_to_array(col("features")))
    
#     # Convert features_array to human readable format
#     feature_columns = ['age', 'monthly_spend', 'tenure_months']  # replace with actual feature names
#     for i, feature in enumerate(feature_columns):
#         high_churn_members_extracted = high_churn_members_extracted.withColumn(feature, col("features_array")[i])
    
#     display(high_churn_members_extracted.drop("features_array"))

In [0]:
# from pyspark.sql.functions import avg

# # Calculate average values for high churn members
# avg_values = high_churn_members_extracted.agg(
#     avg('age').alias('avg_age'),
#     avg('monthly_spend').alias('avg_monthly_spend'),
#     avg('tenure_months').alias('avg_tenure_months')
# ).collect()[0]

# # Display the average values
# display(avg_values)

# # Conclusion based on the average values
# conclusion = f"""
# Based on the analysis of high churn members:
# - The average age is {avg_values['avg_age']:.2f} years.
# - The average monthly spend is ${avg_values['avg_monthly_spend']:.2f}.
# - The average tenure is {avg_values['avg_tenure_months']:.2f} months.

# These factors indicate that members with higher churn probability tend to have specific characteristics in terms of age, spending, and tenure.
# """

# print(conclusion)