# Setup and Imports

In [19]:
#! Some things aren't shown here because they live in other files used during development
import os

# Every relative path used in this notebook ("data/...", "variables/...") is
# resolved against the working directory set here. Set the CHURNSAGE_ROOT
# environment variable to point at your own checkout instead of editing the
# cell; the fallback is the original author's location.
PROJECT_ROOT = os.environ.get(
    "CHURNSAGE_ROOT", r"/home/yousinator/personal/ChurnSage"
)
os.chdir(PROJECT_ROOT)  #! Change it to the path on your device or cloud

# NOTE(review): these imports intentionally stay below the chdir, as in the
# original cell -- presumably so project-local packages resolve; confirm.
import pandas as pd
from prisma_ml import DataSetInformation
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle

In [20]:
# Load the cleaned churn dataset; the path is relative to the working
# directory selected in the setup cell.
df = pd.read_csv(r"data/clean_churn_data.csv")

# Human-readable description for each column name.
# NOTE(review): `metadata` is not referenced again in the cells visible here --
# confirm whether it should be handed to DataSetInformation or is only
# kept as in-notebook documentation.
metadata = {
    "POS_MSISDN": "Point of Sale ID, where the contract was sold/initiated.",
    "Distributer": "Entity/shop responsible for service distribution or activation.",
    "Plan Name": "Name of the plan subscribed to.",
    "Customer Tenure": "Duration between activation and deactivation/current date.",
    "Tenure Category": "Categorization of tenure into segments (short-term, medium-term, and long-term).",
    "Status": "Current status of the line (Active, Suspended, etc.).",
    "Status Reason": "Reason behind the current status.",
    "Segment1": "Customer segmentation category.",
    "Segment2": "Customer segmentation category.",
    "Churn Flag": "Indicator of whether a customer has discontinued the service.",
    "Customer Interaction Score": "Frequency/quality of interactions with customer service.",
    "Loyalty Points": "Points assigned based on tenure, status, and segment.",
    "Data Usage Tier": "Classification based on presumed data usage (3:High, 2:Medium, 1:Low).",
    "Customer Segment Change Flag": "Indicates if a customer changed segments.",
}

# Helper object whose summary methods are called in the cells below.
info = DataSetInformation(df)

# Encoding

In [21]:
# Display the summary of the categorical (object-typed) columns; the overview
# table and per-column value counts are rendered as this cell's output.
info.categorical_summary()

### Categorical Columns Summary:

Unnamed: 0,Column,Unique Values Count,Top Value,Top Value Percentage
0,POS_MSISDN,156,ORG3592,23.71%
1,Distributer,142,FNTFRPRJTF TBLFT TTPDL,21.18%
2,Plan Name,34,Orange 90,19.16%
3,Tenure Category,3,Short-term,47.17%
4,Status,4,Deactive,35.68%
5,Status Reason,19,Good Service,24.57%
6,Segment1,2,Prepaid,64.19%
7,Segment2,3,Residential,63.19%


### Column: Tenure Category

#### Value Counts and Percentage Distribution:

Unnamed: 0,Value,Count,Percentage
0,Short-term,12643,47.166573
1,Long-term,7434,27.733632
2,Medium-term,6728,25.099795


---

### Column: Status

#### Value Counts and Percentage Distribution:

Unnamed: 0,Value,Count,Percentage
0,Deactive,9564,35.67991
1,Active,6586,24.570043
2,Hard Suspended,5738,21.406454
3,Soft Suspended,4917,18.343593


---

### Column: Status Reason

#### Value Counts and Percentage Distribution:

Unnamed: 0,Value,Count,Percentage
0,Good Service,6586,24.570043
1,Violation of terms,1461,5.450476
2,Fraudulent activity,1430,5.334826
3,Illegal usage,1426,5.319903
4,Non-payment,1421,5.30125
5,Service upgrade,1242,4.633464
6,Document verification,1238,4.618541
7,Technical issue,1225,4.570043
8,Customer request,1212,4.521544
9,Network Issues,1003,3.741839


---

### Column: Segment1

#### Value Counts and Percentage Distribution:

Unnamed: 0,Value,Count,Percentage
0,Prepaid,17206,64.189517
1,Postpaid,9599,35.810483


---

### Column: Segment2

#### Value Counts and Percentage Distribution:

Unnamed: 0,Value,Count,Percentage
0,Residential,16939,63.193434
1,Corporate,5842,21.794441
2,PRO,4024,15.012125


---

In [22]:
# Display the frame's shape, per-column dtype and null counts, and the number
# of duplicated rows.
info.dataframe_summary()

### Shape:

(26805, 14)

### Columns and Metadata:

Unnamed: 0,Data Type,Null Values,Precentage of Nulls
POS_MSISDN,object,0,0.0
Distributer,object,0,0.0
Plan Name,object,0,0.0
Customer Tenure,int64,0,0.0
Tenure Category,object,0,0.0
Status,object,0,0.0
Status Reason,object,0,0.0
Segment1,object,0,0.0
Segment2,object,0,0.0
Churn Flag,int64,0,0.0


### Duplicated Rows:

Unnamed: 0,Duplicated Rows Count
Total,0


In [23]:
# Inspect the distinct values stored in the "Loyalty Points" column.
df["Loyalty Points"].unique()

array([48, 65, 73, 10, 85, 55, 87, 84, 14, 76, 41, 28, 50, 30, 68, 17, 16,
       34, 54, 11, 80, 74, 60, 75, 46, 81, 52, 32, 27, 12, 24, 49, 89, 67,
       29, 78, 70, 83, 77, 23, 42, 82, 39, 88, 57, 63, 45, 69, 31, 37, 61,
       36, 56, 20, 22, 35, 59, 58, 33, 79, 66, 51, 13, 71, 19, 62, 26, 47,
       72, 38, 90, 40, 44, 43, 53, 21, 25, 86, 64, 18, 15])

## SKLearn Encoder

In [24]:
def save_as_pickle(path, variable):
    """Serialize ``variable`` to ``path`` using :mod:`pickle`.

    Any missing parent directory of ``path`` is created first, so the call
    does not fail with ``FileNotFoundError`` when the output folder (for
    example ``variables/``) has not been created yet.

    Parameters
    ----------
    path : str or os.PathLike
        Destination file. It is opened in binary write mode, so an existing
        file at that location is overwritten.
    variable : object
        Any picklable object.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be written.
    pickle.PicklingError
        If ``variable`` cannot be pickled.
    """
    parent_dir = os.path.dirname(path)
    # dirname() is "" for a bare file name; makedirs("") would raise, so only
    # create the directory when the path actually has one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "wb") as file:
        pickle.dump(variable, file)

### POS_MSISDN

In [25]:
# Replace the POS_MSISDN strings with the integer codes assigned by a
# LabelEncoder, then pickle the fitted encoder (presumably so the same mapping
# can be reapplied outside this notebook -- confirm against the consumers).
# NOTE: the column is overwritten in place, so running this cell a second time
# would fit the encoder on the integer codes and overwrite the pickle with an
# encoder that no longer knows the original strings.
le_pos = LabelEncoder()

df["POS_MSISDN"] = le_pos.fit_transform(df["POS_MSISDN"])

save_as_pickle(r"variables/le_pos.pkl",le_pos)

In [26]:
# Replace the Distributer strings with LabelEncoder integer codes and pickle
# the fitted encoder.
# NOTE: the column is overwritten in place, so re-running this cell would refit
# the encoder on the integer codes and overwrite the saved pickle.
le_distributer = LabelEncoder()

df["Distributer"] = le_distributer.fit_transform(df["Distributer"])
save_as_pickle(r"variables/le_distributer.pkl", le_distributer)

In [27]:
# Replace the Plan Name strings with LabelEncoder integer codes and pickle the
# fitted encoder.
# NOTE: the column is overwritten in place, so re-running this cell would refit
# the encoder on the integer codes and overwrite the saved pickle.
le_plan = LabelEncoder()
df["Plan Name"] = le_plan.fit_transform(df["Plan Name"])

save_as_pickle(r"variables/le_plan.pkl", le_plan)

In [28]:
# Replace the Status Reason strings with LabelEncoder integer codes and pickle
# the fitted encoder.
# NOTE: the column is overwritten in place, so re-running this cell would refit
# the encoder on the integer codes and overwrite the saved pickle.
le_reason = LabelEncoder()

df["Status Reason"] = le_reason.fit_transform(df["Status Reason"])

save_as_pickle(r"variables/le_reason.pkl", le_reason)

## Scratch Encoder

### Tenure Category

In [29]:
df["Tenure Category"] = df["Tenure Category"].map({"Short-term": 0, "Medium-term": 1, "Long-term": 2})

### Status

In [30]:

# Integer codes for the four line statuses.
# NOTE(review): this order places "Deactive" between "Soft Suspended" and
# "Active" -- confirm that is the intended ordinal ranking.
status_codes = {"Hard Suspended": 0, "Soft Suspended": 1, "Deactive": 2, "Active": 3}
status_encoded = df["Status"].map(status_codes)

# Series.map yields NaN for every value that is missing from the dict --
# including the integers left behind if this cell is run a second time -- so
# validate before overwriting the column instead of silently producing NaN.
status_unmapped = status_encoded.isna() & df["Status"].notna()
if status_unmapped.any():
    raise ValueError(
        "Unmapped 'Status' values: "
        f"{df.loc[status_unmapped, 'Status'].unique().tolist()}"
    )

df["Status"] = status_encoded

### Segment 1

In [31]:
# Binary encoding of the billing segment.
segment1_codes = {"Prepaid": 0, "Postpaid": 1}
segment1_encoded = df["Segment1"].map(segment1_codes)

# Series.map yields NaN for every value that is missing from the dict --
# including the integers left behind if this cell is run a second time -- so
# validate before overwriting the column instead of silently producing NaN.
segment1_unmapped = segment1_encoded.isna() & df["Segment1"].notna()
if segment1_unmapped.any():
    raise ValueError(
        "Unmapped 'Segment1' values: "
        f"{df.loc[segment1_unmapped, 'Segment1'].unique().tolist()}"
    )

df["Segment1"] = segment1_encoded

### Segment 2

In [32]:
# Integer codes for the three customer segments.
segment2_codes = {"Residential": 0, "Corporate": 1, "PRO": 2}
segment2_encoded = df["Segment2"].map(segment2_codes)

# Series.map yields NaN for every value that is missing from the dict --
# including the integers left behind if this cell is run a second time -- so
# validate before overwriting the column instead of silently producing NaN.
segment2_unmapped = segment2_encoded.isna() & df["Segment2"].notna()
if segment2_unmapped.any():
    raise ValueError(
        "Unmapped 'Segment2' values: "
        f"{df.loc[segment2_unmapped, 'Segment2'].unique().tolist()}"
    )

df["Segment2"] = segment2_encoded

### Customer Segment Change Flag

In [33]:
# Turn the boolean flag into 0/1 so it can be scaled with the other features.
segment_change_codes = {False: 0, True: 1}
segment_change_encoded = df["Customer Segment Change Flag"].map(segment_change_codes)

# Series.map yields NaN for every value that is missing from the dict (for
# example if the flag had been read as text rather than as booleans), so
# validate before overwriting the column instead of silently producing NaN.
# Values that were already null in the source are left as they are.
segment_change_unmapped = (
    segment_change_encoded.isna() & df["Customer Segment Change Flag"].notna()
)
if segment_change_unmapped.any():
    raise ValueError(
        "Unmapped 'Customer Segment Change Flag' values: "
        f"{df.loc[segment_change_unmapped, 'Customer Segment Change Flag'].unique().tolist()}"
    )

df["Customer Segment Change Flag"] = segment_change_encoded

# Scaling

In [34]:
# Set the 'Churn Flag' target aside so it is not rescaled with the features
churn_column = df["Churn Flag"]
dropped_df = df.drop("Churn Flag", axis=1)

# Fit MinMaxScaler on the feature matrix and rescale every feature column.
# The scaler is fitted on a plain NumPy array, exactly as before, so the
# pickled object is unchanged for whoever loads it.
# NOTE(review): the scaler is fitted on all rows; no train/test split is
# visible in this notebook -- confirm the split happens downstream and that
# fitting before it is acceptable.
scaler = MinMaxScaler()
scaled_dropped_df = scaler.fit_transform(dropped_df.to_numpy())

# Rebuild the DataFrame with the original column names AND the original row
# index. Column assignment below aligns on the index, so without it the target
# would only line up when the frame happens to carry a default RangeIndex.
df = pd.DataFrame(
    scaled_dropped_df,
    columns=dropped_df.columns,
    index=dropped_df.index,
)

# Reattach the 'Churn Flag' column (unscaled)
df["Churn Flag"] = churn_column

# Persist the fitted scaler next to the pickled encoders.
save_as_pickle(r"variables/scaler.pkl",scaler)

In [35]:
# Write the encoded and scaled frame to disk without the row index.
# "utf-8-sig" is the UTF-8 codec variant that prepends a byte-order mark.
df.to_csv(r"data/processed_data.csv", index=False, encoding="utf-8-sig")