In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import os
df = pd.read_csv('my_telecom_customer_churn.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CustomerID       2000 non-null   object
 1   Gender           2000 non-null   object
 2   Age              2000 non-null   int64 
 3   Tenure_Months    2000 non-null   int64 
 4   ContractType     2000 non-null   object
 5   MonthlyCharges   2000 non-null   int64 
 6   InternetService  2000 non-null   object
 7   TechSupport      2000 non-null   object
 8   OnlineSecurity   2000 non-null   object
 9   PaymentMethod    2000 non-null   object
 10  Complaints       2000 non-null   object
 11  TotalCharges     2000 non-null   int64 
 12  Churn            2000 non-null   object
dtypes: int64(4), object(9)
memory usage: 203.3+ KB


In [3]:
for column in df.columns:
    print(f"Unique values in column '{column}':")
    print(df[column].unique())
    print('-' * 50)

Unique values in column 'CustomerID':
['CUST0001' 'CUST0002' 'CUST0003' ... 'CUST1998' 'CUST1999' 'CUST2000']
--------------------------------------------------
Unique values in column 'Gender':
['Male' 'Female']
--------------------------------------------------
Unique values in column 'Age':
[64 29 33 41 36 25 48 38 34 40 54 23 42 35 32 43 58 62 49 27 24 39 31 26
 68 65 67 46 47 69 56 52 59 61 51 20 60 57 66 21 30 50 19 63 37 22 18 28
 55 45 44 53]
--------------------------------------------------
Unique values in column 'Tenure_Months':
[53 59 41 68  8  2 55 16 62 48 30 12 35 11 64 50 21 69 20 67 44 33 61  9
 65 45 47 40 54 46 15 19 26 52 22 51 38 29  4 71 28 66 34  1  6 25 17 27
 57 14 56 49 13 43 39 37 18 36 63 10  3 70 23 24 60 32  7 31 42  5 58]
--------------------------------------------------
Unique values in column 'ContractType':
['Month-to-Month' 'One-Year' 'Two-Year']
--------------------------------------------------
Unique values in column 'MonthlyCharges':
[115 107  6

In [4]:
df.nunique()

CustomerID         2000
Gender                2
Age                  52
Tenure_Months        71
ContractType          3
MonthlyCharges      100
InternetService       3
TechSupport           2
OnlineSecurity        2
PaymentMethod         4
Complaints            2
TotalCharges       1678
Churn                 2
dtype: int64

In [5]:
df

Unnamed: 0,CustomerID,Gender,Age,Tenure_Months,ContractType,MonthlyCharges,InternetService,TechSupport,OnlineSecurity,PaymentMethod,Complaints,TotalCharges,Churn
0,CUST0001,Male,64,53,Month-to-Month,115,FiberOptic,No,Yes,Cash,No,6108,No
1,CUST0002,Female,29,59,Month-to-Month,107,DSL,No,No,BankTransfer,Yes,6337,Yes
2,CUST0003,Male,33,41,Month-to-Month,66,FiberOptic,No,Yes,BankTransfer,Yes,2625,No
3,CUST0004,Male,41,68,Month-to-Month,63,DSL,No,Yes,CreditCard,No,4350,No
4,CUST0005,Male,36,68,One-Year,28,No,Yes,Yes,CreditCard,No,1954,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,CUST1996,Male,50,36,Two-Year,69,FiberOptic,No,No,BankTransfer,No,2520,No
1996,CUST1997,Female,69,19,Month-to-Month,21,FiberOptic,Yes,No,BankTransfer,No,299,No
1997,CUST1998,Male,42,38,Month-to-Month,92,No,No,No,CreditCard,No,3511,Yes
1998,CUST1999,Male,31,70,Month-to-Month,75,DSL,Yes,Yes,EWallet,No,5312,No


In [6]:
# Separate features (X) and target (y)
X = df.drop(['Churn', 'CustomerID'], axis=1)
y = df['Churn']

print("Sample dataset created successfully.")
print("Features (X):", list(X.columns))

# --- 2. Train-test split ---
# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing sets.")

# --- 3. Preprocess and Train the model ---
print("Training the RandomForestClassifier model...")

# Identify categorical and numerical columns
categorical_features = ['Gender', 'ContractType', 'InternetService', 'TechSupport', 'OnlineSecurity', 'PaymentMethod', 'Complaints']
numerical_features = ['Age', 'Tenure_Months', 'MonthlyCharges', 'TotalCharges']

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'  # Keep numerical columns as they are
)

# Create a full pipeline with the preprocessor and the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the model pipeline
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

# --- 4. Save the trained model pipeline ---
# Use joblib to save the full pipeline to a file.
model_filename = 'churn_model_pipeline.pkl'
joblib.dump(model_pipeline, model_filename)

# Check if the file was saved successfully
if os.path.exists(model_filename):
    print(f"Model pipeline saved successfully as '{model_filename}'.")
else:
    print(f"Error: Model pipeline was not saved to '{model_filename}'.")

Sample dataset created successfully.
Features (X): ['Gender', 'Age', 'Tenure_Months', 'ContractType', 'MonthlyCharges', 'InternetService', 'TechSupport', 'OnlineSecurity', 'PaymentMethod', 'Complaints', 'TotalCharges']
Data split into training and testing sets.
Training the RandomForestClassifier model...
Model training complete.
Model pipeline saved successfully as 'churn_model_pipeline.pkl'.


# testing

In [1]:
# import pandas as pd
import joblib
import os

# Define the filename of the saved pipeline
model_filename = 'churn_model_pipeline.pkl'

# Check if the model file exists
if not os.path.exists(model_filename):
    print(f"Error: The file '{model_filename}' was not found.")
    print("Please make sure you have run the previous script to train and save the model pipeline.")
else:
    try:
        # Load the saved model pipeline
        print(f"Loading the model pipeline from '{model_filename}'...")
        loaded_model = joblib.load(model_filename)
        print("Model loaded successfully.")

        print("\n--- Please enter the details for the new customer ---")
        
        # Get user input for each feature
        # Note: Provide expected values for categorical inputs to avoid errors
        gender = input("Gender (Male/Female): ")
        age = int(input("Age: "))
        tenure = int(input("Tenure in Months: "))
        contract = input("Contract Type (Month-to-Month/One-Year/Two-Year): ")
        monthly_charges = float(input("Monthly Charges: "))
        internet_service = input("Internet Service (FiberOptic/DSL/No): ")
        tech_support = input("Tech Support (Yes/No): ")
        online_security = input("Online Security (Yes/No): ")
        payment_method = input("Payment Method (Cash/BankTransfer/CreditCard/EWallet): ")
        complaints = input("Complaints (Yes/No): ")
        total_charges = float(input("Total Charges: "))

        # Create a dictionary from the user input
        user_input_data = {
            'Gender': [gender],
            'Age': [age],
            'Tenure_Months': [tenure],
            'ContractType': [contract],
            'MonthlyCharges': [monthly_charges],
            'InternetService': [internet_service],
            'TechSupport': [tech_support],
            'OnlineSecurity': [online_security],
            'PaymentMethod': [payment_method],
            'Complaints': [complaints],
            'TotalCharges': [total_charges]
        }

        # Convert the dictionary to a pandas DataFrame
        # The pipeline expects input in this format
        user_input_df = pd.DataFrame(user_input_data)

        # Make a prediction using the loaded pipeline
        print("\nMaking a prediction...")
        prediction = loaded_model.predict(user_input_df)

        # Display the result
        print("\n--- Prediction Result ---")
        if prediction[0] == 'Yes':
            print("The model predicts this customer is LIKELY TO CHURN.")
        else:
            print("The model predicts this customer is UNLIKELY TO CHURN.")
    
    except Exception as e:
        print(f"\nAn error occurred during prediction: {e}")
        print("Please check if the input values are in the correct format (e.g., numbers for numerical features).")

Loading the model pipeline from 'churn_model_pipeline.pkl'...
Model loaded successfully.

--- Please enter the details for the new customer ---


Gender (Male/Female):  
Age:  



An error occurred during prediction: invalid literal for int() with base 10: ''
Please check if the input values are in the correct format (e.g., numbers for numerical features).


In [9]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd

In [2]:
oo = pd.read_csv("my_telecom_customer_churn.csv")

In [3]:
oo

Unnamed: 0,CustomerID,Gender,Age,Tenure_Months,ContractType,MonthlyCharges,InternetService,TechSupport,OnlineSecurity,PaymentMethod,Complaints,TotalCharges,Churn
0,CUST0001,Male,64,53,Month-to-Month,115,FiberOptic,No,Yes,Cash,No,6108,No
1,CUST0002,Female,29,59,Month-to-Month,107,DSL,No,No,BankTransfer,Yes,6337,Yes
2,CUST0003,Male,33,41,Month-to-Month,66,FiberOptic,No,Yes,BankTransfer,Yes,2625,No
3,CUST0004,Male,41,68,Month-to-Month,63,DSL,No,Yes,CreditCard,No,4350,No
4,CUST0005,Male,36,68,One-Year,28,No,Yes,Yes,CreditCard,No,1954,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,CUST1996,Male,50,36,Two-Year,69,FiberOptic,No,No,BankTransfer,No,2520,No
1996,CUST1997,Female,69,19,Month-to-Month,21,FiberOptic,Yes,No,BankTransfer,No,299,No
1997,CUST1998,Male,42,38,Month-to-Month,92,No,No,No,CreditCard,No,3511,Yes
1998,CUST1999,Male,31,70,Month-to-Month,75,DSL,Yes,Yes,EWallet,No,5312,No
