In [None]:
# Step 1: Setting up the environment
# Install necessary libraries if you're not using Colab
# !pip install pandas numpy scikit-learn

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Step 2: Load the dataset
data = pd.read_csv('churn_prediction.csv')

data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,Male,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,Male,0.0,self_employed,,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,Male,0.0,salaried,146.0,2,41,,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,,,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,Male,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


In [None]:
# Step 3: Handling Missing Data
# Check for missing values
missing_values = data.isnull().sum() # Changed 'df' to 'data'
print("Missing Values:\n", missing_values)

Missing Values:
 customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
days_since_last_transaction       3223
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
dtype: int64


In [None]:
# You can either drop rows with missing values or fill them
# Let's fill missing numerical values with median and categorical with mode
data.fillna(data.median(numeric_only=True), inplace=True)  # Changed 'df' to 'data'
for column in data.select_dtypes(include=['object']).columns:  # Changed 'df' to 'data'
    data[column].fillna(data[column].mode()[0], inplace=True)  # Changed 'df' to 'data'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)  # Changed 'df' to 'data'


In [None]:
# Step 4: Encoding Categorical Variables
# Identify categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns # Changed 'df' to 'data'

In [None]:

# Apply Label Encoding for binary variables
label_encoder = LabelEncoder()
for col in categorical_cols:
    if data[col].nunique() == 2:  # Binary categorical variables # Changed 'df' to 'data'
        data[col] = label_encoder.fit_transform(data[col]) # Changed 'df' to 'data'
    else:  # One-Hot Encoding for multiple categories
        data = pd.get_dummies(data, columns=[col], drop_first=True) # Changed 'df' to 'data'

In [None]:
# Step 5: Feature Scaling
# Apply Standardization (mean=0, std=1) for numerical features
scaler = StandardScaler()
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns # Changed 'df' to 'data'
data[numerical_cols] = scaler.fit_transform(data[numerical_cols]) # Changed 'df' to 'data'

In [None]:
# Step 4: Encoding Categorical Variables
# Identify categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns # Changed 'df' to 'data'

# Apply Label Encoding for binary variables
label_encoder = LabelEncoder()
# Create a copy of categorical_cols to iterate over
# This prevents issues when the original list is modified by get_dummies
for col in list(categorical_cols):
    if data[col].nunique() == 2:  # Binary categorical variables # Changed 'df' to 'data'
        data[col] = label_encoder.fit_transform(data[col]) # Changed 'df' to 'data'
    else:  # One-Hot Encoding for multiple categories
        data = pd.get_dummies(data, columns=[col], drop_first=True) # Changed 'df' to 'data'

In [None]:
churn_encoded_col = next((col for col in data.columns if 'Churn' in col), None)

# Check if churn_encoded_col was found
if churn_encoded_col:
    # Proceed with your logic using churn_encoded_col
    print(f"Churn column found: {churn_encoded_col}")
else:
    # Handle the case where 'Churn' is not found in any column
    print("Error: No column containing 'Churn' was found in the DataFrame.")
    # You can add further error handling or alternative logic here

Error: No column containing 'Churn' was found in the DataFrame.


In [None]:
# Step 6: Splitting the data into Training and Testing Sets

# Define the target variable (Churn) and feature set
# Let's first ensure we find the correct column for the churn target (assuming the column name contains 'Churn')
churn_encoded_col = next((col for col in data.columns if 'Churn' in col), None) # Changed 'df' to 'data'

In [None]:
# Step 6: Splitting the data into Training and Testing Sets

# Define the target variable (Churn) and feature set
# Assume the column name contains 'Churn' after encoding
churn_encoded_col = next((col for col in data.columns if 'Churn' in col), None)

# Check if churn column is found
if churn_encoded_col:
    # Separate target variable and features
    X = data.drop(columns=[churn_encoded_col])  # Features (all columns except churn)
    y = data[churn_encoded_col]  # Target (Churn)
    print(f"Churn column found: {churn_encoded_col}")
else:
    # If 'Churn' is not found directly, check for encoded columns (e.g., 'Churn_Yes')
    # Updated logic to find columns starting with 'Churn_'
    churn_encoded_cols = [col for col in data.columns if col.startswith('Churn_')]
    if churn_encoded_cols:
        # Assuming the first column starting with 'Churn_' is the target
        churn_encoded_col = churn_encoded_cols[0]
        # Separate target variable and features using the encoded column
        X = data.drop(columns=[churn_encoded_col])  # Features (all columns except churn)
        y = data[churn_encoded_col]  # Target (Churn)
        print(f"Churn column found (encoded): {churn_encoded_col}")
    else:
        print("Error: No column containing 'Churn' was found in the DataFrame.")
        print(f"Available columns: {data.columns.tolist()}")
        # Handle this case if needed (e.g., raise an exception or stop execution)
        # raise ValueError("Churn column not found!")  # Consider removing or handling differently
        # For example, you could try to identify the Churn column manually:
        possible_churn_cols = [col for col in data.columns if 'churn' in col.lower()]
        if possible_churn_cols:
            print(f"Possible Churn columns (case-insensitive): {possible_churn_cols}")
            # You could then ask the user to select the correct column or make an assumption
            churn_encoded_col = possible_churn_cols[0]  # Assuming the first one is correct
            X = data.drop(columns=[churn_encoded_col])
            y = data[churn_encoded_col]
        else:
            raise ValueError("Churn column not found!")

Error: No column containing 'Churn' was found in the DataFrame.
Available columns: ['customer_id', 'vintage', 'age', 'gender', 'dependents', 'city', 'customer_nw_category', 'branch_code', 'days_since_last_transaction', 'current_balance', 'previous_month_end_balance', 'average_monthly_balance_prevQ', 'average_monthly_balance_prevQ2', 'current_month_credit', 'previous_month_credit', 'current_month_debit', 'previous_month_debit', 'current_month_balance', 'previous_month_balance', 'churn', 'occupation_retired', 'occupation_salaried', 'occupation_self_employed', 'occupation_student']
Possible Churn columns (case-insensitive): ['churn']


In [None]:

# Now split the data into training (80%) and testing (20%) sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Step 7: Overview of Churn Prediction
# Here we are applying a simple machine learning model like logistic regression for churn prediction

# Import Logistic Regression model
from sklearn.linear_model import LogisticRegression

In [None]:
# Initialize the model
log_reg = LogisticRegression()

In [None]:
import pandas as pd

# Assuming 'y' is your original continuous target variable
# Divide into 2 categories (quantiles), handle potential duplicate edges

# Try qcut with duplicates='drop'
try:
    y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
except ValueError:
    # If duplicates='drop' fails due to too few unique bins, adjust labels:
    # If qcut results in only one bin, use a single label
    try:
        y_categorical = pd.qcut(y, q=2, labels=[0], duplicates='drop')
    except ValueError:
        #If qcut results in 3 bins and you want to drop the duplicate to maintain 2 bins
        # y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
        print("Warning: qcut resulted in non-unique bins even after dropping duplicates.")
        #If there are still errors you may need to adjust qcut parameters or further investigate data
        print("Consider adjusting the 'q' parameter in qcut or further investigate data for issues.")
        #Fallback to assign all values to a single category if qcut continues to fail:
        y_categorical = pd.Series(0, index=y.index, name=y.name)

In [None]:
import pandas as pd

# Assuming 'y' is your original continuous target variable
# Divide into 2 categories (quantiles), handle potential duplicate edges

# Try qcut with duplicates='raise' to explicitly identify the issue
try:
    y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='raise')
except ValueError as e:
    # Log the original error message for debugging
    print(f"Original ValueError: {e}")

    # Check unique values in the original 'y' variable
    unique_y = y.unique()
    print(f"Unique values in 'y': {unique_y}")

    # Check unique values after binning, if possible
    try:
        binned_y = pd.qcut(y, q=2, labels=False, duplicates='drop')
        unique_binned_y = binned_y.unique()
        print(f"Unique values after binning with qcut: {unique_binned_y}")
    except ValueError:
        print("Unable to bin data with qcut for analysis.")

    # If 'y' is indeed continuous, but resulting in only one bin:
    if len(unique_y) > 1:
        print("Warning: 'y' appears continuous but results in only one category when using qcut.")


Original ValueError: Bin edges must be unique: Index([-0.47695803960333255, -0.47695803960333255, 2.0966204927202003], dtype='float64', name='churn').
You can drop duplicate edges by setting the 'duplicates' kwarg
Unique values in 'y': [-0.47695804  2.09662049]
Unique values after binning with qcut: [0]


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# ... (Your existing code to load and prepare data) ...

# Assuming 'y' is your original continuous target variable
# Divide into 2 categories (quantiles), handle potential duplicate edges

# Try qcut with duplicates='drop'
try:
    y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
except ValueError:
    # If duplicates='drop' fails due to too few unique bins, adjust labels:
    # If qcut results in only one bin, use a single label
    try:
        y_categorical = pd.qcut(y, q=2, labels=[0], duplicates='drop')
    except ValueError:
        #If qcut results in 3 bins and you want to drop the duplicate to maintain 2 bins
        # y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
        print("Warning: qcut resulted in non-unique bins even after dropping duplicates.")
        #If there are still errors you may need to adjust qcut parameters or further investigate data
        print("Consider adjusting the 'q' parameter in qcut or further investigate data for issues.")
        #Fallback to assign all values to a single category if qcut continues to fail:
        y_categorical = pd.Series(0, index=y.index, name=y.name)

#Check if y_categorical has only one unique value
if len(y_categorical.unique()) < 2:
    print("Warning: y_categorical has only one unique value. Logistic Regression requires at least two classes.")
    # Investigate why y_categorical has only one unique value and adjust data processing accordingly.
    # This might involve adjusting the qcut parameters or handling class imbalance.
    # Example: If data is imbalanced, you might consider oversampling or undersampling techniques.
else:
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.25, random_state=16)

    # Initialize the model
    log_reg = LogisticRegression()

    # Fit the model to the training data
    log_reg.fit(X_train, y_train)

    # Step 8: Predict on the test set
    # Now let's make predictions using the test data
    y_pred = log_reg.predict(X_test)



In [None]:

# Step 9: Model Evaluation
# Evaluate the model's performance using common classification metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ... (Your existing code to load and prepare data) ...

# Assuming 'y' is your original continuous target variable
# Divide into 2 categories (quantiles), handle potential duplicate edges

# Try qcut with duplicates='drop'
try:
    y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
except ValueError:
    # If duplicates='drop' fails due to too few unique bins, adjust labels:
    # If qcut results in only one bin, use a single label
    try:
        y_categorical = pd.qcut(y, q=2, labels=[0], duplicates='drop')
    except ValueError:
        #If qcut results in 3 bins and you want to drop the duplicate to maintain 2 bins
        # y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
        print("Warning: qcut resulted in non-unique bins even after dropping duplicates.")
        #If there are still errors you may need to adjust qcut parameters or further investigate data
        print("Consider adjusting the 'q' parameter in qcut or further investigate data for issues.")
        #Fallback to assign all values to a single category if qcut continues to fail:
        y_categorical = pd.Series(0, index=y.index, name=y.name)

#Check if y_categorical has only one unique value
if len(y_categorical.unique()) < 2:
    print("Warning: y_categorical has only one unique value. Logistic Regression requires at least two classes.")
    print("Investigate why y_categorical has only one unique value and adjust data processing accordingly.")
    print("This might involve adjusting the qcut parameters or handling class imbalance.")
    print("Example: If data is imbalanced, you might consider oversampling or undersampling techniques.")
    # Here you could potentially:
    # 1. Adjust the `q` parameter in `pd.qcut`
    # 2. Implement oversampling or undersampling to address class imbalance
    # 3. If the target variable has no variability, consider if a classification model is appropriate
    # For now, we'll skip model training and evaluation
else:
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.25, random_state=16)

    # Initialize the model
    log_reg = LogisticRegression()

    # Fit the model to the training data
    log_reg.fit(X_train, y_train)

    # Step 8: Predict on the test set
    # Now let's make predictions using the test data
    y_pred = log_reg.predict(X_test)

    # Step 9: Model Evaluation
    # Evaluate the model's performance using common classification metrics

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")



Investigate why y_categorical has only one unique value and adjust data processing accordingly.
This might involve adjusting the qcut parameters or handling class imbalance.
Example: If data is imbalanced, you might consider oversampling or undersampling techniques.


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ... (Your existing code to load and prepare data) ...

# Assuming 'y' is your original continuous target variable
# Divide into 2 categories (quantiles), handle potential duplicate edges

# Try qcut with duplicates='drop'
try:
    y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
except ValueError:
    # If duplicates='drop' fails due to too few unique bins, adjust labels:
    # If qcut results in only one bin, use a single label
    try:
        y_categorical = pd.qcut(y, q=2, labels=[0], duplicates='drop')
    except ValueError:
        #If qcut results in 3 bins and you want to drop the duplicate to maintain 2 bins
        # y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
        print("Warning: qcut resulted in non-unique bins even after dropping duplicates.")
        #If there are still errors you may need to adjust qcut parameters or further investigate data
        print("Consider adjusting the 'q' parameter in qcut or further investigate data for issues.")
        #Fallback to assign all values to a single category if qcut continues to fail:
        y_categorical = pd.Series(0, index=y.index, name=y.name)

#Check if y_categorical has only one unique value
if len(y_categorical.unique()) < 2:
    print("Warning: y_categorical has only one unique value. Logistic Regression requires at least two classes.")
    print("Investigate why y_categorical has only one unique value and adjust data processing accordingly.")
    print("This might involve adjusting the qcut parameters or handling class imbalance.")
    print("Example: If data is imbalanced, you might consider oversampling or undersampling techniques.")
    # Here you could potentially:
    # 1. Adjust the `q` parameter in `pd.qcut`
    # 2. Implement oversampling or undersampling to address class imbalance
    # 3. If the target variable has no variability, consider if a classification model is appropriate
    # For now, we'll skip model training and evaluation
    # Initialize evaluation metrics to None to avoid NameError
    accuracy = None
    precision = None
    recall = None
    f1 = None
else:
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.25, random_state=16)

    # Initialize the model
    log_reg = LogisticRegression()

    # Fit the model to the training data
    log_reg.fit(X_train, y_train)

    # Step 8: Predict on the test set
    # Now let's make predictions using the test data
    y_pred = log_reg.predict(X_test)

    # Step 9: Model Evaluation
    # Evaluate the model's performance using common classification metrics

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
```python
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ... (Your existing code to load and prepare data) ...

# Assuming 'y' is your original continuous target variable
# Divide into 2 categories (quantiles), handle potential duplicate edges

# Try qcut with duplicates='drop'
try:
    y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
except ValueError:
    # If duplicates='drop' fails due to too few unique bins, adjust labels:
    # If qcut results in only one bin, use a single label
    try:
        y_categorical = pd.qcut(y, q=2, labels=[0], duplicates='drop')
    except ValueError:
        #If qcut results in 3 bins and you want to drop the duplicate to maintain 2 bins
        # y_categorical = pd.qcut(y, q=2, labels=[0, 1], duplicates='drop')
        print("Warning: qcut resulted in non-unique bins even after dropping duplicates.")
        #If there are still errors you may need to adjust qcut parameters or further investigate data
        print("Consider adjusting the 'q' parameter in qcut or further investigate data for issues.")
        #Fallback to assign all values to a single category if qcut continues to fail:
        y_categorical = pd.Series(0, index=y.index, name=y.name)

#Check if y_categorical has only one unique value
if len(y_categorical.unique()) < 2:
    print("Warning: y_categorical has only one unique value. Logistic Regression requires at least two classes.")
    print("Investigate why y_categorical has only one unique value and adjust data processing accordingly.")
    print("This might involve adjusting the qcut parameters or handling class imbalance.")
    print("Example: If data is imbalanced, you might consider oversampling or undersampling techniques.")
    # Here you could potentially:
    # 1. Adjust the `q` parameter in `pd.qcut`
    # 2. Implement oversampling or undersampling to address class imbalance
    # 3. If the target variable has no variability, consider if a classification model is appropriate
    # For now, we'll skip model training and evaluation
    # Initialize evaluation metrics to None to avoid NameError
    accuracy = None
    precision = None
    recall = None
    f1 = None
else:
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.25, random_state=16)

    # Initialize the model
    log_reg = LogisticRegression()

    # Fit the model to the training data
    log_reg.fit(X_train, y_train)

    # Step 8: Predict on the test set
    # Now let's make predictions using the test data
    y_pred = log_reg.predict(X_test)

    # Step 9: Model Evaluation
    # Evaluate the model's performance using common classification metrics

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy}"

SyntaxError: invalid syntax (<ipython-input-57-29a7bae324c1>, line 63)