In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [2]:
data = pd.read_csv('Databel_Data.csv')

In [3]:
data = data.drop(columns=["Customer ID", "Churn Category", "Churn Reason"], errors='ignore')

In [4]:
categorical_cols = data.select_dtypes(include=["object"]).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    data[col] = label_encoders[col].fit_transform(data[col].astype(str))

In [5]:
imputer = SimpleImputer(strategy="mean")
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

data.sample(10)

Unnamed: 0,Churn Label,Account Length (in months),Local Calls,Local Mins,Intl Calls,Intl Mins,Intl Active,Intl Plan,Extra International Charges,Customer Service Calls,...,Age,Under 30,Senior,Group,Number of Customers in Group,Device Protection & Online Backup,Contract Type,Payment Method,Monthly Charge,Total Charges
1049,0.0,59.0,289.0,648.8,0.0,0.0,0.0,0.0,0.0,1.0,...,38.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,59.0,3485.0
5647,0.0,7.0,50.0,112.5,28.0,82.6,1.0,0.0,27.5,2.0,...,46.0,0.0,0.0,1.0,6.0,0.0,1.0,0.0,10.0,69.0
2412,0.0,34.0,211.0,480.2,0.0,0.0,0.0,0.0,0.0,0.0,...,26.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,22.0,757.0
3615,1.0,20.0,113.0,261.2,0.0,0.0,0.0,0.0,259.1,0.0,...,80.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,49.0,975.0
4338,1.0,1.0,5.0,15.0,4.0,10.5,1.0,0.0,2.1,0.0,...,54.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0,39.0
5052,1.0,1.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,4.0,...,81.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,37.0,37.0
5211,0.0,8.0,40.0,88.2,0.0,0.0,0.0,0.0,0.0,0.0,...,26.0,1.0,0.0,1.0,6.0,1.0,0.0,1.0,37.0,298.0
2473,0.0,20.0,123.0,285.1,0.0,0.0,0.0,0.0,0.0,0.0,...,39.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,72.0,1462.0
1362,0.0,30.0,69.0,150.9,60.0,306.0,1.0,0.0,61.2,0.0,...,24.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,59.0,1771.0
2688,0.0,71.0,376.0,778.9,0.0,0.0,0.0,0.0,0.0,0.0,...,37.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,57.0,4025.0


In [6]:
X = data.drop(columns=["Churn Label"], errors='ignore')
y = data["Churn Label"]


In [7]:
print("Class distribution before splitting:")
print(y.value_counts())


Class distribution before splitting:
Churn Label
0.0    4891
1.0    1796
Name: count, dtype: int64


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [9]:
print("Class distribution in training set:")
print(y_train.value_counts())
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("NaN values in X_train:", X_train.isnull().sum().sum())
print("NaN values in y_train:", y_train.isnull().sum())
print("First few rows of X_train:")
print(X_train.head())
print("First few values of y_train:")
print(y_train.head())


Class distribution in training set:
Churn Label
0.0    3423
1.0    1257
Name: count, dtype: int64
Shape of X_train: (4680, 25)
Shape of y_train: (4680,)
NaN values in X_train: 0
NaN values in y_train: 0
First few rows of X_train:
      Account Length (in months)  Local Calls  Local Mins  Intl Calls  \
4504                        59.0        372.0       824.5       236.0   
4998                        42.0        110.0       381.0         0.0   
1093                        60.0        221.0       536.5         0.0   
3708                        33.0         51.0       230.7         0.0   
4223                        63.0        247.0       501.9       315.0   

      Intl Mins  Intl Active  Intl Plan  Extra International Charges  \
4504      737.5          1.0        0.0                        368.8   
4998        0.0          0.0        0.0                          0.0   
1093        0.0          0.0        0.0                          0.0   
3708        0.0          0.0        1.0    

In [10]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name} Accuracy: {accuracy:.2f}")
    except Exception as e:
        print(f"Error with {model_name}: {e}")


In [11]:
logistic_model = LogisticRegression(max_iter=200)
evaluate_model(logistic_model, X_train, X_test, y_train, y_test, "Logistic Regression")


Logistic Regression Accuracy: 0.81


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
nb_model = GaussianNB()
evaluate_model(nb_model, X_train, X_test, y_train, y_test, "Naive Bayes")


Naive Bayes Accuracy: 0.76


In [13]:
knn_model = KNeighborsClassifier(n_neighbors=5)
evaluate_model(knn_model, X_train, X_test, y_train, y_test, "K-Nearest Neighbors")


K-Nearest Neighbors Accuracy: 0.71


In [14]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
evaluate_model(rf_model, X_train, X_test, y_train, y_test, "Random Forest")


Random Forest Accuracy: 0.91


In [15]:
dt_model = DecisionTreeClassifier(random_state=42)
evaluate_model(dt_model, X_train, X_test, y_train, y_test, "Decision Tree")


Decision Tree Accuracy: 0.85
