In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
warnings.filterwarnings("ignore")

In [2]:
# Location of the Telco churn CSV. Set the TELCO_CHURN_CSV environment variable
# to point at another copy of the file; when it is unset, the original local
# path is used so existing runs keep working.
DATA_PATH = os.environ.get("TELCO_CHURN_CSV", r"C:\Users\Ūśēr̥\Downloads/Telco-Customer-Churn.csv")
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Summarise column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
# List the distinct values of every column, leaving a blank line after each.
for column_name in df.columns:
    print(column_name, ":", df[column_name].unique(), end="\n\n")

customerID : ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']

gender : ['Female' 'Male']

SeniorCitizen : [0 1]

Partner : ['Yes' 'No']

Dependents : ['No' 'Yes']

tenure : [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]

PhoneService : ['No' 'Yes']

MultipleLines : ['No phone service' 'No' 'Yes']

InternetService : ['DSL' 'Fiber optic' 'No']

OnlineSecurity : ['No' 'Yes' 'No internet service']

OnlineBackup : ['Yes' 'No' 'No internet service']

DeviceProtection : ['No' 'Yes' 'No internet service']

TechSupport : ['No' 'Yes' 'No internet service']

StreamingTV : ['No' 'Yes' 'No internet service']

StreamingMovies : ['No' 'Yes' 'No internet service']

Contract : ['Month-to-month' 'One year' 'Two year']

PaperlessBilling : ['Yes' 'No']

PaymentMethod : ['Electronic check' 'Maile

In [6]:
# customerID is an identifier column and is not used as a feature.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["customerID"], errors="ignore")

In [7]:
# Show the column names that remain in the frame.
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

### `Converting TotalCharges to Float`

In [8]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN.
df["TotalCharges"] = df["TotalCharges"].pipe(pd.to_numeric, errors="coerce")

In [9]:
# Inspect the distinct TotalCharges values after the numeric conversion.
df["TotalCharges"].unique()

array([  29.85, 1889.5 ,  108.15, ...,  346.45,  306.6 , 6844.5 ],
      shape=(6531,))

In [10]:
# Columns that use "No internet service" as a third level; fold it into "No".
internet_addon_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
df[internet_addon_cols] = df[internet_addon_cols].replace(
    to_replace="No internet service", value="No"
)

In [11]:
# Fold "No phone service" into "No" for MultipleLines.
df["MultipleLines"] = df["MultipleLines"].replace(to_replace="No phone service", value="No")

In [12]:
# Feature matrix: every column except the target.
X = df.drop(columns="Churn")
# Target as a 1-D Series. scikit-learn estimators expect y of shape
# (n_samples,); a one-column DataFrame only works through an implicit
# conversion that emits DataConversionWarning (silenced in this notebook by
# warnings.filterwarnings("ignore")).
y = df["Churn"]

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [14]:
# All-in-one baseline: split columns by dtype, encode, impute, scale, then
# sweep the linear-SVC regularisation strength C. The cells that follow repeat
# these same stages one at a time.

# --- Split features into numeric and non-numeric views ---
X_train_num = X_train.select_dtypes(include=["int64","float64"])
X_train_cat = X_train.select_dtypes(exclude=["int64","float64"])
X_test_num = X_test.select_dtypes(include=["int64","float64"])
X_test_cat = X_test.select_dtypes(exclude=["int64","float64"])

# --- Column groups for the two encoders ---
one_hot_cols = ['gender','Partner','Dependents','PhoneService','PaperlessBilling','OnlineSecurity','OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV','StreamingMovies','PaymentMethod','MultipleLines']
ordinal_cols = ['Contract','InternetService']

# --- One-hot encoding (fit on train only, reuse on test) ---
ohe_encoder = OneHotEncoder(drop="first",sparse_output=False)
ohe_train = pd.DataFrame(ohe_encoder.fit_transform(X_train_cat[one_hot_cols]),columns=ohe_encoder.get_feature_names_out(one_hot_cols))
ohe_test = pd.DataFrame(ohe_encoder.transform(X_test_cat[one_hot_cols]),columns=ohe_encoder.get_feature_names_out(one_hot_cols))

# --- Ordinal encoding with an explicit category order per column ---
ordinal_encoder_contract = OrdinalEncoder(categories=[['Month-to-month', 'One year' ,'Two year']])
X_train_cat["Contract"] = ordinal_encoder_contract.fit_transform(X_train[["Contract"]])
X_test_cat["Contract"] = ordinal_encoder_contract.transform(X_test[["Contract"]])
ordinal_encoder_internet = OrdinalEncoder(categories=[['No','DSL','Fiber optic']])
X_train_cat["InternetService"] = ordinal_encoder_internet.fit_transform(X_train[["InternetService"]])
X_test_cat["InternetService"] = ordinal_encoder_internet.transform(X_test[["InternetService"]])

# --- Median imputation of numeric columns (statistics learned on train) ---
imputer = SimpleImputer(strategy="median")
X_train_num  = pd.DataFrame(imputer.fit_transform(X_train_num),columns = X_train_num.columns)
X_test_num = pd.DataFrame(imputer.transform(X_test_num),columns = X_test_num.columns)

# --- Assemble the final matrices; indexes are reset so the parts align row by row ---
X_train_cleaned = pd.concat([X_train_cat[ordinal_cols].reset_index(drop=True),ohe_train.reset_index(drop=True),X_train_num.reset_index(drop=True)],axis=1)
X_test_cleaned = pd.concat([X_test_cat[ordinal_cols].reset_index(drop=True),ohe_test.reset_index(drop=True),X_test_num.reset_index(drop=True)],axis=1)

# --- Min-max scaling (fit on train only); the results are NumPy arrays ---
scaler = MinMaxScaler()
X_train_cleaned = scaler.fit_transform(X_train_cleaned)
X_test_cleaned = scaler.transform(X_test_cleaned)

# --- Sweep C for a linear SVC and report test accuracy for each value ---
for i in [0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5]:
    sv = SVC(kernel = "linear" , C=i)
    sv.fit(X_train_cleaned,y_train)
    y_pred = sv.predict(X_test_cleaned)
    print(f"{i} {accuracy_score(y_test , y_pred)*100:.2f}")

# --- Refit with C=2.0 and report its test accuracy ---
sv = SVC(kernel = "linear" , C=2.0)
sv.fit(X_train_cleaned,y_train)
y_pred = sv.predict(X_test_cleaned)
print(f"Accuracy {accuracy_score(y_test , y_pred)*100:.2f}")

0.5 82.04
1.0 82.04
1.5 81.97
2.0 82.11
2.5 82.11
3.0 82.11
3.5 82.04
4.0 82.04
4.5 81.97
5.0 82.04
5.5 81.97
6.0 81.97
6.5 81.97
Accuracy 82.11


In [15]:
# Split each feature frame into its numeric columns and everything else.
numeric_kinds = ["int64", "float64"]
X_train_num, X_test_num = (
    frame.select_dtypes(include=numeric_kinds) for frame in (X_train, X_test)
)
X_train_cat, X_test_cat = (
    frame.select_dtypes(exclude=numeric_kinds) for frame in (X_train, X_test)
)

In [16]:
# Re-check dtypes and non-null counts of the frame at this point in the notebook.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [17]:
# Count missing values per column.
# NOTE(review): the NaNs reported for TotalCharges presumably come from the
# errors="coerce" conversion earlier in the notebook — confirm.
df.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [18]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): no cell visible in this notebook removes duplicate rows —
# confirm that keeping them is intended.
df.duplicated().sum()

np.int64(22)

In [19]:
# Re-list the distinct values per column, with a blank line after each one.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, ":", distinct_values, end="\n\n")

gender : ['Female' 'Male']

SeniorCitizen : [0 1]

Partner : ['Yes' 'No']

Dependents : ['No' 'Yes']

tenure : [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]

PhoneService : ['No' 'Yes']

MultipleLines : ['No' 'Yes']

InternetService : ['DSL' 'Fiber optic' 'No']

OnlineSecurity : ['No' 'Yes']

OnlineBackup : ['Yes' 'No']

DeviceProtection : ['No' 'Yes']

TechSupport : ['No' 'Yes']

StreamingTV : ['No' 'Yes']

StreamingMovies : ['No' 'Yes']

Contract : ['Month-to-month' 'One year' 'Two year']

PaperlessBilling : ['Yes' 'No']

PaymentMethod : ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']

MonthlyCharges : [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]

TotalCharges : [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]

Churn : ['No' 'Yes']



In [20]:
# Categorical columns that are one-hot encoded.
one_hot_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'PaymentMethod', 'MultipleLines',
]
# Categorical columns that are ordinal-encoded. Each entry must match a real
# column name exactly (no stray punctuation).
ordinal_cols = ['Contract', 'InternetService']

In [21]:
# Number of columns that will be one-hot encoded.
len(one_hot_cols)

13

### `OneHotEncoder`

In [22]:
# One-hot encode the nominal columns. The encoder is fitted on the training
# split only and then reused on the test split; drop="first" removes one
# level per column.
ohe_encoder = OneHotEncoder(drop="first", sparse_output=False)
train_dummies = ohe_encoder.fit_transform(X_train_cat[one_hot_cols])
dummy_names = ohe_encoder.get_feature_names_out(one_hot_cols)
ohe_train = pd.DataFrame(train_dummies, columns=dummy_names)
test_dummies = ohe_encoder.transform(X_test_cat[one_hot_cols])
ohe_test = pd.DataFrame(test_dummies, columns=dummy_names)

In [23]:
# Display the one-hot encoded training columns.
ohe_train

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,PaperlessBilling_Yes,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,MultipleLines_Yes
0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
5630,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
5631,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5632,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0


### Ordinal Encoder

In [24]:
# Encode Contract as integers that follow the explicit category order below.
# The encoder is fitted on the training split and reused for the test split.
contract_levels = ['Month-to-month', 'One year', 'Two year']
ordinal_encoder_contract = OrdinalEncoder(categories=[contract_levels])
ordinal_encoder_contract.fit(X_train[["Contract"]])
X_train_cat["Contract"] = ordinal_encoder_contract.transform(X_train[["Contract"]])
X_test_cat["Contract"] = ordinal_encoder_contract.transform(X_test[["Contract"]])

In [25]:
# Encode InternetService as integers that follow the explicit category order
# below. The encoder is fitted on the training split and reused for the test split.
internet_levels = ['No', 'DSL', 'Fiber optic']
ordinal_encoder_internet = OrdinalEncoder(categories=[internet_levels])
ordinal_encoder_internet.fit(X_train[["InternetService"]])
X_train_cat["InternetService"] = ordinal_encoder_internet.transform(X_train[["InternetService"]])
X_test_cat["InternetService"] = ordinal_encoder_internet.transform(X_test[["InternetService"]])

### `Imputing Missing Values with the Median`

In [26]:
# Fill missing numeric values with per-column medians learned from the
# training split, then apply the same medians to the test split.
imputer = SimpleImputer(strategy="median").fit(X_train_num)
X_train_num = pd.DataFrame(
    imputer.transform(X_train_num),
    columns=X_train_num.columns,
)
X_test_num = pd.DataFrame(
    imputer.transform(X_test_num),
    columns=X_test_num.columns,
)

In [27]:
# Check that no missing values remain in the numeric training columns after imputation.
X_train_num.isna().sum()

SeniorCitizen     0
tenure            0
MonthlyCharges    0
TotalCharges      0
dtype: int64

In [28]:
# Stitch the ordinal, one-hot and numeric parts together column-wise. Every
# part gets a fresh 0..n-1 index first so the rows line up by position.
X_train_cleaned = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (X_train_cat[["Contract","InternetService"]], ohe_train, X_train_num)
    ],
    axis=1,
)
X_test_cleaned = pd.concat(
    [
        part.reset_index(drop=True)
        for part in (X_test_cat[["Contract","InternetService"]], ohe_test, X_test_num)
    ],
    axis=1,
)

In [29]:
# Display the assembled training feature frame.
X_train_cleaned

Unnamed: 0,Contract,InternetService,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,PaperlessBilling_Yes,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,MultipleLines_Yes,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,21.0,64.85,1336.80
1,2.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,54.0,97.20,5129.45
2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,23.45,23.45
3,0.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,70.20,237.95
4,2.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,61.90,1410.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,95.00,95.00
5630,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,23.0,91.10,2198.30
5631,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,21.15,306.05
5632,0.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,12.0,99.45,1200.15


In [30]:
# List the final feature names in their column order.
X_train_cleaned.columns

Index(['Contract', 'InternetService', 'gender_Male', 'Partner_Yes',
       'Dependents_Yes', 'PhoneService_Yes', 'PaperlessBilling_Yes',
       'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes',
       'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'MultipleLines_Yes', 'SeniorCitizen', 'tenure', 'MonthlyCharges',
       'TotalCharges'],
      dtype='object')

In [31]:
# Show the distinct values of each assembled feature, one blank line apart.
for feature_name in X_train_cleaned.columns:
    print(feature_name, ":", X_train_cleaned[feature_name].unique(), end="\n\n")

Contract : [1. 2. 0.]

InternetService : [1. 2. 0.]

gender_Male : [0. 1.]

Partner_Yes : [0. 1.]

Dependents_Yes : [1. 0.]

PhoneService_Yes : [1. 0.]

PaperlessBilling_Yes : [0. 1.]

OnlineSecurity_Yes : [1. 0.]

OnlineBackup_Yes : [0. 1.]

DeviceProtection_Yes : [1. 0.]

TechSupport_Yes : [0. 1.]

StreamingTV_Yes : [0. 1.]

StreamingMovies_Yes : [1. 0.]

PaymentMethod_Credit card (automatic) : [0. 1.]

PaymentMethod_Electronic check : [0. 1.]

PaymentMethod_Mailed check : [1. 0.]

MultipleLines_Yes : [0. 1.]

SeniorCitizen : [0. 1.]

tenure : [21. 54.  1.  4.  0.  7. 32. 72. 19. 10. 45. 40. 47. 36. 69. 71. 35.  3.
 68. 42.  8. 46. 12. 26. 49. 33. 31. 66. 58. 13. 57.  6. 59. 15. 27. 34.
 18.  5. 39. 29.  2. 63. 20. 14. 56. 37. 24. 52. 43. 11. 16. 50. 38. 23.
 55. 48. 53. 70. 22. 28. 44. 65. 64. 60. 51.  9. 25. 61. 30. 17. 41. 67.
 62.]

MonthlyCharges : [64.85 97.2  23.45 ... 59.25 35.35 21.15]

TotalCharges : [1336.8  5129.45   23.45 ...  306.05 1200.15  457.3 ]



### `Scaling`

In [32]:
# Min-max scale every feature using ranges learned from the training split.
# Both names are rebound to the NumPy arrays returned by the scaler.
scaler = MinMaxScaler().fit(X_train_cleaned)
X_train_cleaned = scaler.transform(X_train_cleaned)
X_test_cleaned = scaler.transform(X_test_cleaned)

In [33]:
# Try C = 0.5, 1.0, ..., 6.5 for a linear SVC and print the test accuracy (in %) of each.
for c_value in (half_steps / 2 for half_steps in range(1, 14)):
    sv = SVC(kernel="linear", C=c_value).fit(X_train_cleaned, y_train)
    y_pred = sv.predict(X_test_cleaned)
    print("{} {:.2f}".format(c_value, accuracy_score(y_test, y_pred) * 100))

0.5 82.04
1.0 82.04
1.5 81.97
2.0 82.11
2.5 82.11
3.0 82.11
3.5 82.04
4.0 82.04
4.5 81.97
5.0 82.04
5.5 81.97
6.0 81.97
6.5 81.97


In [34]:
# Refit the linear SVC with C=2.0 and report its accuracy on the test split.
sv = SVC(kernel="linear", C=2.0).fit(X_train_cleaned, y_train)
y_pred = sv.predict(X_test_cleaned)
print("Accuracy {:.2f}".format(accuracy_score(y_test, y_pred) * 100))

Accuracy 82.11


In [35]:
import pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fit the final linear SVC (C=2.0) on the scaled training features.
sv = SVC(kernel="linear", C=2.0).fit(X_train_cleaned, y_train)

# Score it on the held-out split and report the accuracy as a percentage.
y_pred = sv.predict(X_test_cleaned)
print("Final Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

# Serialise the fitted model to svc_model.pkl in the working directory.
with open("svc_model.pkl", "wb") as f:
    f.write(pickle.dumps(sv))

print("✔ Trained SVC model saved successfully as svc_model.pkl")


Final Accuracy: 82.11%
✔ Trained SVC model saved successfully as svc_model.pkl


In [36]:
# Persist every fitted preprocessing object next to the saved model.
# Each file is opened in a `with` block so the handle is flushed and closed
# immediately instead of being left to the garbage collector.
fitted_preprocessors = {
    "scaler.pkl": scaler,
    "imputer.pkl": imputer,
    "ordinal_encoder_contract.pkl": ordinal_encoder_contract,
    "ordinal_encoder_internet.pkl": ordinal_encoder_internet,
    "ohe_encoder.pkl": ohe_encoder,
}
for file_name, fitted_object in fitted_preprocessors.items():
    with open(file_name, "wb") as handle:
        pickle.dump(fitted_object, handle)

In [3]:
# Launch the Streamlit app from the notebook's working directory via the shell.
# The recorded output below shows this failing because a.py was not found
# there, so the script must exist in the working directory before running this.
!streamlit run a.py

Usage: streamlit run [OPTIONS] [TARGET] [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: a.py
