In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import pickle
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

In [None]:
# Read telco_data.csv (path is relative to the working directory) and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="telco_data.csv")
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            6293 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           6043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            4543 non-null   float64
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   6043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       5543 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Remove the customerID column before modelling.
# NOTE(review): customerID looks like a per-customer identifier — confirm it carries no signal.
# Re-running this cell on the already-trimmed frame raises KeyError.
df = df.drop(labels="customerID", axis="columns")

In [None]:
# Separate the feature table from the Churn target.
x = df.drop(columns="Churn")
y = df["Churn"]

In [None]:
# Partition the feature names by dtype so each group gets its own preprocessing.
num_col = x.select_dtypes(include=["number"]).columns
obj_col = x.select_dtypes(exclude=["number"]).columns

In [None]:
# 80/20 hold-out split; the fixed seed makes the partition repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
num_preprocessing = Pipeline(
    [
        ("imputer_for_numcols", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)

In [None]:
# Categorical branch: missing entries become the literal category "Unknown",
# then every category is mapped to an integer code. Categories not seen
# during fit are encoded as -1 at transform time.
cat_preprocessing = Pipeline(
    [
        (
            "imputer_for_objcols",
            SimpleImputer(fill_value="Unknown", strategy="constant"),
        ),
        (
            "ordinalencoder",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
        ),
    ]
)

In [None]:
# Send numeric and non-numeric columns through their respective sub-pipelines.
preprocessing = ColumnTransformer(
    [
        ("num_preprocessing", num_preprocessing, num_col),
        ("cat_preprocessing", cat_preprocessing, obj_col),
    ]
)
# Fit on the whole feature table; x is rebound from the raw frame to the
# transformed matrix, so the raw features must be re-derived from df if needed.
x = preprocessing.fit_transform(X=x)

In [None]:
# Unsupervised outlier detector fitted on the preprocessed feature matrix.
# contamination=0.01 sets the score threshold so that about 1% of rows are flagged.
# random_state is pinned (same seed as the train/test split) because the forest
# is built from random sub-samples; without it the flagged rows change between runs.
model = IsolationForest(n_estimators=200, contamination=0.01, random_state=42)
# fit_predict returns one label per row of x: 1 for inliers, -1 for outliers.
anomaly_values = model.fit_predict(x)

In [None]:
# Display the per-row labels produced by model.fit_predict(x).
anomaly_values

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Attach the detector's per-row labels to the frame as a new column.
df["anomaly_values"] = anomaly_values

* Churn=Yes(1)->Customer left the telecom service
* Churn=No(0)->Customer continued the telecom service

In [None]:
# Recode the detector labels into a 0/1 flag: 1 (inlier) -> 0, -1 (outlier) -> 1.
# The labels are integers, so the comparison is made against the integer -1;
# string keys such as '1' / '-1' never match them and leave the column unchanged.
# The flag is derived from the raw anomaly_values array rather than from the
# column itself, so re-running this cell always yields the same result.
df['anomaly_values'] = (anomaly_values == -1).astype(int)

In [None]:
# Second name bound to the same DataFrame object (an alias, not a copy):
# later changes made through df are visible through dff as well.
# NOTE(review): if a snapshot was intended, this needs df.copy() — confirm.
dff = df

In [None]:
# Rebuild features and target from df, which now also carries the anomaly_values column.
x = df.drop(columns="Churn")
y = df["Churn"]

In [None]:
# Recompute the dtype-based column groups for the rebuilt feature table.
num_col = x.select_dtypes(include=["number"]).columns
obj_col = x.select_dtypes(exclude=["number"]).columns

In [None]:
# 80/20 hold-out split of the rebuilt features; the fixed seed keeps it repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Numeric branch: median imputation followed by standardisation.
num_preprocessing = Pipeline(
    [
        ("imputer_for_numcols", SimpleImputer(strategy="median")),
        ("standardscaler", StandardScaler()),
    ]
)

In [None]:
# Categorical branch: missing entries become "Unknown", then categories are
# mapped to integer codes (-1 for categories not seen during fit).
# NOTE(review): integer codes impose an ordering on the categories; consider
# whether that is appropriate for the linear model fitted later.
cat_preprocessing = Pipeline(
    [
        (
            "imputer_for_objcols",
            SimpleImputer(fill_value="Unknown", strategy="constant"),
        ),
        (
            "ordinalencoder",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
        ),
    ]
)

In [None]:
# Send numeric and non-numeric columns through their respective sub-pipelines.
preprocessing = ColumnTransformer(
    [
        ("num_preprocessing", num_preprocessing, num_col),
        ("cat_preprocessing", cat_preprocessing, obj_col),
    ]
)
# Fit on the whole feature table and rebind x to the transformed matrix.
# xtrain/xtest were split off beforehand and remain untransformed.
# NOTE(review): the transformed x does not appear to be used in the cells that follow.
x = preprocessing.fit_transform(X=x)

In [None]:
# Fresh, unfitted column transformer for the end-to-end pipeline, so that all
# preprocessing statistics are learned from the training split only.
preprocessing = ColumnTransformer(
    transformers=[
        ("num_preprocessing", num_preprocessing, num_col),
        ("cat_preprocessing", cat_preprocessing, obj_col),
    ]
)

# Preprocessing followed by a logistic-regression classifier.
# max_iter is raised above the default of 100: with that default the solver
# stopped with "TOTAL NO. OF ITERATIONS REACHED LIMIT" before converging
# (the ordinal-encoded columns are not scaled, which slows convergence).
main_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)
main_pipeline.fit(xtrain, ytrain)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Score the fitted pipeline on the training split itself — this is not a held-out estimate.
main_pipeline.score(X=xtrain, y=ytrain)

0.7907348242811502

In [None]:
# Predict churn labels for the held-out split.
y_pred = main_pipeline.predict(X=xtest)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=ytest, y_pred=y_pred))

              precision    recall  f1-score   support

          No       0.78      0.95      0.86      1036
         Yes       0.65      0.25      0.36       373

    accuracy                           0.77      1409
   macro avg       0.72      0.60      0.61      1409
weighted avg       0.75      0.77      0.73      1409

