In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer ##imputing missing values

from sklearn.model_selection import cross_val_score,cross_val_predict, train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PowerTransformer,LabelEncoder


##Creating pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline


from imblearn.ensemble import BalancedRandomForestClassifier
import warnings
warnings.filterwarnings("ignore")
import pickle

In [2]:
def data_preprocessing(df):
    """Split the raw churn frame into a feature frame and an encoded target.

    Steps, in order:
      1. drop ``customerID``, ``gender`` and ``PhoneService``;
      2. parse ``TotalCharges`` to float, treating empty / whitespace-only
         entries as missing (NaN);
      3. replace ``Churn`` with the integer codes produced by ``LabelEncoder``;
      4. fold the value 'No internet service' into 'No' for the add-on
         service columns;
      5. cast every text-like feature column to the ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the three dropped columns, ``TotalCharges``,
        ``Churn`` and the seven service columns listed in the body.
        The input frame is not modified.

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns except ``Churn``.
    y : pandas.Series
        The encoded ``Churn`` column.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError
        If ``TotalCharges`` holds a non-blank value that is not a number.
    """
    # Add-on services that report 'No internet service' instead of a plain 'No'.
    service_cols = [
        'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies',
    ]

    ## Dropping column which are not significant
    df1 = df.drop(['customerID', 'gender', 'PhoneService'], axis=1).copy()

    ## getting missing Values
    # Any empty or whitespace-only entry counts as missing, not just the exact
    # single-space string. Genuinely malformed values still raise ValueError.
    charges = df1['TotalCharges']
    is_blank = charges.astype(str).str.strip().eq('')
    df1['TotalCharges'] = pd.to_numeric(charges.mask(is_blank)).astype(float)

    ## Encoding
    le = LabelEncoder()
    df1['Churn'] = le.fit_transform(df1['Churn'])

    df1[service_cols] = df1[service_cols].replace('No internet service', 'No')

    X = df1.drop('Churn', axis=1)
    y = df1['Churn']

    # Text-like columns (object, pandas string, or already categorical) become
    # 'category'; numeric columns are left untouched.
    text_cols = [
        col for col in X.columns
        if pd.api.types.is_object_dtype(X[col].dtype)
        or pd.api.types.is_string_dtype(X[col].dtype)
        or isinstance(X[col].dtype, pd.CategoricalDtype)
    ]
    X = X.astype({col: 'category' for col in text_cols})

    return X, y

In [3]:
# Step 1: one-hot encode the columns at the listed integer positions of the
# feature frame and pass every other column through unchanged.
# NOTE(review): the positions presumably correspond to the categorical columns
# returned by data_preprocessing -- verify against the CSV column order.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases removed the old name, so try the new keyword first and
# fall back to the old one on older installations.
_one_hot_options = dict(handle_unknown="ignore", drop="first")
try:
    _one_hot_encoder = OneHotEncoder(sparse_output=False, **_one_hot_options)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    _one_hot_encoder = OneHotEncoder(sparse=False, **_one_hot_options)

trf1 = ColumnTransformer(
    [("One_hot_encoding", _one_hot_encoder, [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])],
    remainder='passthrough',
)

# Step 2: impute missing values in the column at position 21 of step 1's output.
# NOTE(review): 21 presumably is TotalCharges; the position depends on how many
# dummy columns step 1 emits for the training data -- TODO confirm.
# NOTE(review): the imputer only receives this single column, so neighbours
# cannot be selected using any other feature -- check that this is intended.
trf2 = ColumnTransformer(
    [("Missing_value", KNNImputer(n_neighbors=3, weights="uniform"), [21])],
    remainder='passthrough',
)

# Step 3: classifier. Seeded so that refitting yields the same model; 42 matches
# the seed used for the train/test split.
trf3 = BalancedRandomForestClassifier(random_state=42)

In [4]:
# Chain the three stages in order: one-hot encoding (trf1), imputation (trf2)
# and the classifier (trf3). make_pipeline generates the step names itself.
pipe=make_pipeline(trf1,trf2,trf3)
# Bare expression so the notebook displays the assembled pipeline.
pipe

In [6]:
## Importing data (path is relative to the notebook's working directory)
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

## Getting X (features) and y (encoded target) from the raw frame
X,y = data_preprocessing(df)

## Splitting data: 30% held out for testing, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [7]:
# Train on the training split, then predict the held-out rows. `fit` hands back
# the fitted pipeline, so the prediction is chained and shown as the cell output.
pipe.fit(X_train, y_train).predict(X_test)

array([1, 0, 0, ..., 0, 1, 0])

In [13]:
# Persist the fitted pipeline to disk; the context manager closes the file.
with open("trained_pipeline_BRF(2).pkl", mode="wb") as model_file:
    pickle.dump(pipe, model_file)

In [14]:
# Reload the saved pipeline. The context manager closes the file handle as soon
# as loading finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open("trained_pipeline_BRF(2).pkl", "rb") as f:
    pickled_model = pickle.load(f)