In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
import pickle

Picking up model that is featured engineered already

In [2]:
df = pd.read_csv(r"C:\Users\zarah\Desktop\Lighthouse Labs\Final Project\Churn Project Files\DataFiles\df_model.csv") 


In [3]:
df.drop(['PaymentMethod_new', 'Partner_new', 'Dependents_new','DeviceProtection_new','StreamingTV_new', 'StreamingMovies_new','tenureCatLabel_new'], axis=1, inplace=True)

In [4]:
df

Unnamed: 0,SeniorCitizen,MonthlyCharges_Log,TotalCharges_Log,PhoneService_new,Contract_new,PaperlessBilling_new,Churn_new,gender_new,MultipleLines_new,InternetService_new,OnlineSecurity_new,OnlineBackup_new,TechSupport_new
0,0,3.396185,3.396185,0,0,1,0,0,1,0,0,2,0
1,0,4.042174,7.544068,1,1,0,0,1,0,0,2,0,0
2,0,3.986202,4.683519,1,0,1,1,1,0,0,2,2,0
3,0,3.744787,7.517928,0,1,0,0,1,1,0,2,0,2
4,0,4.258446,5.021575,1,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7026,0,3.051640,7.257990,1,2,1,0,0,0,2,1,1,1
7027,0,4.440296,7.596141,1,1,1,0,1,2,0,2,0,2
7028,0,4.636669,8.904209,1,1,1,0,0,2,1,0,2,0
7029,0,3.387774,5.847739,0,0,1,0,0,1,0,2,0,0


In [5]:
# Our own class to be inserted into the pipeline
class RawFeats:
    def __init__(self, feats):
        self.feats = feats

    def fit(self, X, y=None):
        pass

    def transform(self, X, y=None):
        return X[self.feats]

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# Features from X we want to keep for our PCA
feats = ['SeniorCitizen', 'MonthlyCharges_Log', 'TotalCharges_Log',
       'PhoneService_new', 'Contract_new', 'PaperlessBilling_new',
       'gender_new', 'MultipleLines_new', 'InternetService_new',
       'OnlineSecurity_new', 'OnlineBackup_new', 'TechSupport_new']

# Creating a class object with indexes we want to keep
raw_feats = RawFeats(feats)

In [6]:
# Scaling and doing the PCA
sc = StandardScaler()
pca = PCA(n_components=5)
print(pca)

PCA(n_components=5)


In [7]:
# Selecting the k-best
selection = SelectKBest(k=3)
print(selection)

SelectKBest(k=3)


In [8]:
from sklearn.linear_model import LogisticRegression


In [9]:
# Logistic Regressor Model
lr = LogisticRegression()

In [10]:
# Combining everything into our pipeline
PCA_pipeline = Pipeline([
    ("rawFeats", raw_feats),
    ("scaler", sc),
    ("pca", pca)
])

kbest_pipeline = Pipeline([("kBest", selection)])

In [11]:
# Combining the outputs with FeatureUnion
all_features = FeatureUnion([
    ("pcaPipeline", PCA_pipeline), 
    ("kBestPipeline", kbest_pipeline)
])

In [12]:
# Creating the main pipeline
main_pipeline = Pipeline([
    ("features", all_features),
    ("lr", lr)
])

In [13]:
# Checking the estimator parameters first
estimator = LogisticRegression()
print(estimator.get_params().keys())

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split



In [15]:
# Separating the dataframe between the X (independent) and y (dependent) variables
X = df.drop('Churn_new', axis=1).select_dtypes(include=[np.number])
y = df['Churn_new']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=100)

In [16]:
# Checking the end of our dataset
df.tail()

Unnamed: 0,SeniorCitizen,MonthlyCharges_Log,TotalCharges_Log,PhoneService_new,Contract_new,PaperlessBilling_new,Churn_new,gender_new,MultipleLines_new,InternetService_new,OnlineSecurity_new,OnlineBackup_new,TechSupport_new
7026,0,3.05164,7.25799,1,2,1,0,0,0,2,1,1,1
7027,0,4.440296,7.596141,1,1,1,0,1,2,0,2,0,2
7028,0,4.636669,8.904209,1,1,1,0,0,2,1,0,2,0
7029,0,3.387774,5.847739,0,0,1,0,0,1,0,2,0,0
7030,1,4.309456,5.725544,1,0,1,1,1,2,1,0,0,0


In [17]:
X_train.tail()

Unnamed: 0,SeniorCitizen,MonthlyCharges_Log,TotalCharges_Log,PhoneService_new,Contract_new,PaperlessBilling_new,gender_new,MultipleLines_new,InternetService_new,OnlineSecurity_new,OnlineBackup_new,TechSupport_new
79,0,3.254243,7.103815,1,1,1,0,2,2,1,1,1
3927,0,3.69511,6.019688,0,1,0,0,1,0,2,2,2
5955,0,4.123094,8.014385,1,0,0,0,0,0,2,0,2
6936,0,4.615615,8.82033,1,1,1,1,2,1,0,2,0
5640,0,2.998229,6.506456,1,0,0,0,0,2,1,1,1


In [18]:
y_train.tail

<bound method NDFrame.tail of 5513    1
1671    0
510     0
1590    1
6360    0
       ..
79      0
3927    0
5955    0
6936    0
5640    0
Name: Churn_new, Length: 5273, dtype: int64>

In [19]:
# Seeting the hyperparameter tunning with GridSearchCV
param_grid = {"features__pcaPipeline__pca__n_components": [1, 2, 3, 4, 5],
                  "features__kBestPipeline__kBest__k": [1, 2, 3],
                  "lr__max_iter":[100, 500, 1000],
                  "lr__C": [0.1, 1, 10, 100], 
                  "lr__penalty": ['l1', 'l2'],
                  "lr__solver": ['liblinear', 'saga']
             }

# Creating a Grid Search object
grid_search = GridSearchCV(main_pipeline, param_grid, n_jobs = -1, verbose=10, refit=True)    

# Fitting the model and tune parameters
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('pcaPipeline',
                                                                        Pipeline(steps=[('rawFeats',
                                                                                         <__main__.RawFeats object at 0x000001651A902700>),
                                                                                        ('scaler',
                                                                                         StandardScaler()),
                                                                                        ('pca',
                                                                                         PCA(n_components=5))])),
                                                                       ('kBestPipeline',
                                                                        Pipeline(steps=[('kBest',
                

In [20]:
# Visualizing the pipeline
from sklearn import set_config
set_config(display='diagram')
main_pipeline

In [21]:
# Saving the pipeline_combined_final HTML to a file
from sklearn.utils import estimator_html_repr

with open('main_pipeline_13F.html', 'w', encoding="utf-16") as f:  
    f.write(estimator_html_repr(main_pipeline))

In [22]:
# Checking the best parameters
print(grid_search.best_params_)

{'features__kBestPipeline__kBest__k': 1, 'features__pcaPipeline__pca__n_components': 5, 'lr__C': 1, 'lr__max_iter': 100, 'lr__penalty': 'l1', 'lr__solver': 'saga'}


In [23]:
# Testing the model tunned
X_test_row = X_test.loc[:, ]
y_pred = grid_search.predict(X_test_row)
print("Predicted outcome:", y_pred[0:10])

Predicted outcome: [1 0 0 1 0 1 0 0 1 0]


In [24]:
from sklearn.metrics import accuracy_score,f1_score


In [25]:
# Calculating the accuracy and score of the best estimator
accuracy = accuracy_score(y_test, y_pred)
score = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Score:", score)

Accuracy: 0.7656427758816837
Score: 0.6734769687964339


In [26]:
X_test.shape


(1758, 12)

In [27]:
import pickle
pickle.dump( grid_search, open( "PipeModel_13F.pkl", "wb" ) )
