In [25]:
import pandas
from sklearn.model_selection import train_test_split
import pickle
import time
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [5]:
# Load the raw churn data set. The notebook imports the library as `pandas`
# (no `pd` alias), so the full module name is used to survive a fresh kernel.
data = pandas.read_csv('../data/raw/churn.csv')

In [54]:
# Summary statistics for every column (numeric and categorical), transposed
# so that each source column is shown as one row.
data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
customerID,7043,7043.0,3096-GKWEB,1.0,,,,,,,
gender,7043,2.0,Male,3555.0,,,,,,,
SeniorCitizen,7043,,,,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
Partner,7043,2.0,No,3641.0,,,,,,,
Dependents,7043,2.0,No,4933.0,,,,,,,
tenure,7043,,,,32.3711,24.5595,0.0,9.0,29.0,55.0,72.0
PhoneService,7043,2.0,Yes,6361.0,,,,,,,
MultipleLines,7043,3.0,No,3390.0,,,,,,,
InternetService,7043,3.0,Fiber optic,3096.0,,,,,,,
OnlineSecurity,7043,3.0,No,3498.0,,,,,,,


In [40]:
# Work on a copy without the customerID column; the raw frame stays intact.
cleaned_data = data.drop(columns=['customerID'])

In [47]:
# Fold the 'No phone service' level of MultipleLines into plain 'No'.
cleaned_data['MultipleLines'] = cleaned_data['MultipleLines'].replace(
    {'No phone service': 'No'}
)

In [65]:
# Count missing (NaN) entries per column.
cleaned_data.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [53]:
# Inspect the dtype of each column; the output below shows TotalCharges is
# stored as `object` rather than a numeric dtype.
cleaned_data.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [79]:
# Rows whose TotalCharges is a single blank string get the value
# tenure * MonthlyCharges instead (assignment is aligned on the row index).
cleaned_data.loc[
    cleaned_data['TotalCharges'].eq(" "), 'TotalCharges'
] = cleaned_data['tenure'].mul(cleaned_data['MonthlyCharges'])

In [81]:
# Convert TotalCharges from strings to a numeric dtype. The notebook imports
# the library as `pandas` (no `pd` alias), so the full module name is used.
cleaned_data['TotalCharges'] = pandas.to_numeric(cleaned_data['TotalCharges'])

In [89]:
# One-hot encode every categorical column (module name `pandas` is used
# because no `pd` alias is imported in this notebook).
#
# `Churn` is encoded into `Churn_No` and `Churn_Yes`. The training code further
# down treats the LAST column as the label and ALL other columns as features,
# so leaving `Churn_No` in the frame would leak the label into the features
# (it is the exact complement of `Churn_Yes`). Drop it so that `Churn_Yes`
# remains the only churn column and stays last. `errors='ignore'` keeps the
# cell safe to re-run once the column is already gone.
cleaned_data = pandas.get_dummies(cleaned_data).drop(
    columns='Churn_No', errors='ignore'
)

In [93]:
# Persist the processed frame without the index column.
cleaned_data.to_csv(path_or_buf='../data/processed/churn.csv', index=False)

In [16]:
class DataHandler(object):
    """Load a CSV file and split it into train/test features and labels.

    The last column of the loaded frame is used as the label column; every
    other column is used as a feature.
    """

    def __init__(self, file_path=str()):
        """Store the CSV location; no data is read until ``load_csv``.

        :param file_path: path of the CSV file to load (default: empty string).
        """
        self.file_path = file_path
        self.data_frame = None
        self.train_data = None
        self.train_labels = None
        self.test_data = None
        self.test_labels = None

    def get_file_path(self):
        """Return the configured CSV path as a string."""
        return str(self.file_path)

    def set_file_path(self, value=str()):
        """Replace the configured CSV path with ``value``."""
        self.file_path = value

    def load_csv(self):
        """Read ``self.file_path`` into ``self.data_frame``.

        Failures are reported on stdout rather than raised; after a failed
        load ``self.data_frame`` is ``None``.
        """
        # Forget any previously loaded frame first: otherwise a failed reload
        # would leave the old data in place and still report success.
        self.data_frame = None
        try:
            self.data_frame = pandas.read_csv(self.file_path)
        except Exception as e:
            print(e)
            print('ERROR: Unable to load csv file for file path: '
                  '{filepath}'.format(filepath=self.file_path))
        else:
            print('Data loaded successfully from file path: '
                  '{file_path}'.format(file_path=self.file_path))

    def split_data(self, test_size=float(), random_state=None):
        """Split the loaded frame into stratified train and test sets.

        :param test_size: forwarded to ``train_test_split`` as ``test_size``.
        :param random_state: forwarded to ``train_test_split``; pass an int
            for a reproducible split. The default ``None`` keeps the previous
            unseeded behaviour.
        :raises ValueError: if no data frame has been loaded yet.
        """
        if self.data_frame is None:
            raise ValueError('No data loaded - call load_csv() with a valid '
                             'file path before split_data()')
        features = self.data_frame.iloc[:, :-1].values
        labels = self.data_frame.iloc[:, -1].values
        self.train_data, self.test_data, self.train_labels, self.test_labels = \
            train_test_split(features,
                             labels,
                             stratify=labels,
                             test_size=test_size,
                             random_state=random_state)


In [27]:
class ModelHandler(object):
    """Create, train, evaluate and pickle a scikit-learn classifier.

    Supported ``model_type`` values are ``'SVM'`` (``SVC()``) and ``'NB'``
    (``GaussianNB()``).
    """

    def __init__(self, file_path=str(), model_type=str()):
        """Store the output directory and model type; no model is built yet.

        :param file_path: directory the pickled model is written to.
        :param model_type: key selecting the classifier in ``create_model``.
        """
        self.file_path = file_path
        self.model_type = model_type
        self.model = None
        self.results = None

    def get_file_path(self):
        """Return the configured output directory as a string."""
        return str(self.file_path)

    def set_file_path(self, value=str()):
        """Replace the configured output directory with ``value``."""
        self.file_path = value

    def get_model_type(self):
        """Return the configured model type as a string."""
        return str(self.model_type)

    def set_model_type(self, value=str()):
        """Replace the configured model type with ``value``."""
        self.model_type = value

    def create_model(self):
        """Instantiate the classifier selected by ``self.model_type``.

        An unrecognised type is reported on stdout and leaves ``self.model``
        set to ``None``.
        """
        # Discard any earlier model first: otherwise an unrecognised type
        # would keep the old model and still print "Model created".
        self.model = None
        if self.model_type == 'SVM':
            self.model = SVC()
        elif self.model_type == 'NB':
            self.model = GaussianNB()
        else:
            print('ERROR: Unable to create model as model type: '
                  '"{model_type}" not recognized'
                  .format(model_type=self.model_type))
        if self.model is not None:
            print('Model created for type: {model_type}'
                  .format(model_type=self.model_type))

    def train_model(self, train_data, train_labels):
        """Fit ``self.model`` on the given data and print the elapsed time.

        :param train_data: feature matrix passed to ``model.fit``.
        :param train_labels: label vector passed to ``model.fit``.
        """
        print('Model training started on {n_rows} data rows - please wait'
              .format(n_rows=len(train_labels)))
        start_time = time.time()
        self.model.fit(train_data, train_labels)
        # Elapsed seconds are divided by 60, so the printed value is minutes.
        print('Model trained - training time: {time}'
              .format(time=(time.time()-start_time)/60))

    def test_model(self, test_data, test_labels):
        """Predict on ``test_data``, store the predictions and print accuracy.

        Predictions are kept in ``self.results``; accuracy is the fraction of
        predictions equal to ``test_labels``.
        """
        self.results = self.model.predict(test_data)
        accuracy = sum(self.results == test_labels) / len(test_labels)
        print('Predictions:', self.results)
        print('Model testing completed - accuracy: {accuracy}'
              .format(accuracy=accuracy))

    def save_model(self):
        """Pickle ``self.model`` to ``<file_path>/model_<model_type>.pickle``.

        Failures are reported on stdout rather than raised.
        """
        import os  # local import: `os` is not imported at notebook level

        try:
            # os.path.join avoids a doubled separator when file_path already
            # ends with '/', and avoids targeting the filesystem root
            # ('/model_X.pickle') when file_path is an empty string.
            target = os.path.join(self.file_path,
                                  'model_' + str(self.model_type) + '.pickle')
            # The with-block closes the file; no explicit close() is needed.
            with open(target, 'wb') as f:
                pickle.dump(self.model, f)
            print('Model saved to file path: {file_path}'
                  .format(file_path=self.file_path))
        except Exception as e:
            print(e)
            print('ERROR: Unable to pickle model to file path: {file_path}'
                  .format(file_path=self.file_path))


In [29]:
class GenericModel(object):

    def __init__(self):
        self.data_handler = DataHandler()
        self.model_handler = ModelHandler()

    def set_data_file_path(self, file_path=str()):
        self.data_handler.set_file_path(value=file_path)

    def load_data(self, test_size=float()):
        self.data_handler.load_csv()
        self.data_handler.split_data(test_size=test_size)

    def run_model(self, model_type=str(), file_path=str()):
        self.model_handler.set_model_type(value=model_type)
        self.model_handler.create_model()
        self.model_handler.train_model(train_data=self.data_handler.train_data,
                                       train_labels=self.data_handler.train_labels)
        self.model_handler.test_model(test_data=self.data_handler.test_data,
                                      test_labels=self.data_handler.test_labels)
        self.model_handler.set_file_path(value=file_path)
        self.model_handler.save_model()

In [95]:
# Input CSV for training and the directory the pickled model is written to.
data_file_path = '../data/processed/churn.csv'
model_file_path = '../models/'

In [102]:
# Build the workflow object; its handlers start with empty settings.
churn_model = GenericModel()

In [103]:
# Point the workflow at the processed CSV.
churn_model.set_data_file_path(file_path=data_file_path)

In [104]:
# Load the CSV and hold out 30% of the rows as the test set.
churn_model.load_data(test_size=0.3)

Data loaded successfully from file path: ../data/processed/churn.csv


In [105]:
# Train, evaluate and pickle an SVM classifier.
churn_model.run_model(model_type='SVM', file_path=model_file_path)

Model created for type: SVM
Model training started on 4930 data rows - please wait
Model trained - training time: 0.01655574639638265
Predictions: [0 0 0 ... 0 0 0]
Model testing completed - accuracy: 0.73450070989115
Model saved to file path: ../models/
