In [25]:
import pandas
from sklearn.model_selection import train_test_split
import pickle
import time
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [5]:
data=pd.read_csv('../data/raw/churn.csv')

In [6]:
data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [16]:
class DataHandler(object):

    def __init__(self, file_path=str()):
        self.file_path = file_path
        self.data_frame = None
        self.train_data = None
        self.train_labels = None
        self.test_data = None
        self.test_labels = None

    def get_file_path(self):
        return str(self.file_path)

    def set_file_path(self, value=str()):
        self.file_path = value

    def load_csv(self):
        try:
            self.data_frame = pandas.read_csv(self.file_path)
        except Exception as e:
            print(e)
            print('ERROR: Unable to load csv file for file path: '
                  '{filepath}'.format(filepath=self.file_path))
        if self.data_frame is not None:
            print('Data loaded successfully from file path: '
                  '{file_path}'.format(file_path=self.file_path))

    def split_data(self, test_size=float()):
        self.train_data, self.test_data, self.train_labels, self.test_labels = \
            train_test_split(self.data_frame.iloc[:, :-1].values,
                             self.data_frame.iloc[:, -1].values,
                             stratify=self.data_frame.iloc[:, -1].values,
                             test_size=test_size)


In [27]:
class ModelHandler(object):

    def __init__(self, file_path=str(), model_type=str()):
        self.file_path = file_path
        self.model_type = model_type
        self.model = None
        self.results = None

    def get_file_path(self):
        return str(self.file_path)

    def set_file_path(self, value=str()):
        self.file_path = value

    def get_model_type(self):
        return str(self.model_type)

    def set_model_type(self, value=str()):
        self.model_type = value

    def create_model(self):
        if self.model_type == 'SVM':
            self.model = SVC()
        elif self.model_type == 'NB':
            self.model = GaussianNB()
        else:
            print('ERROR: Unable to create model as model type: '
                  '"{model_type}" not recognized'
                  .format(model_type=self.model_type))
        if self.model is not None:
            print('Model created for type: {model_type}'
                  .format(model_type=self.model_type))

    def train_model(self, train_data, train_labels):
        print('Model training started on {n_rows} data rows - please wait'
              .format(n_rows=len(train_labels)))
        start_time = time.time()
        self.model.fit(train_data, train_labels)
        print('Model trained - training time: {time}'
              .format(time=(time.time()-start_time)/60))

    def test_model(self, test_data, test_labels):
        self.results = self.model.predict(test_data)
        accuracy = sum(self.results == test_labels) / len(test_labels)
        print('Predictions:', self.results)
        print('Model testing completed - accuracy: {accuracy}'
              .format(accuracy=accuracy))

    def save_model(self):
        try:
            with open(self.file_path + '/model_' + str(self.model_type) + '.pickle', 'wb') as f:
                pickle.dump(self.model, f)
                f.close()
            print('Model saved to file path: {file_path}'
                  .format(file_path=self.file_path))
        except Exception as e:
            print(e)
            print('ERROR: Unable to pickle model to file path: {file_path}'
                  .format(file_path=self.file_path))


In [29]:
class GenericModel(object):

    def __init__(self):
        self.data_handler = DataHandler()
        self.model_handler = ModelHandler()

    def set_data_file_path(self, file_path=str()):
        self.data_handler.set_file_path(value=file_path)

    def load_data(self, test_size=float()):
        self.data_handler.load_csv()
        self.data_handler.split_data(test_size=test_size)

    def run_model(self, model_type=str(), file_path=str()):
        self.model_handler.set_model_type(value=model_type)
        self.model_handler.create_model()
        self.model_handler.train_model(train_data=self.data_handler.train_data,
                                       train_labels=self.data_handler.train_labels)
        self.model_handler.test_model(test_data=self.data_handler.test_data,
                                      test_labels=self.data_handler.test_labels)
        self.model_handler.set_file_path(value=file_path)
        self.model_handler.save_model()

In [37]:
data_file_path='../data/raw/churn.csv'
model_file_path='../models/'

In [30]:
churn_model=GenericModel()

In [33]:
churn_model.set_data_file_path(data_file_path)

In [35]:
churn_model.load_data(0.2)

Data loaded successfully from file path: ../data/raw/churn.csv


In [38]:
churn_model.run_model('SVM',model_file_path)

Model created for type: SVM
Model training started on 5634 data rows - please wait


ValueError: could not convert string to float: '0702-PGIBZ'