In [1]:
import json

In [38]:
CFGLog = {
    "data": {
        "path": "../data/yelp.csv",
        "x": "text",
        "y": "stars",
        "test_size": 0.2,
        "random_state": 2022,
        "ngram_range": (1, 2),
        "min_df": 10

    },
    "train": {
            "solver": "liblinear",
            "max_iter": 1000,
            "random_state": 2022,
            "C": 0.01,
            "penalty": "l2",
    },
    "output": {
        "output_path": r"../data/exported_models",
    }
}

class Config:
    """Config class which contains data, train and model hyperparameters"""

    def __init__(self, data, train, output):
        self.data = data
        # self.datapath = os.path.
        self.train = train
        self.output = output

    @classmethod # using config to define constructor of the class
    def from_json(cls, cfg):
        """Creates config from json"""
        params = json.loads(json.dumps(cfg), object_hook=HelperDict)
        # init all class instance with data and train attributes
        return cls(params.data, params.train, params.output) 


class HelperDict(object):
    """Helper class to convert json into Python object"""
    def __init__(self, dict_):
        self.__dict__.update(dict_)


In [26]:
from abc import ABC, abstractmethod

class BaseModel(ABC):
    """Abstract Model class that is inherited to all models"""

    def __init__(self, cfg):
        self.config = Config.from_json(cfg)

    @abstractmethod
    def load_data(self):
        pass

    @abstractmethod
    def build(self):
        pass

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def evaluate(self):
        pass

import pandas as pd
from sklearn import model_selection, linear_model
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datetime import datetime

class DataLoader:
    """Data Loader class"""

    @staticmethod
    def load_data(data_config):
        """Loads dataset from path"""
        df = pd.read_csv(data_config.path) # data_config will have .path attribute
        return df

    @staticmethod
    def preprocess_data(data_config, dataset = None): #dataset -> df
        """ Preprocess and splits into training and test"""
        x = dataset[data_config.x]
        y = dataset[data_config.y]
        test_size = data_config.test_size
        random_state = data_config.random_state
        # splitting
        x_train, x_test, y_train, y_test = \
            model_selection.train_test_split(x, y, 
                                             test_size=test_size, 
                                             random_state=random_state)
        # vectorizing
        vectorizer = CountVectorizer(ngram_range = data_config.ngram_range, 
                                     min_df = data_config.min_df)
        # fit_transform on training texts: from text to sparse frequency fector embeddings
        # transform on test texts: using vector dimension from training set
        # Extract token counts out of raw text documents using the vocabulary
        # fitted with fit or the one provided to the constructor.
        # vectorizer.vocabulary_: A mapping of terms to feature indices.
        X_train = vectorizer.fit_transform(x_train)
        X_test = vectorizer.transform(x_test)
        Y_train = np.array(y_train)
        Y_test = np.array(y_test)

        return x, y, x_train, x_test, y_train, y_test, X_train, X_test, Y_train, Y_test, vectorizer


class LogisticRegressionTrainer():
    def __init__(self, model, X_train, Y_train):
        self.model = model
        self.X_train = X_train
        self.Y_train = Y_train
    def train(self):
        self.model.fit(self.X_train, self.Y_train)


class YelpLogisticRegression(BaseModel):
    """Logistic regression model"""

    def __init__(self, config):
        super().__init__(config)

    def load_data(self):
        """Loads and preprocess data to inputs accepted by the model"""
        self.dataset = DataLoader().load_data(self.config.data)
        self.x, self.y, \
            self.x_train, self.x_test, self.y_train, self.y_test,\
            self.X_train, self.X_test, self.Y_train, self.Y_test, self.vectorizer =  \
            DataLoader().preprocess_data(data_config = self.config.data, 
                                           dataset = self.dataset)

    def build(self):
        """Builds the model"""
        # as the model architecture is vanilla logistic regression, there is no customization here
        # this is useful when you want to customize layers in a deep learning model 
        self.model = linear_model.LogisticRegression()
        print('Logistic Regression model built')

    def train(self):
        """Compiles and trains the model with configured training hyper-parameters"""
        print('Set training hyper parameters')
        self.model = linear_model.LogisticRegression(
            solver = self.config.train.solver, 
            max_iter = self.config.train.max_iter, 
            random_state = self.config.train.random_state, 
            C = self.config.train.C,
            penalty = self.config.train.penalty)
        print('Training is started')
        start_time = datetime.now()
        trainer = LogisticRegressionTrainer(
            model = self.model, 
            X_train=self.X_train, 
            Y_train=self.Y_train)
        trainer.train()
        end_time = datetime.now()
        training_time = (end_time - start_time).total_seconds()
        print(f'Training is completed in {"{:.2f}".format(training_time)} seconds')
    
    def evaluate(self):
        """Predicts resuts for the test dataset"""
        self.Y_test_pred = self.model.predict(self.X_test)
        self.Y_test_pred_proba = self.model.predict_proba(self.X_test)
        print('Model evaluation on test set completed, check model attributes for results')

    def evaluate_document(self, document: str):
        """Predicts the rating for an input string given a trained model"""
        document_embedding = self.vectorizer.transform([document]) # vectorizer expects a list of strings
        return self.model.predict_proba(document_embedding), self.model.predict(document_embedding)



In [27]:
model = YelpLogisticRegression(CFGLog)
model.load_data()
model.build()
model.train()

Logistic Regression model built
Set training hyper parameters
Training is started
Training is completed in 1.51 seconds


In [28]:
document = r"Was it worth the for a salad and small pizza Absolutely not Bad service. Maybe the guys grandma died I don't know. I want to tell you what really made me mad about the experience. We order the small pizza and salad and the guys could have cared less and took our $ and we sat down. We were looking around and hmm, there's a sign saying x large pizza and large salad only 23. Wow that would have been nice if the guy told us that. I left hungry, mad and unsatisfied. To the owner: teach your employees the value of upselling and telling the specials. Something so small can affect a customers experience negatively. And your salads are severely overpriced Won't go back unless I'm desperate."
model.evaluate_document(document)

(array([[0.1733107 , 0.34590901, 0.18530299, 0.1227306 , 0.1727467 ]]),
 array([2], dtype=int64))

In [46]:
import datetime
import os
import pickle


class ModelSaving(object):
    
    @staticmethod
    def get_current_timestamp():
        now = datetime.datetime.now()
        return now.strftime("%Y%m%d_%H%M%S")

    # @staticmethod
    # def save_model_with_timestamp(model, output_config):
    #     filename = ModelSaving.get_current_timestamp() + '_LogReg' + '.pickle'
    #     filepath = os.path.join(output_config, filename)
    #     pickle.dump(model, open(filepath, 'wb'))
    #     return print('Saved model to: ', filepath)
    
    @staticmethod
    def save_model_with_timestamp(vectorizer, model, output_config):
        filename = ModelSaving.get_current_timestamp() + '_LogReg' + '.pickle'
        filepath = os.path.join(output_config, filename)
        with open(filepath, 'wb') as fout:
            pickle.dump((vectorizer, model), fout)
        return print('Saved model to: ', filepath)

In [49]:
config = Config.from_json(CFGLog)
output_config = config.output.output_path
print(output_config)
# need to pickle the object of <class 'sklearn.linear_model._logistic.LogisticRegression'>
ModelSaving().save_model_with_timestamp(model.vectorizer, model.model, output_config) 
print(type(model.model))

../data/exported_models
Saved model to:  ../data/exported_models\20230217_142422_LogReg.pickle
<class 'sklearn.linear_model._logistic.LogisticRegression'>


In [51]:
with open('../data/exported_models/20230217_142422_LogReg.pickle', 'rb') as f:
    vectorizer, model = pickle.load(f)
document = r"Was it worth the for a salad and small pizza Absolutely not Bad service. Maybe the guys grandma died I don't know. I want to tell you what really made me mad about the experience. We order the small pizza and salad and the guys could have cared less and took our $ and we sat down. We were looking around and hmm, there's a sign saying x large pizza and large salad only 23. Wow that would have been nice if the guy told us that. I left hungry, mad and unsatisfied. To the owner: teach your employees the value of upselling and telling the specials. Something so small can affect a customers experience negatively. And your salads are severely overpriced Won't go back unless I'm desperate."
document_embedding = vectorizer.transform([document])
print(model.predict(document_embedding))

[2]


In [52]:
config = Config.from_json(CFGLog)
output_config = config.output.output_path
os.listdir(output_config)

['20230217_143059_LogReg.pickle']

In [13]:
x_test = model.x_test
X_test = model.X_test
Y_test = model.Y_test
Y_test_pred_proba = model.model.predict_proba(X_test)
Y_test_pred = model.model.predict(X_test)
print(Y_test_pred_proba)
print(Y_test_pred)
print(Y_test)

[[0.29999692 0.32410897 0.11531732 0.12218435 0.13839243]
 [0.00330758 0.00528659 0.11448401 0.64093983 0.23598199]
 [0.29684838 0.28430779 0.18279811 0.19443509 0.04161063]
 ...
 [0.10461121 0.31665087 0.34181183 0.20045846 0.03646764]
 [0.00259873 0.00360898 0.02670638 0.47526981 0.49181609]
 [0.00458105 0.02761443 0.36196598 0.60364872 0.00218984]]
[2 4 1 ... 3 5 4]
[2 4 2 ... 2 5 2]


In [53]:
x_test

6487    I have ordered here before (General's chicken)...
8785    I just went here for lunch.  I really have not...
7390    Went here for dinner, there were only about 6 ...
7078    Fuego Bistro is one of my new favorites! A cou...
1230    Good Starbucks in the Happy Valley Shopping Ce...
                              ...                        
3963    I walked here in the heat and was immediately ...
5047    What a great property. Only had been to the ba...
6539    I feel bad giving this 2 stars but I'm basing ...
6433    I really adore this place. They've got some of...
3372    I want to give the Cornish Pasty Company anoth...
Name: text, Length: 2000, dtype: object

In [57]:
Y_test

array([2, 4, 2, ..., 2, 5, 2], dtype=int64)

In [54]:
x_test[8785]

"I just went here for lunch.  I really have nothing to add to the other reviews on here because they pretty much all sum it up.  It gets crowded, the ordering process is a clusterfuk, they place looks sketchy and dirty, the staff is very friendly and helpful, yada yada.  The place isn't worth all the hype but definitely is worth a visit if you are in the mood for something unique.  Its not the healthiest or highest quality cuisine, but it is a pretty solid place and worth a shot.\n\n* Jerk Chicken = very good\n* Pollo Diablo = spicy and awesome and my stomach is going to hate me later\n* Jade Red Chicken = excellent\n* Pork Fried Rice = very good\n* Red Salsa Stuff = excellent\n* Refried Beans = standard\n* Snickerdoodle Cookie = awesome"

In [32]:
model = YelpLogisticRegression(CFGLog)
model.load_data()
model.x_train

5516    I think this place is hyped up a little more t...
1255    I went here Saturday night with a party of 4. ...
4319    Good place to have a beer, duck fart, or whate...
9372    Given the high ratio of staff to customers, it...
1723    Great customer service, perfect dining area! T...
                              ...                        
6384    As an Irish lass thru and thru, the promise of...
4720    The drive to the top is really fun, and the vi...
173     Ok, so I'm catching up on past-due reviews.  F...
1244    The Good: We absolutely love their wings and i...
4989    Pretty good coffee. Nothing special but better...
Name: text, Length: 8000, dtype: object

In [13]:
# sample review document
model.x_train[5516]

"I think this place is hyped up a little more than it should be. I've tired their bruschetta, pizzas and salads. They were decent and I have no complaints but nothing to rave about. A decent place to pop in to for lunch since they have counter service that you order at."

In [8]:
# document vector
model.X_train.toarray()[5516]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
# vocabulary from training data X_train
# key: value = term: index
vocab_dict = model.vectorizer.vocabulary_
# obtain the vocab index of a word in the sample document
print(vocab_dict['pizzas'])
# finding the word (key) in the document with this vocab index (value)
keys = [k for k, v in vocab_dict.items() if v == 10642]
print(keys)
# alternatively
print(list(vocab_dict.keys())[list(vocab_dict.values()).index(10642)])

10642
['pizzas']
pizzas


In [9]:
# indices of words appearing at least once:
non_zeros = list(np.nonzero(model.X_train.toarray()[5516])[0])
len(non_zeros)

42

In [37]:
model.build()
print(type(model.model))
model.model

Logistic Regression model built
<class 'sklearn.linear_model._logistic.LogisticRegression'>


LogisticRegression(C=0.01, max_iter=1000, random_state=2022, solver='liblinear')

In [40]:
config = Config.from_json(CFGLog) # initiate Config class with class method
print(type(config))
data_config = config.data # define data_config from Config class, containing data config info
print(type(data_config))
DataLoader().load_data(data_config).head(2) # using data config info to load actual data
data_config_path = data_config.path
print(data_config_path)
data_config_x = data_config.x
print(data_config_x)
data_config_y = data_config.y
print(data_config_y)


<class '__main__.Config'>
<class '__main__.HelperDict'>
../data/yelp.csv
text
stars


In [114]:
class MyClass:
    def method(self):
        return 'instance method called', self 
        # 'self' is replaced and returns the object in memory

    @classmethod
    def classmethod(cls):
        return 'class method called', cls

    @staticmethod
    def staticmethod():
        return 'static method called'

In [121]:
obj = MyClass()
# an instance can call instance method:
obj.method()

# calling instance method without initiating an instance 
# TypeError, 'self' is missing, as there is no instance
# MyClass.method()



('instance method called', <__main__.MyClass at 0x1de4dada1f0>)

In [123]:
# an instance can call its class method:
obj.classmethod()

('class method called', __main__.MyClass)

In [124]:
# an instance can call the class's static method:
obj.staticmethod()

'static method called'

Another way to look at this use of class methods is that they allow you to define alternative constructors for your classes. <br>

Python only allows one `__init__` method per class. Using class methods it’s possible to add as many alternative constructors as necessary. This can make the interface for your classes self-documenting (to a certain degree) and simplify their usage.

In [142]:
class Pizza:
    def __init__(self, ingredients):
        self.ingredients = ingredients

    # def pizza(self):
    #     return self(self.ingredients)
    
    def __repr__(self):
        return f'Your Pizza({self.ingredients})!'

    @classmethod
    def margherita(cls):
        return cls(['mozzarella', 'tomatoes'])

    @classmethod
    def prosciutto(cls):
        return cls(['mozzarella', 'tomatoes', 'ham'])

In [143]:
Pizza.margherita()
print(Pizza.prosciutto())


Your Pizza(['mozzarella', 'tomatoes', 'ham'])!
