In [1]:
import json

In [11]:
CFGLog = {
    "data": {
        "path": "../data/yelp.csv",
        "x": "text",
        "y": "stars",
        "test_size": 0.2,
        "random_state": 2022,
        "ngram_range": (1, 2),
        "min_df": 10

    },
    "train": {
            "solver": "liblinear",
            "max_iter": 1000,
            "random_state": 2022,
            "C": 0.01,
            "penalty": "l2",
    },
    "output": {
        "output_path": "../data/",
    }
}

class Config:
    """Config class which contains data, train and model hyperparameters"""

    def __init__(self, data, train, output):
        self.data = data
        # self.datapath = os.path.
        self.train = train
        self.output = output

    @classmethod # using config to define constructor of the class
    def from_json(cls, cfg):
        """Creates config from json"""
        params = json.loads(json.dumps(cfg), object_hook=HelperDict)
        # init all class instance with data and train attributes
        return cls(params.data, params.train, params.output) 


class HelperDict(object):
    """Helper class to convert json into Python object"""
    def __init__(self, dict_):
        self.__dict__.update(dict_)


In [23]:
from abc import ABC, abstractmethod

class BaseModel(ABC):
    """Abstract Model class that is inherited to all models"""

    def __init__(self, cfg):
        self.config = Config.from_json(cfg)

    @abstractmethod
    def load_data(self):
        pass

    # @abstractmethod
    # def build(self):
    #     pass

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def evaluate(self):
        pass

import pandas as pd
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

class DataLoader:
    """Data Loader class"""

    @staticmethod
    def load_data(data_config):
        """Loads dataset from path"""
        df = pd.read_csv(data_config.path) # data_config will have .path attribute
        return df

    # @staticmethod
    # def preprocess_data(data_config, dataset = None, test_size = 0.2, random_state = 2022): #dataset -> df
    #     """ Preprocess and splits into training and test"""
    #     x = dataset[data_config.x]
    #     y = dataset[data_config.y]
    #     x_train, x_test, y_train, y_test = \
    #         model_selection.train_test_split(x, y, test_size=test_size, random_state=random_state)
    #     return x, y, x_train, x_test, y_train, y_test

    @staticmethod
    def preprocess_data(data_config, dataset = None): #dataset -> df
        """ Preprocess and splits into training and test"""
        x = dataset[data_config.x]
        y = dataset[data_config.y]
        test_size = data_config.test_size
        random_state = data_config.random_state
        # splitting
        x_train, x_test, y_train, y_test = \
            model_selection.train_test_split(x, y, 
                                             test_size=test_size, 
                                             random_state=random_state)
        # vectorizing
        vectorizer = CountVectorizer(ngram_range = data_config.ngram_range, 
                                     min_df = data_config.min_df)
        # fit_transform on training texts: from text to sparse frequency fector embeddings
        # transform on test texts: using vector dimension from training set
        # Extract token counts out of raw text documents using the vocabulary
        # fitted with fit or the one provided to the constructor.
        # vectorizer.vocabulary_: A mapping of terms to feature indices.
        X_train = vectorizer.fit_transform(x_train)
        X_test = vectorizer.transform(x_test)
        Y_train = np.array(y_train)
        Y_test = np.array(y_test)

        return x, y, x_train, x_test, y_train, y_test, X_train, X_test, Y_train, Y_test, vectorizer


class YelpLogisticRegression(BaseModel):
    """Unet Model Class"""

    def __init__(self, config):
        super().__init__(config)

    def load_data(self):
        self.dataset = DataLoader().load_data(self.config.data)
        self.x, self.y, \
            self.x_train, self.x_test, self.y_train, self.y_test,\
            self.X_train, self.X_test, self.Y_train, self.Y_test, self.vectorizer =  \
            DataLoader().preprocess_data(data_config = self.config.data, 
                                           dataset = self.dataset)

    # def build(self):
    #     return 'built model'
    def train(self):
        return 'trained model'
    def evaluate(self):
        return 'evaluated model'



In [60]:
model = YelpLogisticRegression(CFGLog)
model.load_data()
model.x_train

5516    I think this place is hyped up a little more t...
1255    I went here Saturday night with a party of 4. ...
4319    Good place to have a beer, duck fart, or whate...
9372    Given the high ratio of staff to customers, it...
1723    Great customer service, perfect dining area! T...
                              ...                        
6384    As an Irish lass thru and thru, the promise of...
4720    The drive to the top is really fun, and the vi...
173     Ok, so I'm catching up on past-due reviews.  F...
1244    The Good: We absolutely love their wings and i...
4989    Pretty good coffee. Nothing special but better...
Name: text, Length: 8000, dtype: object

In [74]:
model.vectorizer.vocabulary_

{'think': 14610,
 'this': 14638,
 'place': 10647,
 'is': 7117,
 'up': 15516,
 'little': 8067,
 'more': 8854,
 'than': 13327,
 'it': 7318,
 'should': 12164,
 'be': 1793,
 've': 15658,
 'tired': 14881,
 'their': 14303,
 'bruschetta': 2381,
 'pizzas': 10642,
 'and': 594,
 'salads': 11683,
 'they': 14484,
 'were': 16445,
 'decent': 3659,
 'have': 6257,
 'no': 9285,
 'complaints': 3244,
 'but': 2467,
 'nothing': 9441,
 'to': 14885,
 'rave': 11164,
 'about': 104,
 'pop': 10830,
 'in': 6884,
 'for': 5107,
 'lunch': 8293,
 'since': 12241,
 'counter': 3407,
 'service': 12012,
 'that': 13362,
 'you': 17174,
 'order': 10042,
 'at': 1494,
 'think this': 14625,
 'this place': 14689,
 'place is': 10672,
 'up little': 15536,
 'little more': 8078,
 'more than': 8872,
 'than it': 13336,
 'it should': 7460,
 'should be': 12165,
 'pizzas and': 10643,
 'and salads': 953,
 'they were': 14569,
 'were decent': 16461,
 'decent and': 3660,
 'and have': 787,
 'have no': 6326,
 'no complaints': 9289,
 'but nothi

In [61]:
model.x_train[5516]

"I think this place is hyped up a little more than it should be. I've tired their bruschetta, pizzas and salads. They were decent and I have no complaints but nothing to rave about. A decent place to pop in to for lunch since they have counter service that you order at."

In [102]:
model.vectorizer.vocabulary_['pizzas']

10642

In [101]:
# document vector
model.X_train.toarray()[5516]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [100]:
# indices of words appearing at least once:
non_zeros = list(np.nonzero(model.X_train.toarray()[5516])[0])
len(non_zeros)

42

In [103]:
print(list(model.vectorizer.vocabulary_.keys())[list(model.vectorizer.vocabulary_.values()).index(10642)])

pizzas


In [40]:
config = Config.from_json(CFGLog) # initiate Config class with class method
print(type(config))
data_config = config.data # define data_config from Config class, containing data config info
print(type(data_config))
DataLoader().load_data(data_config).head(2) # using data config info to load actual data
data_config_path = data_config.path
print(data_config_path)
data_config_x = data_config.x
print(data_config_x)
data_config_y = data_config.y
print(data_config_y)


<class '__main__.Config'>
<class '__main__.HelperDict'>
../data/yelp.csv
text
stars


In [11]:
type(model.x)

pandas.core.series.Series

In [114]:
class MyClass:
    def method(self):
        return 'instance method called', self 
        # 'self' is replaced and returns the object in memory

    @classmethod
    def classmethod(cls):
        return 'class method called', cls

    @staticmethod
    def staticmethod():
        return 'static method called'

In [121]:
obj = MyClass()
# an instance can call instance method:
obj.method()

# calling instance method without initiating an instance 
# TypeError, 'self' is missing, as there is no instance
# MyClass.method()



('instance method called', <__main__.MyClass at 0x1de4dada1f0>)

In [123]:
# an instance can call its class method:
obj.classmethod()

('class method called', __main__.MyClass)

In [124]:
# an instance can call the class's static method:
obj.staticmethod()

'static method called'

Another way to look at this use of class methods is that they allow you to define alternative constructors for your classes. <br>

Python only allows one `__init__` method per class. Using class methods it’s possible to add as many alternative constructors as necessary. This can make the interface for your classes self-documenting (to a certain degree) and simplify their usage.

In [142]:
class Pizza:
    def __init__(self, ingredients):
        self.ingredients = ingredients

    # def pizza(self):
    #     return self(self.ingredients)
    
    def __repr__(self):
        return f'Your Pizza({self.ingredients})!'

    @classmethod
    def margherita(cls):
        return cls(['mozzarella', 'tomatoes'])

    @classmethod
    def prosciutto(cls):
        return cls(['mozzarella', 'tomatoes', 'ham'])

In [143]:
Pizza.margherita()
print(Pizza.prosciutto())


Your Pizza(['mozzarella', 'tomatoes', 'ham'])!
