In [21]:
import sklearn.utils
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [39]:
class Builder_Creation:
    
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def get_subsample(self, df_share):
        """
        1. Copy train dataset
        2. Shuffle data (don't miss the connection between X_train and y_train)
        3. Return df_share %-subsample of X_train and y_train
        """
        n_samples = int(df_share / 100 * len(self.X_train))
        print("number of observations used for training = ", n_samples)
        X_train_sub, y_train_sub = shuffle(self.X_train, self.y_train, random_state=42, n_samples=n_samples)
        return X_train_sub, y_train_sub

    
if __name__ == "__main__":
    """
    1. Load iris dataset
    2. Shuffle data and divide into train / test.
    """

    dataset = datasets.load_iris()
    features = dataset.data
    targets = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(features, targets)
    pipe_lr = make_pipeline(StandardScaler(), LinearRegression())
    
    pattern_item = Builder_Creation(X_train, y_train)
    
    for df_share in range(10, 101, 10):
        """
        1. Preprocess curr_X_train, curr_y_train in the way you want
        2. Train Linear Regression on the subsample
        3. Save or print the score to check how df_share affects the quality
        """
        curr_X_train, curr_y_train = pattern_item.get_subsample(df_share)
        pipe_lr = LinearRegression().fit(curr_X_train, curr_y_train)
        pipe_lr.fit(curr_X_train, curr_y_train)
        y_pred_test = pipe_lr.predict(X_test)
        mse = metrics.mean_squared_error(y_test, y_pred_test)        
        print(f'Score for {df_share}%: {mse}')

number of observations used for training =  11
Score for 10%: 0.0754281385286842
number of observations used for training =  22
Score for 20%: 0.05711922218712858
number of observations used for training =  33
Score for 30%: 0.061250197185758755
number of observations used for training =  44
Score for 40%: 0.05634600896238332
number of observations used for training =  56
Score for 50%: 0.05917650323357551
number of observations used for training =  67
Score for 60%: 0.055275408731475364
number of observations used for training =  78
Score for 70%: 0.05446147878810882
number of observations used for training =  89
Score for 80%: 0.05355228831140269
number of observations used for training =  100
Score for 90%: 0.05313063624422463
number of observations used for training =  112
Score for 100%: 0.051750199610600275


In [38]:
class Decorator_Structure:
    def __init__(self, classifier_list) -> None:
        """
        Initialize a class item with a list of classificators
        """
        self.classifier_list = classifier_list
        

    def fit(self, feature_matrix, response):
        """
        Fit classifiers from the initialization stage
        """
        for c in self.classifier_list:
            c.fit(feature_matrix, response)

    def predict(self, feature_matrix):
        
        """
        Get predicts from all the classifiers and return
        the most popular answers
        """
        y_pred_dict = {}
        for c in self.classifier_list:
            y_pred = c.predict(feature_matrix)
            y_pred_dict[c] = y_pred
        return y_pred_dict
        


if __name__ == "__main__":
    """
    1. Load iris dataset
    2. Shuffle data and divide into train / test.
    3. Prepare classifiers to initialize <StructuralPatternName> class.
    4. Train the ensemble
    """
    dataset = datasets.load_iris()
    features = dataset.data
    targets = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(features, targets)
    classifier_list = [
        DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42),
        ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10)
    ]
    
    ensemble = Decorator_Structure(classifier_list)    
    ensemble.fit(X_train, y_train)
    
    y_pred_dict = ensemble.predict(X_test)
    list_of_lists = []
    for k, v in y_pred_dict.items():
        list_of_lists.append(list(v))
    nume = np.array([sum(i) for i in zip(list_of_lists[0], list_of_lists[1], list_of_lists[2], list_of_lists[3])])
    deno = len(classifier_list)
    y_pred = np.floor(nume / deno)
    acc = accuracy_score(y_test, y_pred)
    print(f"Score with ensemble = {acc}")

Score with ensemble = 0.9210526315789473
