In [315]:
import pandas as pd
import numpy as np

In [316]:
df = pd.read_csv("../raw_dataframe.csv", encoding="cp1252")
df.head()

Unnamed: 0,name,price,rating,country,region,sweetness,grape,manufacturer,strength,volume
0,"Vino Tracer Riesling, Weinkellerei Hechtsheim,...",1312.0,4.7,Germanija,Pfal'ts,semi-dry,risling 100%,Weinkellerei Hechtsheim,12.0,0.75
1,"Vino Lighea, Donnafugata, 2021",4990.0,4.8,Italija,Sitsilija,dry,zibibbo 100%,Donnafugata,12.5,0.75
2,"Vino Chenin Blanc, David & Nadia, 2022",6790.0,4.5,Juzhnaja Afrika,Svortlend,dry,shenen blan 100%,David & Nadia,12.5,0.75
3,"Vino Pinot Noir Alpine Vineyard, Rhys Vineyard...",34990.0,5.0,Soedinennye Shtaty Ameriki,Kalifornija,dry,pino nuar,Rhys Vineyards,12.9,0.75
4,"Vino Grain de Gris, Listel, 2022",1393.0,4.6,Frantsija,Langedok-Russil'on,dry,sira,Listel,12.0,0.75


In [317]:
df.dropna(inplace=True)

In [318]:
df["grape"].head()

0        risling 100%
1        zibibbo 100%
2    shenen blan 100%
3           pino nuar
4                sira
Name: grape, dtype: object

In [319]:
def process_grapes(grape: str) -> str:
    if grape.endswith("%"):
        return ' '.join(grape.split()[:-1]).lower()

    return grape


df["grape"] = df["grape"].map(process_grapes)
df.head()

Unnamed: 0,name,price,rating,country,region,sweetness,grape,manufacturer,strength,volume
0,"Vino Tracer Riesling, Weinkellerei Hechtsheim,...",1312.0,4.7,Germanija,Pfal'ts,semi-dry,risling,Weinkellerei Hechtsheim,12.0,0.75
1,"Vino Lighea, Donnafugata, 2021",4990.0,4.8,Italija,Sitsilija,dry,zibibbo,Donnafugata,12.5,0.75
2,"Vino Chenin Blanc, David & Nadia, 2022",6790.0,4.5,Juzhnaja Afrika,Svortlend,dry,shenen blan,David & Nadia,12.5,0.75
3,"Vino Pinot Noir Alpine Vineyard, Rhys Vineyard...",34990.0,5.0,Soedinennye Shtaty Ameriki,Kalifornija,dry,pino nuar,Rhys Vineyards,12.9,0.75
4,"Vino Grain de Gris, Listel, 2022",1393.0,4.6,Frantsija,Langedok-Russil'on,dry,sira,Listel,12.0,0.75


In [320]:
def get_rating_range(rating: float) -> int:
    ranges = [
        (0, 1),
        (1, 2),
        (2, 3),
        (3, 4),
        (4, 5)
    ]

    for idx, rang in enumerate(ranges):
        if rang[0] <= rating < rang[1]:
            return idx + 1

    return 5 # last range is closed [4, 5] but others are half-open ( [0, 1) for example)

In [321]:
df["rating"] = df["rating"].map(get_rating_range)
df.tail(10)

Unnamed: 0,name,price,rating,country,region,sweetness,grape,manufacturer,strength,volume
3540,"Vino Primofiore, Giuseppe Quintarelli, 2021",16490.0,5,Italija,Veneto,dry,kaberne fran,Giuseppe Quintarelli,14.0,0.75
3541,"Vino Villa Solais, Santadi, 2019",2790.0,4,Italija,Sardinija,dry,nuragus,Santadi,13.0,0.75
3542,"Vino Chateau Ferriere, 2020",12490.0,5,Frantsija,Bordo,dry,kaberne fran,Chateau Ferriere,13.5,0.75
3543,"Vino Chateau La Lagune, 2017",12490.0,5,Frantsija,Bordo,dry,pti verdo,Chateau La Lagune,13.5,0.75
3544,"Vino Gevrey-Chambertin Clos du Couvent, Domain...",18490.0,1,Frantsija,Burgundija,dry,pino nuar,Domaine de Varoilles,13.0,0.75
3545,"Vino Bourgogne Kimmeridgien, Jean-Marc Brocard...",4290.0,5,Frantsija,Burgundija,dry,shardone,Jean-Marc Brocard (Domaine Sainte-Claire),13.0,0.75
3546,"Vino Riesling Kalkmergel, Knipser, 2021",5490.0,5,Germanija,Pfal'ts,dry,risling,Knipser,12.5,0.75
3547,"Vino Pouilly-Fume La Demoiselle de Bourgeois, ...",9490.0,5,Frantsija,Dolina Luary,dry,sovin'on blan,Henri Bourgeois,14.0,0.75
3548,"Vino Semisam Shardone/Sovin'on Blan, Shumrinka...",990.0,5,Rossija,Kuban',dry,sovin'on blan,Shumrinka,12.8,0.75
3549,"Vino Riesling Ried Loibenberg Smaragd, Emmeric...",13990.0,5,Avstrija,Nizhnjaja Avstrija,dry,risling,Emmerich Knoll,13.0,0.75


In [322]:
def train_test_split(dataframe: pd.DataFrame, test_size: float = 0.2) -> tuple:
    test = dataframe.sample(frac=test_size)
    train = dataframe.drop(test.index)
    return train, test

TARGET = "rating"

In [323]:
train, test = train_test_split(df.drop(["name"], axis=1))
train.head()

Unnamed: 0,price,rating,country,region,sweetness,grape,manufacturer,strength,volume
2,6790.0,5,Juzhnaja Afrika,Svortlend,dry,shenen blan,David & Nadia,12.5,0.75
3,34990.0,5,Soedinennye Shtaty Ameriki,Kalifornija,dry,pino nuar,Rhys Vineyards,12.9,0.75
5,23490.0,5,Frantsija,Burgundija,dry,pino nuar,Domaine Antonin Guyon,14.0,0.75
6,4490.0,5,Italija,Sitsilija,dry,kaberne sovin'on,Tasca d'Almerita,14.0,0.75
9,39990.0,5,Italija,Veneto,dry,kaberne fran,Giuseppe Quintarelli,14.0,1.5


In [324]:
test.head()

Unnamed: 0,price,rating,country,region,sweetness,grape,manufacturer,strength,volume
1908,3490.0,5,Rossija,Kuban',dry,krasnostop,Galitskij i Galitskij,13.0,0.75
1050,9490.0,5,Frantsija,Burgundija,dry,shardone,William Fevre,12.5,0.75
1510,1343.0,5,Juzhnaja Afrika,Stellenbosh,dry,merlo,Simonsig,13.5,0.75
734,6790.0,1,Frantsija,Bordo,dry,kaberne fran,Domaine Clarence Dillon,13.5,0.75
2006,2240.0,5,Italija,Veneto,semi-dry,pino gridzhio,Domini Veneti,12.0,0.75


In [325]:
df.head()

Unnamed: 0,name,price,rating,country,region,sweetness,grape,manufacturer,strength,volume
0,"Vino Tracer Riesling, Weinkellerei Hechtsheim,...",1312.0,5,Germanija,Pfal'ts,semi-dry,risling,Weinkellerei Hechtsheim,12.0,0.75
1,"Vino Lighea, Donnafugata, 2021",4990.0,5,Italija,Sitsilija,dry,zibibbo,Donnafugata,12.5,0.75
2,"Vino Chenin Blanc, David & Nadia, 2022",6790.0,5,Juzhnaja Afrika,Svortlend,dry,shenen blan,David & Nadia,12.5,0.75
3,"Vino Pinot Noir Alpine Vineyard, Rhys Vineyard...",34990.0,5,Soedinennye Shtaty Ameriki,Kalifornija,dry,pino nuar,Rhys Vineyards,12.9,0.75
4,"Vino Grain de Gris, Listel, 2022",1393.0,5,Frantsija,Langedok-Russil'on,dry,sira,Listel,12.0,0.75


In [326]:
df["country"] = df["country"].map(lambda x: x.lower())
df["region"] = df["region"].map(lambda x: x.lower())
df["manufacturer"] = df["manufacturer"].map(lambda x: x.lower())
print(f"Unique - country: {df['country'].nunique()}, region - {df['region'].nunique()}, manufacturer - {df.manufacturer.nunique()}")

Unique - country: 19, region - 78, manufacturer - 421


In [327]:
columns_to_split = list(df.columns)
columns_to_split.remove("manufacturer")
columns_to_split.remove("name")
columns_to_split.remove("rating")
print(columns_to_split)

['price', 'country', 'region', 'sweetness', 'grape', 'strength', 'volume']


In [328]:
# for (idx, column_name) in enumerate(df.columns):
#     if column_name in columns_to_split:
#         print(idx)

dataset = df.to_numpy(copy=True)
y = dataset[:, 2]
y

array([5, 5, 5, ..., 5, 5, 5], dtype=object)

In [329]:
categorical_features = ["country", "region", "sweetness", "grape", "manufacturer"]


class Node:
    """
    Represents a Node in Decision Tree
    """
    def __init__(self, feature = None, threshold = None, left = None, right = None, gain = None, value = None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.gain = gain
        self.value = value

In [330]:
def validate_arguments(criterion: str):
    if criterion not in ["gini", "entropy"]:
        raise RuntimeError("Criterion must be either 'gini' or 'entropy'")


class DecisionTree:
    def __init__(self, max_depth: int = 5, min_samples: int = 2, criterion: str = "gini"):
        self.root = None
        validate_arguments(criterion)
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.criterion = criterion

    def split_data(self, dataset: np.ndarray, feature: int, threshold):
        left_dataset = []
        right_dataset = []

        def check_numeric(value: float):
            return value <= threshold

        def check_categorical(value):
            return value in threshold

        check = check_categorical if feature in categorical_features else check_numeric

        for row in dataset:
            if check(row[feature]):
                left_dataset.append(row)
            else:
                right_dataset.append(row)

        left_dataset = np.array(left_dataset, dtype=object)
        right_dataset = np.array(right_dataset, dtype=object)
        return left_dataset, right_dataset

    def entropy(self, y):
        entropy = 0

        labels = np.unique(y)
        for label in labels:
            label_examples = y[y == label]
            pl = len(label_examples) / len(y)
            entropy += -pl * np.log2(pl)

        return entropy

    def gini(self, y):
        sm = 0

        labels = np.unique(y)
        for label in labels:
            label_examples = y[y == label]
            freq = len(label_examples) / len(y)
            sm += freq ** 2

        return 1 - sm

    def information_gain(self, parent, left, right):
        entropy_function = self.entropy if self.criterion == "entropy" else self.gini
        parent_entropy = entropy_function(parent)
        left_entropy = entropy_function(left)
        right_entropy = entropy_function(right)

        left_weight = len(left) / len(parent)
        right_weight = len(right) / len(parent)

        weighted = left_weight * left_entropy + right_weight * right_entropy
        information_gain = parent_entropy - weighted
        return information_gain

    def best_split(self, dataset):
        best_params = {
            "gain": -1,
            "feature": None,
            "threshold": None
        }

        for (idx, feature) in enumerate(columns_to_split):
            feature_values = dataset[:, idx]
            thresholds = np.unique(feature_values)

            for threshold in thresholds:
                left, right = self.split_data(dataset, idx, threshold)
                if len(left) and len(right):
                    y, yleft, yright = dataset[:, -1], left[:, -1], right[:, -1]
                    info_gain = self.information_gain(y, yleft, yright)
                    if info_gain > best_params["gain"]:
                        best_params["gain"] = info_gain
                        best_params["feature"] = idx
                        best_params["threshold"] = threshold
                        best_params["left"] = left
                        best_params["right"] = right

        return best_params

    def leaf_value(self, y):
        y = list(y)
        return max(y, key=y.count)

    def build(self, dataset: np.ndarray, current_depth = 0):
        y = dataset[:, -1]
        X = np.delete(dataset, -1, axis=1)

        n_samples, n_features = X.shape
        if n_samples >= self.min_samples and current_depth <= self.max_depth:
            best_split_params = self.best_split(dataset)
            if best_split_params["gain"] > 0:
                left_node = self.build(best_split_params["left"], current_depth + 1)
                right_node = self.build(best_split_params["right"], current_depth + 1)

                return Node(best_split_params["feature"], best_split_params["threshold"], left_node, right_node, best_split_params["gain"])

        value = self.leaf_value(y)
        return Node(value=value)

    def fit(self, X, y):
        dataset = np.column_stack((X, y))
        self.root = self.build(dataset)

    def predict(self, X):
        predictions = []
        for x in X:
            prediction = self.make_prediction(x, self.root)
            predictions.append(prediction)

        return np.array(predictions)

    def make_prediction(self, x, node):
        if node.value is not None:
            return node.value

        feature = x[node.feature]
        threshold = node.threshold
        def check_numeric(value: float):
            return value <= threshold

        def check_categorical(value):
            return value in threshold

        check = check_categorical if feature in categorical_features else check_numeric

        if check(feature):
            return self.make_prediction(x, node.left)
        return self.make_prediction(x, node.right)

In [331]:
from sklearn.model_selection import train_test_split

In [332]:
X = df.drop("rating", axis=1)
y = df["rating"]
X.head()

Unnamed: 0,name,price,country,region,sweetness,grape,manufacturer,strength,volume
0,"Vino Tracer Riesling, Weinkellerei Hechtsheim,...",1312.0,germanija,pfal'ts,semi-dry,risling,weinkellerei hechtsheim,12.0,0.75
1,"Vino Lighea, Donnafugata, 2021",4990.0,italija,sitsilija,dry,zibibbo,donnafugata,12.5,0.75
2,"Vino Chenin Blanc, David & Nadia, 2022",6790.0,juzhnaja afrika,svortlend,dry,shenen blan,david & nadia,12.5,0.75
3,"Vino Pinot Noir Alpine Vineyard, Rhys Vineyard...",34990.0,soedinennye shtaty ameriki,kalifornija,dry,pino nuar,rhys vineyards,12.9,0.75
4,"Vino Grain de Gris, Listel, 2022",1393.0,frantsija,langedok-russil'on,dry,sira,listel,12.0,0.75


In [333]:
X_train, X_test, y_train, y_test = list(map(lambda x: x.to_numpy(), train_test_split(X, y, test_size=0.2)))

In [334]:
type(X_train)

numpy.ndarray

In [335]:
decisionTree = DecisionTree(5, 2)
print(X_test.shape, y_test.shape)
decisionTree.fit(X_test, y_test)
predictions = decisionTree.predict(X_test)


(688, 9) (688,)


In [336]:
predictions

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 1, 1, 1, 5, 5, 5, 5, 5, 1, 1, 1, 5,
       1, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 1, 1, 5, 1, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 1, 1, 5, 5, 5,
       1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 1, 5, 5, 5,
       1, 1, 1, 1, 1, 5, 5, 5, 5, 1, 1, 5, 5, 1, 5, 5, 5, 1, 1, 5, 5, 5,
       5, 5, 1, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 1, 5, 5,
       5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 5,
       5, 1, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 1, 1, 5, 1, 5, 5, 5, 5,
       5, 1, 5, 4, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 5, 5, 1, 1, 1, 5, 5,
       5, 5, 5, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5,
       5, 5, 5, 1, 5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 1, 5, 5, 1, 1, 5, 1, 5,
       5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 1, 5, 5, 5, 5,

In [337]:
df.head()

Unnamed: 0,name,price,rating,country,region,sweetness,grape,manufacturer,strength,volume
0,"Vino Tracer Riesling, Weinkellerei Hechtsheim,...",1312.0,5,germanija,pfal'ts,semi-dry,risling,weinkellerei hechtsheim,12.0,0.75
1,"Vino Lighea, Donnafugata, 2021",4990.0,5,italija,sitsilija,dry,zibibbo,donnafugata,12.5,0.75
2,"Vino Chenin Blanc, David & Nadia, 2022",6790.0,5,juzhnaja afrika,svortlend,dry,shenen blan,david & nadia,12.5,0.75
3,"Vino Pinot Noir Alpine Vineyard, Rhys Vineyard...",34990.0,5,soedinennye shtaty ameriki,kalifornija,dry,pino nuar,rhys vineyards,12.9,0.75
4,"Vino Grain de Gris, Listel, 2022",1393.0,5,frantsija,langedok-russil'on,dry,sira,listel,12.0,0.75


In [338]:
from sklearn.metrics import accuracy_score, root_mean_squared_error

In [339]:
accuracy = accuracy_score(y_test, predictions)
rmse = root_mean_squared_error(y_test, predictions)

print(f"Accuracy score:          {accuracy}")
print(f"Root mean squared error: {rmse}")

Accuracy score:          0.9040697674418605
Root mean squared error: 1.1588888021781698


In [340]:
from scipy.stats import mode

class RandomForest:
    def __init__(self, n_trees, max_depth=5, min_samples=2, criterion='gini'):
        self.n_trees = n_trees
        self.trees = [DecisionTree(max_depth=max_depth, min_samples=min_samples, criterion=criterion)
                      for _ in range(n_trees)]

    def fit(self, X, y):
        n_samples, n_features = X.shape

        for tree in self.trees:
            indices = np.random.choice(range(n_samples), size=n_samples, replace=True)
            sample_X, sample_y = X[indices], y[indices]

            tree.fit(sample_X, sample_y)

    def predict(self, X):
        predictions = np.zeros((self.n_trees, X.shape[0]))

        for i, tree in enumerate(self.trees):
            predictions[i] = tree.predict(X)

        final_predictions = mode(predictions, axis=0).mode

        return final_predictions

In [343]:
import datetime

rf = RandomForest(n_trees=1_000)
start_time = datetime.datetime.now()
print(f"Fit-predict started at {start_time}")
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
duration = datetime.datetime.now() - start_time
print(f"Duration - {duration}")

Fit-predict started at 2024-12-18 03:38:50.482915
Duration - 3:45:39.797273


In [344]:
accuracy = accuracy_score(y_test, rf_predictions)
rmse = root_mean_squared_error(y_test, rf_predictions)

print(f"Accuracy score:          {accuracy}")
print(f"Root mean squared error: {rmse}")

Accuracy score:          0.8459302325581395
Root mean squared error: 1.4858831842407068


In [345]:
import numpy as np

class GradientBoost:
    def __init__(self, n_trees=100, learning_rate=0.1, max_depth=3, min_samples=2, criterion='gini'):
        self.n_trees = n_trees
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.criterion = criterion
        self.trees = []
        self.initial_pred = None

    def fit(self, X, y):
        self.initial_pred = np.log(np.mean(y) / (1 - np.mean(y)))
        y_pred = np.full(y.shape, self.initial_pred)

        for _ in range(self.n_trees):
            residuals = y - self.sigmoid(y_pred)

            tree = DecisionTree(max_depth=self.max_depth, min_samples=self.min_samples, criterion=self.criterion)
            tree.fit(X, residuals)
            self.trees.append(tree)

            update = tree.predict(X)
            y_pred += self.learning_rate * update

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def predict_proba(self, X):
        y_pred = np.full(shape=(X.shape[0],), fill_value=self.initial_pred)

        for tree in self.trees:
            update = tree.predict(X)
            y_pred += self.learning_rate * update

        proba = self.sigmoid(y_pred)
        return np.vstack([1 - proba, proba]).T

    def predict(self, X):
        proba = self.predict_proba(X)
        return (proba[:, 1] > 0.5).astype(int)

In [None]:
gb = GradientBoost()
gb.fit(X_train, y_train)

  self.initial_pred = np.log(np.mean(y) / (1 - np.mean(y)))


In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



def measure_tree_depth_sklearn(X_train, y_train):
    depths = []
    max_features_list = range(1, X_train.shape[1] + 1)

    for max_features in max_features_list:
        clf = DecisionTreeClassifier(max_features=max_features)
        clf.fit(X_train, y_train)
        depths.append(clf.tree_.max_depth)

    return max_features_list, depths

def evaluate_model_depth(model_class, X_train, y_train):
    depths = []
    accuracies_train = []
    accuracies_test = []
    max_depths = range(1, 21)  # Меняем глубину дерева от 1 до 20

    for max_depth in max_depths:
        model = model_class(max_depth=max_depth)
        model.fit(X_train, y_train)
        depths.append(max_depth)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        accuracies_train.append(accuracy_score(y_train, y_pred_train))
        accuracies_test.append(accuracy_score(y_test, y_pred_test))

    return depths, accuracies_train, accuracies_test

def plot_tree_depth_variation(X_train, y_train):
    _, depths_sklearn = measure_tree_depth_sklearn(X_train, y_train)
    plt.plot(depths_sklearn, label='Sklearn Tree Depth')


    # depths_custom, _, _ = evaluate_model_depth(DecisionTree, X_train, y_train)
    # plt.plot(depths_custom, label='Custom Tree Depth')

    plt.xlabel('Max Features')
    plt.ylabel('Tree Depth')
    plt.legend()
    plt.title('Tree Depth Variation')
    plt.show()

def plot_accuracy_vs_depth(X_train, y_train, X_test, y_test):
    depths_sklearn, acc_train_sklearn, acc_test_sklearn = evaluate_model_depth(DecisionTreeClassifier, X_train, y_train)
    plt.plot(depths_sklearn, acc_train_sklearn, label='Sklearn Train Accuracy')
    plt.plot(depths_sklearn, acc_test_sklearn, label='Sklearn Test Accuracy')

    # depths_custom, acc_train_custom, acc_test_custom = evaluate_model_depth(MyDecisionTree, X_train, y_train)
    # plt.plot(depths_custom, acc_train_custom, label='Custom Train Accuracy')
    # plt.plot(depths_custom, acc_test_custom, label='Custom Test Accuracy')

    plt.xlabel('Tree Depth')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.title('Accuracy vs. Tree Depth')
    plt.show()

def plot_forest_accuracy_vs_num_trees(X_train, y_train, X_test, y_test):
    n_trees_list = range(1, 101, 10)
    acc_train_sklearn = []
    acc_test_sklearn = []

    for n_trees in n_trees_list:
        rf_clf = RandomForestClassifier(n_estimators=n_trees)
        rf_clf.fit(X_train, y_train)

        train_pred = rf_clf.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_pred)
        acc_train_sklearn.append(train_accuracy)

        test_pred = rf_clf.predict(X_test)
        test_accuracy = accuracy_score(y_test, test_pred)
        acc_test_sklearn.append(test_accuracy)

    acc_train_custom = []
    acc_test_custom = []

    for n_trees in n_trees_list:
        rf_custom = RandomForest(n_trees=n_trees)
        rf_custom.fit(X_train, y_train)

        train_pred_custom = rf_custom.predict(X_train)
        train_accuracy_custom = accuracy_score(y_train, train_pred_custom)
        acc_train_custom.append(train_accuracy_custom)

        test_pred_custom = rf_custom.predict(X_test)
        test_accuracy_custom = accuracy_score(y_test, test_pred_custom)
        acc_test_custom.append(test_accuracy_custom)

    plt.figure(figsize=(12, 6))

    plt.plot(n_trees_list, acc_train_sklearn, label='Sklearn RF Train Accuracy', marker='o')
    plt.plot(n_trees_list, acc_test_sklearn, label='Sklearn RF Test Accuracy', marker='o')

    plt.plot(n_trees_list, acc_train_custom, label='Custom RF Train Accuracy', linestyle='--', marker='x')
    plt.plot(n_trees_list, acc_test_custom, label='Custom RF Test Accuracy', linestyle='--', marker='x')

    plt.xlabel('Number of Trees')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs. Number of Trees in Random Forest')
    plt.legend()
    plt.grid(True)
    plt.show()