In [6]:
import joblib
import pandas as pd
import geopandas as gpd
from pyproj import Geod
import pickle
import folium
from tqdm.auto import tqdm

def calculate_area_in_square_meters(geometry):
    """Return the unsigned geodesic area of ``geometry`` on the WGS84 ellipsoid.

    ``Geod.geometry_area_perimeter`` yields an ``(area, perimeter)`` pair; only
    the area is kept, and its sign is discarded via ``abs``.
    """
    wgs84 = Geod(ellps="WGS84")
    signed_area, _perimeter = wgs84.geometry_area_perimeter(geometry)
    return abs(signed_area)
def states_gdf_from_geojson(file_path = './data/geojsons/sudan_states.geojson'):
    """Load state polygons from a GeoJSON file.

    The layer is reprojected to EPSG:4326, an ``Area_M2`` column is computed
    per geometry with ``calculate_area_in_square_meters``, and the
    ``admin1RefN`` column is exposed as ``State``.

    Returns:
        GeoDataFrame restricted to the columns ``State``, ``Area_M2`` and
        ``geometry``.
    """
    states = gpd.read_file(file_path).to_crs(crs='EPSG:4326')
    states['Area_M2'] = states['geometry'].apply(calculate_area_in_square_meters)
    states = states.rename(columns={'admin1RefN': 'State'})
    return states[['State', 'Area_M2', 'geometry']]
def seq_inside(gdf_sq):
    """Flag which geometries of ``gdf_sq`` lie within the Gaziera state polygon.

    The previous implementation sliced ``gdf_sq`` one row at a time, reset the
    index of the slice in place and called the index-aligned ``within`` against
    the whole states frame. Because the slice always had index 0, each square
    was effectively tested against the *first* state geometry only. This
    version performs that same test explicitly and in a single vectorized call.

    Args:
        gdf_sq: GeoDataFrame whose active geometry column holds the squares.

    Returns:
        list[int]: one entry per row of ``gdf_sq``; 1 when the geometry is
        within the first state polygon, otherwise 0.
    """
    gdf = states_gdf_from_geojson(file_path='./data/geojsons/sudan_states_gaziera.geojson')
    if len(gdf) == 0:
        # No state polygon to test against: nothing can be inside.
        return [0] * len(gdf_sq)
    state_geometry = gdf.geometry.iloc[0]
    inside = gdf_sq.geometry.within(state_geometry)
    return inside.astype(int).tolist()
def add_state_to_map(m):
    """Overlay the Gaziera state layer on the existing map ``m`` and return ``m``."""
    gaziera_states = states_gdf_from_geojson(
        file_path='./data/geojsons/sudan_states_gaziera.geojson'
    )
    gaziera_states.explore(m=m)
    return m
def get_squares_2000_gdf(n_samples=1000, random_state=None):
    """Return a random sample of the ``squares_2000*`` geometries inside Gaziera.

    Loads ``data/joblibs/labels.joblib``, keeps rows whose ``State`` value
    starts with ``'squares_2000'``, flags each with ``seq_inside`` (column
    ``inside``), keeps the rows flagged 1 and samples from them.

    Args:
        n_samples: Number of squares to draw. Clamped to the number of squares
            available, so a small input no longer raises from ``sample``.
        random_state: Forwarded to ``DataFrame.sample``. The default ``None``
            keeps the original unseeded behaviour; pass an int to make the
            sample reproducible across kernel restarts.

    Returns:
        GeoDataFrame with a fresh 0..n-1 index.
    """
    labels_gdf = joblib.load('data/joblibs/labels.joblib')
    # na=False: rows whose State is missing / non-string are simply not selected.
    is_wanted = labels_gdf['State'].str.startswith('squares_2000', na=False)
    squares = labels_gdf[is_wanted].reset_index(drop=True)
    squares['inside'] = seq_inside(squares)
    squares = squares[squares['inside'] == 1].reset_index(drop=True)
    sample_size = min(n_samples, len(squares))
    sampled = squares.sample(sample_size, random_state=random_state)
    return sampled.reset_index(drop=True)
def read_points_81_gdf():
    """Load the pickled training table and wrap it as a GeoDataFrame.

    The geometry is taken from the first column whose name contains
    ``'geometry'``.
    """
    gdf_path = './data/training_data/gaizera_square_45X45_10_2023/training_data_ALL.pkl'
    # pickle executes arbitrary code on load: only use with trusted files.
    with open(gdf_path, 'rb') as handle:
        table = pickle.load(handle)
    geometry_like = [name for name in table.columns if 'geometry' in name]
    return gpd.GeoDataFrame(table, geometry=table[geometry_like[0]])

def add_bounds_to_map(m, gdf):
    """Draw the bounding box of ``gdf`` on ``m`` as an unfilled red rectangle.

    Returns the same map object so calls can be chained.
    """
    # total_bounds is unpacked positionally; the corners are passed to folium
    # with the 2nd/4th values first and the 1st/3rd values second.
    first, second, third, fourth = gdf.total_bounds
    corners = [(second, first), (fourth, third)]
    folium.Rectangle(bounds=corners, color='red', fill=False).add_to(m)
    return m

def points_in_square(squares_gdf, points_gdf):
    """Return a 0/1 flag per point telling whether any square contains it.

    Each geometry in ``points_gdf['geometry']`` is tested against every row of
    ``squares_gdf`` via ``contains``; progress is reported through ``tqdm``.
    """
    return [
        1 if squares_gdf.contains(pt).any() else 0
        for pt in tqdm(points_gdf['geometry'].values)
    ]


# Build the sampled squares layer and show it on an interactive map,
# coloured by the 'inside' flag.
gdf = get_squares_2000_gdf()
m=gdf.explore(column='inside')
# m = add_state_to_map(m)
# Load the training points and outline their overall extent in red.
points_81_gdf = read_points_81_gdf()
m = add_bounds_to_map(m, points_81_gdf)
# NOTE(review): `points_81_gdf_in_square` is not defined in this cell or above it;
# it is only assigned in a later cell of this notebook. On a fresh kernel this
# line raises NameError -- the cell relies on out-of-order execution.
points_81_gdf_in_square.explore(m=m)

In [3]:
# Flag every training point with 1 when some sampled square contains it, else 0,
# then tabulate how many points fall in each group.
points_81_gdf['in_square'] = points_in_square(gdf, points_81_gdf)
points_81_gdf['in_square'].value_counts()

  0%|          | 0/4949880 [00:00<?, ?it/s]

0    4945630
1       4250
Name: in_square, dtype: int64

In [4]:
# Keep only the points flagged as inside a square, re-indexed from 0.
points_81_gdf_in_square = (
    points_81_gdf.loc[points_81_gdf['in_square'] == 1]
    .reset_index(drop=True)
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from data_processor import read_all_processed_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.utils import resample
import joblib

# Load the processed feature table.
file_path = './data/joblibs/processed_data.joblib'
processed_data_gdf = joblib.load(file_path)

# Discard rows whose label is -1.
processed_data_gdf = processed_data_gdf.loc[processed_data_gdf['Labels'] != -1]

# Label distribution (not displayed: this is not the last expression of the cell).
processed_data_gdf['Labels'].value_counts()

# One frame per class; class 0 is treated as the minority that sets the target size.
processed_data_majority2, processed_data_majority1, processed_data_minority0 = (
    processed_data_gdf[processed_data_gdf['Labels'] == label] for label in (2, 1, 0)
)

class_sample_size = len(processed_data_minority0)

# Shrink classes 2 and 1 to the size of class 0: sampling without replacement,
# with a fixed seed so the selection is repeatable.
processed_data_majority_downsampled2 = resample(
    processed_data_majority2,
    replace=False,
    n_samples=class_sample_size,
    random_state=123,
)

processed_data_majority_downsampled1 = resample(
    processed_data_majority1,
    replace=False,
    n_samples=class_sample_size,
    random_state=123,
)

# Stack the balanced classes in the order 2, 1, 0.
processed_data_downsampled = pd.concat(
    [
        processed_data_majority_downsampled2,
        processed_data_majority_downsampled1,
        processed_data_minority0,
    ]
)

processed_data_downsampled['Labels'].value_counts()


In [None]:
def train_and_predict(df, use_october=False, test_size=0.25, split_random_state=0):
    """Train a random forest on ``df`` and print hold-out metrics.

    Args:
        df: Table holding the feature columns plus ``Labels``, ``geometry``,
            ``latitude``, ``longitude`` and ``location``; the latter five are
            excluded from the feature matrix.
        use_october: When False, every column whose name contains ``'-10-'``
            is dropped before training.
        test_size: Fraction of rows held out for evaluation (previously
            hard-coded to 0.25).
        split_random_state: Seed for the train/test split. The split used to be
            unseeded, so the printed metrics changed on every run even though
            the classifier itself was seeded. Pass ``None`` to restore the
            unseeded split.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    if not use_october:
        october_columns = [col for col in df.columns if '-10-' in col]
        df = df.drop(october_columns, axis=1)
    y = df['Labels']
    X = df.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=split_random_state
    )
    clf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0, verbose=2)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy score: ', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return clf



In [None]:
# Train one model per feature set; the separator keeps the two reports apart.
separator = '----------------------------------------------'

print('Without October')
clf_without_october = train_and_predict(processed_data_downsampled, use_october=False)
print('', separator, separator, '', sep='\n')
print('With October')
clf_with_october = train_and_predict(processed_data_downsampled, use_october=True)

In [None]:
# Persist both fitted classifiers. `model_path` is reused, so after this cell it
# holds the path of the with-October model.
model_path = './data/joblibs/rf_model_without_october.joblib'
joblib.dump(clf_without_october, model_path)
model_path = './data/joblibs/rf_model_with_october.joblib'
joblib.dump(clf_with_october, model_path)

In [None]:
#Feature importance
def feature_importance_plot(clf, df, use_october=False):
    """Bar-plot the importances of ``clf`` relative to its strongest feature.

    Feature names are taken from ``df`` after removing the non-feature columns
    (and, unless ``use_october`` is set, every column containing ``'-10-'``).
    Bars are ordered from most to least important and scaled so that the
    largest importance equals 100.
    """
    if not use_october:
        df = df.drop([name for name in df.columns if '-10-' in name], axis=1)
    feature_names = df.drop(
        ['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1
    ).columns

    raw_importance = clf.feature_importances_
    relative_importance = 100.0 * (raw_importance / raw_importance.max())

    # Descending order of importance.
    order = np.argsort(relative_importance)[::-1]

    # Offset by half a unit so each label sits under the middle of its bar.
    bar_positions = np.arange(order.shape[0]) + 0.5

    plt.figure(figsize=(12, 6))
    plt.bar(bar_positions, relative_importance[order], align='center')
    plt.xticks(bar_positions, feature_names[order], rotation=90)
    plt.ylabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

In [None]:
# Importances of the model trained without the October ('-10-') columns.
feature_importance_plot(clf_without_october, processed_data_downsampled, use_october=False)

In [None]:
# Importances of the model trained with the October ('-10-') columns included.
feature_importance_plot(clf_with_october, processed_data_downsampled, use_october=True)

In [None]:
import joblib
import geopandas as gpd
import matplotlib.pyplot as plt

def remove_october(df):
    """Return a copy of ``df`` without the columns whose name contains ``'-10-'``.

    The input frame is left untouched.
    """
    return df.drop(columns=[name for name in df.columns if '-10-' in name])


def match_cols_from_src_to_dest_dfs(src_df, dst_df):
    """Map destination column names onto the source names with the same band and month.

    Feature columns are expected to look like ``<band>_<year>-<month>-...``.
    Two columns match when their ``<band>`` and ``<month>`` parts are equal;
    the year is ignored. When several destination columns share a band/month,
    the first one in column order is used.

    Columns that do not follow the pattern (e.g. ``location`` or ``Labels``)
    are skipped on both sides. Previously such a destination column raised
    ``IndexError`` as soon as the scan reached it.

    Args:
        src_df: Frame whose column names are the rename targets.
        dst_df: Frame whose columns are to be renamed.

    Returns:
        dict: ``{dst_column: src_column}``, suitable for
        ``dst_df.rename(columns=...)``.

    Raises:
        ValueError: if a source feature column has no destination column with
            the same band and month.
    """
    def parse_band_month(col):
        # Returns (band, month) or None when `col` does not follow the pattern.
        if not isinstance(col, str):
            return None
        name_parts = col.split('_')
        if len(name_parts) < 2:
            return None
        date_parts = name_parts[1].split('-')
        if len(date_parts) < 2:
            return None
        return name_parts[0], date_parts[1]

    # Index the destination once instead of rescanning it for every source column.
    dst_by_key = {}
    for dst_col in dst_df.columns:
        key = parse_band_month(dst_col)
        if key is not None:
            dst_by_key.setdefault(key, dst_col)  # first match wins

    mapping_dict = {}
    for src_col in src_df.columns:
        if src_col in ['location', 'Labels', 'latitude', 'longitude', 'geometry']:
            continue
        key = parse_band_month(src_col)
        if key is None:
            continue
        if key not in dst_by_key:
            raise ValueError(
                f'No destination column matches band {key[0]!r} and month {key[1]!r} '
                f'(source column {src_col!r})'
            )
        mapping_dict[dst_by_key[key]] = src_col
    return mapping_dict



def preds_numb_to_str(preds):
    """Translate numeric class predictions into their display names.

    Mapping: 0 -> 'Uncultivated', 1 -> 'Cultivated', 2 -> 'Other'.

    The result always has exactly one entry per input value, so it can be
    assigned back as a DataFrame column. Unknown codes used to be dropped
    silently, which shortened the list and misaligned it with the rows.

    Args:
        preds: Iterable of class codes.

    Returns:
        list[str]: class names in input order.

    Raises:
        ValueError: if a value is not one of 0, 1 or 2.
    """
    class_names = {0: 'Uncultivated', 1: 'Cultivated', 2: 'Other'}
    preds_str = []
    for pred in preds:
        try:
            preds_str.append(class_names[pred])
        except (KeyError, TypeError):
            raise ValueError(f'Unknown prediction code: {pred!r}') from None
    return preds_str



# Load the processed feature tables for the three seasons.
file_path_2021 = './data/joblibs/processed_data.joblib'
file_path_2022 = './data/joblibs/processed_data_Gaziera_2022.joblib'
file_path_2023 = './data/joblibs/processed_data_gaziera_2023.joblib'
processed_data_gdf_2021 =joblib.load(file_path_2021)
processed_data_gdf_2022 =joblib.load(file_path_2022)
processed_data_gdf_2023 =joblib.load(file_path_2023)

# The 2021 table is restricted to rows whose location is 'gaziera'.
processed_data_gdf_2021 = processed_data_gdf_2021[processed_data_gdf_2021['location'] == 'gaziera']


# Drop the October ('-10-') columns from every year.
processed_data_gdf_2021 = remove_october(processed_data_gdf_2021)
processed_data_gdf_2022 = remove_october(processed_data_gdf_2022)
processed_data_gdf_2023 = remove_october(processed_data_gdf_2023)

# Rename the 2022/2023 feature columns to the 2021 names that share band and month,
# so all three tables expose the same feature names.
mapping_dict_2022 = match_cols_from_src_to_dest_dfs(processed_data_gdf_2021, processed_data_gdf_2022)
mapping_dict_2023 = match_cols_from_src_to_dest_dfs(processed_data_gdf_2021, processed_data_gdf_2023)
processed_data_gdf_2022 = processed_data_gdf_2022.rename(columns=mapping_dict_2022)
processed_data_gdf_2023 = processed_data_gdf_2023.rename(columns=mapping_dict_2023)


# Put all three tables in the same CRS.
processed_data_gdf_2021 = processed_data_gdf_2021.to_crs('EPSG:4326')
processed_data_gdf_2022 = processed_data_gdf_2022.to_crs('EPSG:4326')
processed_data_gdf_2023 = processed_data_gdf_2023.to_crs('EPSG:4326')


# Sanity checks: table shapes, distinct locations and total number of missing values.
print(f'2021 shape: {processed_data_gdf_2021.shape}')
print(f'2022 shape: {processed_data_gdf_2022.shape}')
print(f'2023 shape: {processed_data_gdf_2023.shape}')

print(f'2021 location: {processed_data_gdf_2021.location.unique()}')
print(f'2022 location: {processed_data_gdf_2022.location.unique()}')
print(f'2023 location: {processed_data_gdf_2023.location.unique()}')

print(f'2021 NaNs: {processed_data_gdf_2021.isna().sum().sum()}')
print(f'2022 NaNs: {processed_data_gdf_2022.isna().sum().sum()}')
print(f'2023 NaNs: {processed_data_gdf_2023.isna().sum().sum()}')


In [None]:
def normalize_data(df):
    '''
    Standardize every float column of ``df`` to zero mean and unit standard deviation.

    Non-float columns are left as they are and the input frame is not modified.

    A column whose standard deviation is zero (constant values) or undefined
    (fewer than two values) is only centered. Dividing by such a deviation used
    to turn the whole column into NaN.

    Returns:
        A normalized copy of ``df``.
    '''
    df = df.copy()
    float_columns = df.select_dtypes(include=['float']).columns
    for col in float_columns:
        values = df[col]
        std = values.std()
        # `not std > 0` is True for both 0 and NaN.
        if not std > 0:
            std = 1.0
        df[col] = (values - values.mean()) / std
    return df

# Each year is normalized independently, i.e. with its own column means and stds.
processed_data_gdf_2021_normalized = normalize_data(processed_data_gdf_2021)
processed_data_gdf_2022_normalized = normalize_data(processed_data_gdf_2022)
processed_data_gdf_2023_normalized = normalize_data(processed_data_gdf_2023)

In [None]:
# Rebuild the file name of a previously saved model from its hyper-parameters and load it.
# NOTE(review): the name ends in '_normalized', so the model was presumably trained on
# normalized features -- confirm; the first half of this cell feeds it raw features.
model_name = 'RandomForestClassifier'
n_estimators = 200
max_depth = 20
test_size = 0.60
use_october = False
model_name = f'{model_name}_n_estimators_{n_estimators}_max_depth_{max_depth}_test_size_{test_size}_use_october_{use_october}_normalized'
model_path = f'./data/joblibs/{model_name}.joblib'
clf = joblib.load(model_path)

# ---- Predictions on the raw (un-normalized) tables ----
# NOTE(review): this cell is not idempotent. It adds 'Predicted' / 'Predicted_str' to the
# same frames it builds X from, and only the five metadata columns are dropped, so a second
# run passes those extra columns to clf.predict (which will most likely reject them).
X_full_2021 = processed_data_gdf_2021.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
y_full_2021 = processed_data_gdf_2021['Labels']
y_pred_full_2021 = clf.predict(X_full_2021)

X_full_2022 = processed_data_gdf_2022.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
y_full_2022 = processed_data_gdf_2022['Labels']
y_pred_full_2022 = clf.predict(X_full_2022)

X_full_2023 = processed_data_gdf_2023.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
y_full_2023 = processed_data_gdf_2023['Labels']
y_pred_full_2023 = clf.predict(X_full_2023)

# Store the numeric predictions next to the features.
processed_data_gdf_2021['Predicted'] = y_pred_full_2021
processed_data_gdf_2022['Predicted'] = y_pred_full_2022
processed_data_gdf_2023['Predicted'] = y_pred_full_2023


preds_2021 = processed_data_gdf_2021['Predicted']
preds_2022 = processed_data_gdf_2022['Predicted']
preds_2023 = processed_data_gdf_2023['Predicted']

# Human-readable class names for the predictions.
preds_2021_str = preds_numb_to_str(preds_2021)
preds_2022_str = preds_numb_to_str(preds_2022)
preds_2023_str = preds_numb_to_str(preds_2023)

processed_data_gdf_2021['Predicted_str'] = preds_2021_str
processed_data_gdf_2022['Predicted_str'] = preds_2022_str
processed_data_gdf_2023['Predicted_str'] = preds_2023_str



# Re-wrap as GeoDataFrames with 'geometry' as the active geometry column.
processed_data_gdf_2021 = gpd.GeoDataFrame(processed_data_gdf_2021, geometry='geometry')
processed_data_gdf_2022 = gpd.GeoDataFrame(processed_data_gdf_2022, geometry='geometry')
processed_data_gdf_2023 = gpd.GeoDataFrame(processed_data_gdf_2023, geometry='geometry')



##################################################Normalized##################################################

# Same steps on the normalized tables. X_full_*, y_full_* and y_pred_full_* are reused,
# so from here on they refer to the normalized data.
X_full_2021 = processed_data_gdf_2021_normalized.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
y_full_2021 = processed_data_gdf_2021_normalized['Labels']
y_pred_full_2021 = clf.predict(X_full_2021)

X_full_2022 = processed_data_gdf_2022_normalized.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
y_full_2022 = processed_data_gdf_2022_normalized['Labels']
y_pred_full_2022 = clf.predict(X_full_2022)

X_full_2023 = processed_data_gdf_2023_normalized.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
y_full_2023 = processed_data_gdf_2023_normalized['Labels']
y_pred_full_2023 = clf.predict(X_full_2023)

processed_data_gdf_2021_normalized['Predicted'] = y_pred_full_2021
processed_data_gdf_2022_normalized['Predicted'] = y_pred_full_2022
processed_data_gdf_2023_normalized['Predicted'] = y_pred_full_2023

preds_2021_str_normalized = preds_numb_to_str(y_pred_full_2021)
preds_2022_str_normalized = preds_numb_to_str(y_pred_full_2022)
preds_2023_str_normalized = preds_numb_to_str(y_pred_full_2023)

processed_data_gdf_2021_normalized['Predicted_str'] = preds_2021_str_normalized
processed_data_gdf_2022_normalized['Predicted_str'] = preds_2022_str_normalized
processed_data_gdf_2023_normalized['Predicted_str'] = preds_2023_str_normalized

processed_data_gdf_2021_normalized = gpd.GeoDataFrame(processed_data_gdf_2021_normalized, geometry='geometry')
processed_data_gdf_2022_normalized = gpd.GeoDataFrame(processed_data_gdf_2022_normalized, geometry='geometry')
processed_data_gdf_2023_normalized = gpd.GeoDataFrame(processed_data_gdf_2023_normalized, geometry='geometry')



In [None]:
# Display the loaded estimator.
clf

In [None]:
import numpy as np
def feature_importance_plot(clf, df, use_october=False):
    """Return the (up to) ten largest relative importances of ``clf`` with their names.

    Despite its name this version draws nothing. Importances are scaled so the
    largest equals 100, and both returned sequences are ordered from least to
    most important. Names come from ``df`` after removing the non-feature
    columns (and, unless ``use_october`` is set, the ``'-10-'`` columns).

    Returns:
        tuple: ``(importances, feature_names)``.
    """
    if not use_october:
        df = df.drop([name for name in df.columns if '-10-' in name], axis=1)
    feature_names = df.drop(
        ['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1
    ).columns
    raw_importance = clf.feature_importances_
    relative_importance = 100.0 * (raw_importance / raw_importance.max())
    # argsort is ascending, so the tail holds the strongest features.
    top_idx = np.argsort(relative_importance)[-10:]
    return relative_importance[top_idx], feature_names[top_idx]

# All three calls read the same `clf.feature_importances_`, so the importance arrays are
# identical; only the feature names are looked up in each year's own column list.
feature_importance_2021, feature_names_2021 = feature_importance_plot(clf, processed_data_gdf_2021, use_october=False)
feature_importance_2022, feature_names_2022 = feature_importance_plot(clf, processed_data_gdf_2022, use_october=False)
feature_importance_2023, feature_names_2023 = feature_importance_plot(clf, processed_data_gdf_2023, use_october=False)

In [None]:
# Compare, per feature, the value distribution of the three years: first on the raw
# values, then on the per-year normalized values.
# NOTE(review): feature_names_* are ordered from least to most important (last ten of an
# ascending argsort), so with ten or more features indices 0-4 are the five *weakest* of
# the top ten, not the five strongest -- confirm this is intended.
for i in range(5):
    print('--')
    #plot coulmn values distribution
    fig, ax = plt.subplots(1, 1, figsize=(10, 2))
    processed_data_gdf_2021[feature_names_2021[i]].hist(ax=ax, bins=100)
    processed_data_gdf_2022[feature_names_2022[i]].hist(ax=ax, bins=100)
    processed_data_gdf_2023[feature_names_2023[i]].hist(ax=ax, bins=100)
    # The title always uses the 2021 name, even though each year is indexed by its own list.
    ax.set_title(feature_names_2021[i])
    ax.legend(['2021', '2022', '2023'])
    plt.show()


    #plot coulmn values distribution after normalization by mean and std
    fig, ax = plt.subplots(1, 1, figsize=(10, 2))
    processed_data_gdf_2021_normalized[feature_names_2021[i]].hist(ax=ax, bins=100)
    processed_data_gdf_2022_normalized[feature_names_2022[i]].hist(ax=ax, bins=100)
    processed_data_gdf_2023_normalized[feature_names_2023[i]].hist(ax=ax, bins=100)
    ax.set_title(feature_names_2021[i]+' normalized')
    ax.legend(['2021', '2022', '2023'])
    plt.show()

In [None]:
# Number of rows per predicted class, per year, for the raw-feature predictions.
# (Despite the `labels_` prefix these are counts of 'Predicted_str', not of 'Labels'.)
labels_values_counts_2021 = processed_data_gdf_2021['Predicted_str'].value_counts()
labels_values_counts_2022 = processed_data_gdf_2022['Predicted_str'].value_counts()
labels_values_counts_2023 = processed_data_gdf_2023['Predicted_str'].value_counts()

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
ax[0].bar(labels_values_counts_2021.index, labels_values_counts_2021.values)
ax[0].set_title('2021')
ax[1].bar(labels_values_counts_2022.index, labels_values_counts_2022.values)
ax[1].set_title('2022')
ax[2].bar(labels_values_counts_2023.index, labels_values_counts_2023.values)
ax[2].set_title('2023')

# Shared fixed y-range so the three panels are directly comparable; counts above
# 500000 would be clipped.
for i in range(3):
    ax[i].set_xlabel('Class')
    ax[i].set_ylabel('Count')
    ax[i].set_ylim([0, 500000])
plt.show()


print('Normalized')

# Same chart for the predictions made on the normalized features.
labels_values_counts_2021_normalized = processed_data_gdf_2021_normalized['Predicted_str'].value_counts()
labels_values_counts_2022_normalized = processed_data_gdf_2022_normalized['Predicted_str'].value_counts()
labels_values_counts_2023_normalized = processed_data_gdf_2023_normalized['Predicted_str'].value_counts()

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
ax[0].bar(labels_values_counts_2021_normalized.index, labels_values_counts_2021_normalized.values)
ax[0].set_title('2021')
ax[1].bar(labels_values_counts_2022_normalized.index, labels_values_counts_2022_normalized.values)
ax[1].set_title('2022')
ax[2].bar(labels_values_counts_2023_normalized.index, labels_values_counts_2023_normalized.values)
ax[2].set_title('2023')

for i in range(3):
    ax[i].set_xlabel('Class')
    ax[i].set_ylabel('Count')
    ax[i].set_ylim([0, 500000])
plt.show()



In [None]:
def _class_share(value_counts, label):
    """Fraction of the rows counted in ``value_counts`` that carry ``label``.

    A label that does not occur counts as 0 instead of raising ``KeyError``.
    """
    return value_counts.get(label, 0) / value_counts.sum()


# Share of each predicted class per year, from the normalized-feature predictions.
precentage_uncultivated_2021 = _class_share(labels_values_counts_2021_normalized, 'Uncultivated')
precentage_uncultivated_2022 = _class_share(labels_values_counts_2022_normalized, 'Uncultivated')
precentage_uncultivated_2023 = _class_share(labels_values_counts_2023_normalized, 'Uncultivated')

precentage_cultivated_2021 = _class_share(labels_values_counts_2021_normalized, 'Cultivated')
precentage_cultivated_2022 = _class_share(labels_values_counts_2022_normalized, 'Cultivated')
precentage_cultivated_2023 = _class_share(labels_values_counts_2023_normalized, 'Cultivated')

# Each year now uses its own counts. The 2022 and 2023 values used to take the 2021
# 'Other' count as numerator and the raw-feature counts as denominator.
precentage_other_2021 = _class_share(labels_values_counts_2021_normalized, 'Other')
precentage_other_2022 = _class_share(labels_values_counts_2022_normalized, 'Other')
precentage_other_2023 = _class_share(labels_values_counts_2023_normalized, 'Other')

print(f'precentage_cultivated_2021: {precentage_cultivated_2021*100 :.2f}%')
print(f'precentage_uncultivated_2021: {precentage_uncultivated_2021*100 :.2f}%')
print(f'precentage_other_2021: {precentage_other_2021*100 :.2f}%')
print(f'precentage_cultivated_2022: {precentage_cultivated_2022*100 :.2f}%')
print(f'precentage_uncultivated_2022: {precentage_uncultivated_2022*100 :.2f}%')
print(f'precentage_other_2022: {precentage_other_2022*100 :.2f}%')
print(f'precentage_cultivated_2023: {precentage_cultivated_2023*100 :.2f}%')
print(f'precentage_uncultivated_2023: {precentage_uncultivated_2023*100 :.2f}%')
print(f'precentage_other_2023: {precentage_other_2023*100 :.2f}%')

In [None]:
from matplotlib.colors import LinearSegmentedColormap
# Three-colour map (green, blue, red) used for the categorical class plots.
# NOTE(review): which class gets which colour depends on how the plotting call orders the
# categories (presumably sorted by name) -- confirm, especially if a class is absent.
colors = [(0, 1, 0), (0, 0, 1), (1, 0, 0)]  # G -> B -> R
cmap_name = 'my_list'
cmap = LinearSegmentedColormap.from_list(cmap_name, colors, N=3)


print('Normalized')
# sample(frac=1) keeps every row but in random order (unseeded, so the order differs
# between runs); presumably done so no class is systematically drawn on top.
processed_data_gdf_2021_normalized_sample = processed_data_gdf_2021_normalized.sample(frac=1)
processed_data_gdf_2022_normalized_sample = processed_data_gdf_2022_normalized.sample(frac=1)
processed_data_gdf_2023_normalized_sample = processed_data_gdf_2023_normalized.sample(frac=1)


# One map per year, coloured by predicted class.
fig, axs = plt.subplots(3, 1, figsize=(10, 25))
processed_data_gdf_2021_normalized_sample.plot(ax=axs[0], column='Predicted_str', legend=True, markersize=0.5, categorical=True, cmap=cmap)
processed_data_gdf_2022_normalized_sample.plot(ax=axs[1], column='Predicted_str', legend=True, markersize=0.5, categorical=True, cmap=cmap)
processed_data_gdf_2023_normalized_sample.plot(ax=axs[2], column='Predicted_str', legend=True, markersize=0.5, categorical=True, cmap=cmap)

# Titles report the class shares computed in an earlier cell (precentage_* variables).
axs[0].set_title(f'2021: cultivated={precentage_cultivated_2021*100 :.2f}%, uncultivated={precentage_uncultivated_2021*100 :.2f}%, other={precentage_other_2021*100 :.2f}%')
axs[1].set_title(f'2022: cultivated={precentage_cultivated_2022*100 :.2f}%, uncultivated={precentage_uncultivated_2022*100 :.2f}%, other={precentage_other_2022*100 :.2f}%')
axs[2].set_title(f'2023: cultivated={precentage_cultivated_2023*100 :.2f}%, uncultivated={precentage_uncultivated_2023*100 :.2f}%, other={precentage_other_2023*100 :.2f}%')

for i in range(3):
    axs[i].set_xlabel('Longitude')
    axs[i].set_ylabel('Latitude')

# Save before show so the written file contains the rendered figure.
fig_name = 'all_years_predictions_normalized.png'
plt.savefig(fig_name, dpi=300)
plt.show()

In [None]:
def labels_numb_to_str(preds):
    """Translate numeric ground-truth labels into their display names.

    Mapping: 0 -> 'Uncultivated', 1 -> 'Cultivated', -1 -> 'Other'.

    The result always has exactly one entry per input value, so it can be
    assigned back as a DataFrame column. Codes outside the mapping used to be
    dropped silently, which shortened the list and misaligned it with the rows.

    Args:
        preds: Iterable of label codes.

    Returns:
        list[str]: label names in input order.

    Raises:
        ValueError: if a value is not one of 0, 1 or -1.
    """
    label_names = {0: 'Uncultivated', 1: 'Cultivated', -1: 'Other'}
    preds_str = []
    for pred in preds:
        try:
            preds_str.append(label_names[pred])
        except (KeyError, TypeError):
            raise ValueError(f'Unknown label code: {pred!r}') from None
    return preds_str

# Map of the 2021 ground-truth labels, with the class shares in the title.
# NOTE(review): this cell reuses `labels_values_counts_2021_normalized` and the
# `precentage_*_2021` names, which an earlier cell fills with *prediction* statistics;
# after this cell they hold *label* statistics.
y_labels_full_2021 = processed_data_gdf_2021_normalized['Labels']
labels_2021_str_normalized = labels_numb_to_str(y_labels_full_2021)
processed_data_gdf_2021_normalized['Labels_str'] = labels_2021_str_normalized
labels_values_counts_2021_normalized = processed_data_gdf_2021_normalized['Labels_str'].value_counts()

# All three shares use the same denominator (the label counts themselves); 'Other' used to
# be divided by the sum of the raw-feature prediction counts. `.get` keeps an absent class
# at 0 instead of raising KeyError.
labels_total_2021 = labels_values_counts_2021_normalized.sum()
precentage_uncultivated_2021 = labels_values_counts_2021_normalized.get('Uncultivated', 0) / labels_total_2021
precentage_cultivated_2021 = labels_values_counts_2021_normalized.get('Cultivated', 0) / labels_total_2021
precentage_other_2021 = labels_values_counts_2021_normalized.get('Other', 0) / labels_total_2021

# Shuffle the rows so the drawing order is random.
processed_data_gdf_2021_normalized_sample = processed_data_gdf_2021_normalized.sample(frac=1)
fig, ax = plt.subplots(figsize=(10, 10))
processed_data_gdf_2021_normalized_sample.plot(ax=ax, column='Labels_str', legend=True, markersize=0.5, categorical=True, cmap=cmap)
# Single title: previously the percentage title was set and then immediately replaced by
# plt.title('2021 labels'), so the shares never appeared on the figure.
ax.set_title(f'2021 labels: cultivated={precentage_cultivated_2021*100 :.2f}%, uncultivated={precentage_uncultivated_2021*100 :.2f}%, other={precentage_other_2021*100 :.2f}%')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
fig_name = '2021_labels.png'
plt.savefig(fig_name, dpi=300)
plt.show()