In [None]:
from sklearn.ensemble import RandomForestClassifier
from data_processor import read_all_processed_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.utils import resample
import joblib

# Read data
# NOTE: joblib.load unpickles the file, so only point it at artifacts produced by this project.
file_path = './data/joblibs/processed_data.joblib'
processed_data_gdf = joblib.load(file_path)


# Remove rows with labels == -1
processed_data_gdf = processed_data_gdf[processed_data_gdf['Labels'] != -1]

# Show the distribution of labels.
# A bare expression is only rendered when it is the last statement of a cell,
# so it is printed explicitly here.
print(processed_data_gdf['Labels'].value_counts())

processed_data_majority2 = processed_data_gdf[processed_data_gdf.Labels == 2]
processed_data_majority1 = processed_data_gdf[processed_data_gdf.Labels == 1]
processed_data_minority0 = processed_data_gdf[processed_data_gdf.Labels == 0]

# Balance on whichever class is actually the smallest. Sampling without
# replacement raises when more rows are requested than a class holds, so the
# target size must not be taken from class 0 unconditionally.
class_sample_size = min(len(processed_data_majority2),
                        len(processed_data_majority1),
                        len(processed_data_minority0))
if class_sample_size == 0:
    raise ValueError('At least one of the classes 0, 1, 2 has no rows; cannot balance the data set.')

# Downsample the larger classes
processed_data_majority_downsampled2 = resample(processed_data_majority2,
                                                replace=False,                  # sample without replacement
                                                n_samples=class_sample_size,    # to match the smallest class
                                                random_state=123)               # reproducible results

processed_data_majority_downsampled1 = resample(processed_data_majority1,
                                                replace=False,                  # sample without replacement
                                                n_samples=class_sample_size,    # to match the smallest class
                                                random_state=123)               # reproducible results

# Class 0 is left untouched when it already is the smallest class; it is only
# downsampled when one of the other classes turned out to be smaller.
if len(processed_data_minority0) > class_sample_size:
    processed_data_minority0 = resample(processed_data_minority0,
                                        replace=False,
                                        n_samples=class_sample_size,
                                        random_state=123)

# Combine the three equally sized classes
processed_data_downsampled = pd.concat([processed_data_majority_downsampled2,
                                        processed_data_majority_downsampled1,
                                        processed_data_minority0])

processed_data_downsampled['Labels'].value_counts()


In [None]:
def train_and_predict(df, use_october=False, test_size=0.25, random_state=0):
    """Fit a random forest on ``df`` and print its hold-out metrics.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'Labels' column (the target) and the columns
        'geometry', 'latitude', 'longitude' and 'location', which are excluded
        from the features. Every remaining column is used as a feature.
    use_october : bool, default False
        When False, every column whose name contains '-10-' is dropped before
        training.
    test_size : float, default 0.25
        Share of the rows held out for evaluation.
    random_state : int or None, default 0
        Seed for the train/test split so that the printed metrics and the
        returned model are reproducible. Pass None for a different split on
        every call.

    Returns
    -------
    RandomForestClassifier
        The classifier fitted on the training part of the split.
    """
    if not use_october:
        october_columns = [col for col in df.columns if '-10-' in col]
        df = df.drop(columns=october_columns)

    y = df['Labels']
    X = df.drop(columns=['Labels', 'geometry', 'latitude', 'longitude', 'location'])

    # Seeded split: without it every run evaluates (and saves) a different model.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    clf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0, verbose=2)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print('Accuracy score: ', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return clf



In [None]:
# Fit and evaluate the forest twice on the balanced data: first with the
# '-10-' columns dropped, then with them kept.
_separator = '----------------------------------------------'

print('Without October')
clf_without_october = train_and_predict(df=processed_data_downsampled, use_october=False)

# Blank line, two rulers, blank line between the two reports.
print('', _separator, _separator, '', sep='\n')

print('With October')
clf_with_october = train_and_predict(df=processed_data_downsampled, use_october=True)

In [None]:
# Persist both fitted forests next to the processed data.
model_path = './data/joblibs/rf_model_without_october.joblib'
joblib.dump(value=clf_without_october, filename=model_path)

model_path = './data/joblibs/rf_model_with_october.joblib'
joblib.dump(value=clf_with_october, filename=model_path)

In [None]:
#Feature importance
def feature_importance_plot(clf, df, use_october=False):
    """Draw a bar chart of ``clf``'s feature importances, largest first.

    Importances are rescaled so that the most important feature is 100.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``feature_importances_``. When it also exposes
        ``feature_names_in_`` those names label the bars, which guarantees the
        labels belong to the model.
    df : DataFrame
        Only used when ``clf`` carries no feature names: the bar labels are
        then the columns of ``df`` minus 'Labels', 'geometry', 'latitude',
        'longitude' and 'location' (and minus the '-10-' columns, see
        ``use_october``).
    use_october : bool, default False
        Only used together with ``df``: when False, columns whose name
        contains '-10-' are removed before the labels are derived.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances, which
        means ``df``/``use_october`` do not match the data ``clf`` was fitted on.
    """
    feature_names = getattr(clf, 'feature_names_in_', None)
    if feature_names is None:
        # Fall back to rebuilding the feature columns the same way training does.
        if not use_october:
            october_columns = [col for col in df.columns if '-10-' in col]
            df = df.drop(columns=october_columns)
        feature_names = df.drop(
            columns=['Labels', 'geometry', 'latitude', 'longitude', 'location']).columns
    feature_names = np.asarray(feature_names)

    feature_importance = np.asarray(clf.feature_importances_, dtype=float)
    if len(feature_names) != len(feature_importance):
        raise ValueError(
            'Got %d feature names for %d importances; df/use_october do not match the model.'
            % (len(feature_names), len(feature_importance)))

    # Rescale to 0-100; skip the division when every importance is zero so the
    # chart shows zeros instead of NaN.
    largest = feature_importance.max()
    if largest > 0:
        feature_importance = 100.0 * (feature_importance / largest)

    # Sort the index values and flip them so that they are arranged in decreasing order of importance
    index_sorted = np.flipud(np.argsort(feature_importance))

    # Center the location of the labels on the X-axis (for display purposes only)
    pos = np.arange(index_sorted.shape[0]) + 0.5

    # Plot the bar graph
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(pos, feature_importance[index_sorted], align='center')
    ax.set_xticks(pos)
    ax.set_xticklabels(feature_names[index_sorted], rotation=90)
    ax.set_ylabel('Relative Importance')
    ax.set_title('Variable Importance')
    plt.show()

In [None]:
# Importances of the model fitted without the '-10-' columns.
feature_importance_plot(
    clf=clf_without_october,
    df=processed_data_downsampled,
    use_october=False,
)

In [None]:
# Importances of the model fitted with the '-10-' columns kept.
feature_importance_plot(
    clf=clf_with_october,
    df=processed_data_downsampled,
    use_october=True,
)

In [None]:
import joblib
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Reload the labelled data and drop the unlabelled (-1) rows.
file_path = './data/joblibs/processed_data.joblib'
processed_data_gdf = joblib.load(file_path)
processed_data_gdf = processed_data_gdf[processed_data_gdf['Labels'] != -1]

# The model loaded below is the "without October" one, so remove the '-10-' columns.
columns = processed_data_gdf.columns
columns_october = [col for col in columns if '-10-' in col]
processed_data_gdf = processed_data_gdf.drop(columns_october, axis=1)

model_path = './data/joblibs/rf_model_without_october.joblib'
clf = joblib.load(model_path)

X_full = processed_data_gdf.drop(['Labels', 'geometry', 'latitude', 'longitude', 'location'], axis=1)
y_full = processed_data_gdf['Labels']
y_pred_full = clf.predict(X_full)
# NOTE(review): this accuracy is computed over every labelled row; if the model
# was fitted on a subset of this same file, the score includes training rows
# and is not a hold-out estimate - confirm before quoting it.
print(accuracy_score(y_full, y_pred_full))

processed_data_gdf['Predicted'] = y_pred_full

# 1 where the prediction equals the label, 0 otherwise. Compared position by
# position on plain arrays, so the frame's index plays no role.
pred_same_as_label = (y_pred_full == y_full.to_numpy()).astype('int64')

processed_data_gdf['Pred_same_as_label'] = pred_same_as_label
processed_data_gdf = processed_data_gdf.to_crs(crs='EPSG:4326')


In [None]:
import folium
from shapely.geometry import Point
import geopandas as gpd

# Load the label polygons and reproject them to WGS84.
file_path = './data/joblibs/labels.joblib'
base = joblib.load(file_path)
base = base.to_crs(crs='EPSG:4326')
states = base['State'].values

# Keep only the rows whose state name contains 'gaziera', ignoring case.
# The match is a plain substring test (regex=False); rows with a missing
# state are skipped (na=False) instead of raising.
state_matches = base['State'].str.lower().str.contains('gaziera', regex=False, na=False)
keep_rows_index = state_matches.to_numpy().nonzero()[0]
base = base.iloc[keep_rows_index]
base = base.reset_index(drop=True)


In [None]:
from matplotlib.colors import LinearSegmentedColormap
# Three-level colormap built from pure red, green and blue.
cmap_name = 'my_list'
colors = [
    (1, 0, 0),  # red
    (0, 1, 0),  # green
    (0, 0, 1),  # blue
]
cmap = LinearSegmentedColormap.from_list(name=cmap_name, colors=colors, N=3)


In [None]:
# Static map for one location: field outlines plus a sample of the predicted
# points, coloured by whether the prediction matches the label.
target_location = 'gaziera'
target_location_base = 'Gaziera'
processed_data_gdf_target = processed_data_gdf[processed_data_gdf['location'] == target_location]

# Plot at most 200000 points. Asking sample() for more rows than exist raises,
# so the size is capped at what is available; the seed keeps the figure
# reproducible between runs.
max_plot_points = 200000
processed_data_gdf_target = processed_data_gdf_target.sample(
    n=min(max_plot_points, len(processed_data_gdf_target)),
    random_state=123)

base_target = base[base['State'] == target_location_base]
base_target_uncultivated = base_target[base_target['Crop_Type'] == 'Uncultivated']
base_target_cultivated = base_target[base_target['Crop_Type'] != 'Uncultivated']
fig, ax = plt.subplots(figsize=(25, 25))
ax.set_title('Accuracy')
# NOTE(review): `colors` is not passed to any plot call in this cell; the point
# colours come from `cmap`. Confirm whether red/green was the intended palette.
colors = {0: 'red', 1: 'green'}
# base_target.plot(ax=ax, color='white', edgecolor='black', linewidth=0.5)
base_target_uncultivated.plot(ax=ax, color='white', edgecolor='red', linewidth=2)
base_target_cultivated.plot(ax=ax, color='white', edgecolor='green', linewidth=2)
processed_data_gdf_target.plot(ax=ax, column='Pred_same_as_label', legend=True, markersize=0.5, categorical=True, cmap=cmap)
plt.show()


In [None]:
# Interactive map: field polygons plus a sample of the predicted points.
# The map is centred on the coordinates of the first row of the data.
sample_row = processed_data_gdf.iloc[0]
lat = sample_row['latitude']
lon = sample_row['longitude']
m = folium.Map([lat, lon], zoom_start=12)
base.explore(
    m=m,
    column='State',
    name='base',
    tooltip=['State', 'Crop_Type'],
    legend=False)
print('Done')
target_location = 'gaziera'
processed_data_gdf_small = processed_data_gdf[processed_data_gdf['location'] == target_location]

# Show at most 25000 points. Asking sample() for more rows than exist raises,
# so the size is capped at what is available; the seed keeps the map
# reproducible between runs.
max_map_points = 25000
processed_data_gdf_small = processed_data_gdf_small.sample(
    n=min(max_map_points, len(processed_data_gdf_small)),
    random_state=123)

processed_data_gdf_small.explore(
    m=m,
    column='Pred_same_as_label',
    name='predicted',
    legend=True,
    categorical=True,
    cmap=cmap,
    tooltip=['Predicted', 'Labels'],
    marker_kwds=dict(radius=1, fill=True))
folium.LayerControl().add_to(m)
m
