This script implements a semi-supervised learning approach using labeled and unlabeled geospatial data. A Random Forest classifier is first trained on labeled data, evaluated, and then used to generate pseudo-labels for the unlabeled dataset. High-confidence pseudo-labels (those whose maximum predicted class probability exceeds 0.95) are identified and combined with the labeled data to retrain the model, with the aim of improving its performance. The final model is evaluated on the labeled test set, and predictions are generated for the remaining unlabeled data. This method extends the training dataset using confident predictions, which can enhance model accuracy when labeled data is limited.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# File paths
combined_file = r'C:/Users/T00701453/Downloads/combined.csv'
ground_truth_file = r'C:/Users/T00701453/Downloads/gt.csv'

# Load datasets
combined_df = pd.read_csv(combined_file)
ground_truth_df = pd.read_csv(ground_truth_file)

# Columns that identify a point; both files are joined on this pair.
coord_cols = ['Latitude', 'Longitude']

# Merge datasets to create the labeled set: rows of combined_df whose
# (Latitude, Longitude) pair also appears in the ground truth.
labeled_df = pd.merge(combined_df, ground_truth_df, on=coord_cols, how='inner')

# Separate labeled data into features (X) and labels (y)
# NOTE(review): assumes the ground-truth file contributes only the 'Labels'
# column besides the coordinates; any extra column would end up in X_labeled
# but not in X_unlabeled -- confirm against the CSV schema.
X_labeled = labeled_df.drop(columns=coord_cols + ['Labels'])
y_labeled = labeled_df['Labels']

# Unlabeled dataset: keep rows whose coordinate PAIR is NOT in the ground truth.
# DataFrame.isin(DataFrame) aligns on index and column labels, i.e. it compares
# row i with row i instead of testing membership, so it cannot be used here.
# A left merge with indicator=True applies the same key matching as the inner
# merge above. Ground-truth keys are de-duplicated so the merge result keeps
# exactly one row per combined_df row, in the original order.
membership = combined_df[coord_cols].merge(
    ground_truth_df[coord_cols].drop_duplicates(),
    on=coord_cols,
    how='left',
    indicator=True,
)
# Positional boolean mask (NumPy array) to avoid any index alignment.
is_unlabeled = (membership['_merge'] == 'left_only').to_numpy()
unlabeled_df = combined_df[is_unlabeled]
X_unlabeled = unlabeled_df.drop(columns=coord_cols)

# Split labeled data for initial supervised learning.
# The split is done BEFORE scaling so that the held-out rows do not contribute
# to the scaler statistics (preprocessing data leakage). With the same
# random_state and number of rows, the row assignment is the same as when
# splitting the already-scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply it everywhere else.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Scaled views of the full labeled and unlabeled feature sets, kept for the
# later pseudo-labeling steps.
X_labeled_scaled = scaler.transform(X_labeled)
X_unlabeled_scaled = scaler.transform(X_unlabeled)

# Step 1: Train initial Random Forest on labeled data
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on labeled test set
y_pred = rf_model.predict(X_test)
print(f"Initial Accuracy on Labeled Data: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report (Initial):\n", classification_report(y_test, y_pred))

# Step 2: Predict on unlabeled data
pseudo_labels = rf_model.predict(X_unlabeled_scaled)
pseudo_label_probs = rf_model.predict_proba(X_unlabeled_scaled)

# Step 3: Select high-confidence predictions (e.g., confidence > 0.95)
# Confidence of a prediction = highest class probability for that row.
confidence_threshold = 0.95
high_confidence_indices = np.max(pseudo_label_probs, axis=1) > confidence_threshold
X_pseudo = X_unlabeled_scaled[high_confidence_indices]
y_pseudo = pseudo_labels[high_confidence_indices]

print(f"Number of high-confidence pseudo-labeled points: {len(y_pseudo)}")

# Step 4: Combine labeled TRAINING data with high-confidence pseudo-labeled data.
# Only the training split is used: stacking the full labeled set would put the
# X_test rows into the training data and make the evaluation in Step 6
# meaningless (the model would be scored on rows it was fitted on).
X_combined = np.vstack((X_train, X_pseudo))
y_combined = np.hstack((y_train, y_pseudo))

# Step 5: Retrain the model on the combined dataset
rf_model.fit(X_combined, y_combined)

# Step 6: Evaluate on the original test set (from labeled data)
y_pred_final = rf_model.predict(X_test)
print(f"Final Accuracy after Semi-Supervised Learning: {accuracy_score(y_test, y_pred_final) * 100:.2f}%")
print("\nClassification Report (Final):\n", classification_report(y_test, y_pred_final))