In [None]:
pip install rasterio

Collecting rasterio
  Downloading rasterio-1.3.10-cp310-cp310-manylinux2014_x86_64.whl.metadata (14 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting snuggs>=1.4.1 (from rasterio)
  Downloading snuggs-1.4.7-py3-none-any.whl.metadata (3.4 kB)
Downloading rasterio-1.3.10-cp310-cp310-manylinux2014_x86_64.whl (21.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.5/21.5 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Installing collected packages: snuggs, affine, rasterio
Successfully installed affine-2.4.0 rasterio-1.3.10 snuggs-1.4.7


In [None]:
# Mount Google Drive at /content/drive. The data paths used below are relative
# ('drive/MyDrive/...'), so this presumably relies on the working directory
# being /content -- verify if the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

import rasterio
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV

# Function to load and flatten image data
def load_image_data(file_path):
    """Read a raster file and return its pixels as a 2-D table.

    Args:
        file_path: Path of a raster that ``rasterio.open`` can read.

    Returns:
        numpy.ndarray with one row per pixel and one column per band,
        i.e. shape (height * width, bands).
    """
    # Open the dataset as a context manager so the file handle is released
    # as soon as the pixel values are in memory (also when read() raises).
    with rasterio.open(file_path) as img:
        array = img.read()
    # Collapse the spatial axes into one, then transpose so bands are columns.
    flattened_img = array.reshape(array.shape[0], -1).T  # Shape (height * width, bands)
    return flattened_img

# Yearly training rasters (2015-2020) on the mounted Drive
file_paths = [
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2015.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2016.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2017.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2018.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2019.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2020.tif'
]

# Turn every raster into a per-pixel table with six band columns
dfs = []
for file_path in file_paths:
    flattened_img = load_image_data(file_path)
    dfs.append(pd.DataFrame(flattened_img, columns=[f'Band_{i}' for i in range(1, 7)]))

# Stack the years, keep the first 20,000 rows in concatenation order,
# treat Band_6 as the label column and discard classes 3 and 5
df = (
    pd.concat(dfs, ignore_index=True)
    .head(20000)
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# Labels and predictors
y = df['class']
X = df.drop(columns=['class'])

# Fit the scaler on the predictors and standardize them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Random forest with default hyper-parameters and a fixed seed
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')


Mounted at /content/drive
Accuracy: 0.93875
Classification Report:
              precision    recall  f1-score   support

         1.0       0.95      0.98      0.96      3383
         2.0       0.85      0.74      0.79       617

    accuracy                           0.94      4000
   macro avg       0.90      0.86      0.88      4000
weighted avg       0.94      0.94      0.94      4000



In [None]:

file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)

# Per-pixel table: Band_6 becomes the label column, classes 3 and 5 are dropped
df_2021 = (
    pd.DataFrame(flattened_img_2021, columns=[f'Band_{i}' for i in range(1, 7)])
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# Labels and predictors
y_2021 = df_2021['class']
X_2021 = df_2021.drop(columns=['class'])

# Reuse the already-fitted `scaler` and `rf`; neither is created in this cell,
# so both must exist from a previously executed training cell.
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the 2021 predictions
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report on 2021 data:',
    report_2021,
    sep='\n',
)

Accuracy on 2021 data: 0.5243909161062728
Classification Report on 2021 data:
              precision    recall  f1-score   support

         1.0       0.51      0.92      0.65     17695
         2.0       0.65      0.15      0.25     18589

    accuracy                           0.52     36284
   macro avg       0.58      0.53      0.45     36284
weighted avg       0.58      0.52      0.44     36284



In [None]:
# Per-class row counts of `df` as left behind by the last executed training cell
class_counts = df['class'].value_counts()
print("Class counts in the training data:", class_counts, sep="\n")

Class counts in the training data:
class
1.0    16932
2.0     3064
Name: count, dtype: int64


In [None]:

# Load every yearly raster into its own per-pixel table
dfs = []
for file_path in file_paths:
    flattened_img = load_image_data(file_path)
    df = pd.DataFrame(flattened_img, columns=[f'Band_{i}' for i in range(1, 7)])
    dfs.append(df)

# Combine all data into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Select only the first 20,000 entries
df = df.head(20000)

# Rename the 'Band_6' column to 'class'
df = df.rename(columns={'Band_6': 'class'})

# Remove entries with class 3 and 5
df = df[~df['class'].isin([3, 5])]

# Check class counts before balancing
print("Class counts before balancing:")
print(df['class'].value_counts())

# Find the count of the minority class
min_class_count = df['class'].value_counts().min()

# Undersample every class down to the minority-class count.
# The draw is seeded so the balanced subset (and every metric computed from
# it) is the same on each run, like the seeded split and forest below.
# GroupBy.sample is used instead of groupby(...).apply(lambda x: x.sample(...)),
# which newer pandas versions deprecate when the group column is passed through.
df_balanced = (
    df.groupby('class')
    .sample(n=int(min_class_count), random_state=42)
    .reset_index(drop=True)
)

# Check class counts after balancing
print("Class counts after balancing:")
print(df_balanced['class'].value_counts())

# Split the DataFrame into features and labels
X = df_balanced.drop(columns='class')
y = df_balanced['class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy on balanced training data: {accuracy}')
print('Classification Report on balanced training data:')
print(report)

# 2021 raster used as the out-of-year evaluation set
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)

# Per-pixel table: Band_6 becomes the label column, classes 3 and 5 are dropped
df_2021 = (
    pd.DataFrame(flattened_img_2021, columns=[f'Band_{i}' for i in range(1, 7)])
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# Show how the remaining classes are distributed in 2021
print("Class counts in the 2021 data:", df_2021['class'].value_counts(), sep="\n")

# Labels and predictors
y_2021 = df_2021['class']
X_2021 = df_2021.drop(columns=['class'])

# Apply the scaler fitted on the training data, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the 2021 predictions
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report on 2021 data:',
    report_2021,
    sep='\n',
)


Class counts before balancing:
class
1.0    16932
2.0     3064
Name: count, dtype: int64
Class counts after balancing:
class
1.0    3064
2.0    3064
Name: count, dtype: int64
Accuracy on balanced training data: 0.8849918433931484
Classification Report on balanced training data:
              precision    recall  f1-score   support

         1.0       0.88      0.90      0.89       642
         2.0       0.89      0.87      0.88       584

    accuracy                           0.88      1226
   macro avg       0.89      0.88      0.88      1226
weighted avg       0.89      0.88      0.88      1226

Class counts in the 2021 data:
class
2.0    18589
1.0    17695
Name: count, dtype: int64
Accuracy on 2021 data: 0.4667346488810495
Classification Report on 2021 data:
              precision    recall  f1-score   support

         1.0       0.46      0.61      0.53     17695
         2.0       0.47      0.33      0.39     18589

    accuracy                           0.47     36284
   macro 

In [None]:
from imblearn.over_sampling import SMOTE  # Import SMOTE
# Load every yearly raster into its own per-pixel table
dfs = []
for file_path in file_paths:
    flattened_img = load_image_data(file_path)
    df = pd.DataFrame(flattened_img, columns=[f'Band_{i}' for i in range(1, 7)])
    dfs.append(df)

# Combine all data into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Rename the 'Band_6' column to 'class'
df = df.rename(columns={'Band_6': 'class'})

# Remove entries with class 3 and 5
df = df[~df['class'].isin([3, 5])]

# Check class counts before sampling
print("Class counts before sampling:")
print(df['class'].value_counts())

# Split the DataFrame into features and labels
X = df.drop(columns='class')
y = df['class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hold out the test split BEFORE oversampling. SMOTE synthesizes points by
# interpolating between neighbouring samples; resampling first and splitting
# afterwards puts near-copies of training points into the test set and
# inflates the reported score.
X_train_real, X_test, y_train_real, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Apply SMOTE to the training portion only
smote = SMOTE(sampling_strategy='auto', random_state=42)  # Automatically balances the classes
X_smote, y_smote = smote.fit_resample(X_train_real, y_train_real)

# Check class counts after SMOTE
print("Class counts after SMOTE:")
print(pd.Series(y_smote).value_counts())

# Cap the resampled training set at 20,000 rows. The draw uses a seeded
# generator so the subset is identical on every run.
if len(X_smote) > 20000:
    rng = np.random.default_rng(42)
    indices = rng.choice(len(X_smote), size=20000, replace=False)
    X_smote, y_smote = X_smote[indices], y_smote.iloc[indices]

# Check class counts after trimming to 20,000 entries
print("Class counts after trimming to 20,000 entries:")
print(pd.Series(y_smote).value_counts())

# The balanced, trimmed sample is the training set; X_test / y_test stay as
# the untouched (not resampled) hold-out rows.
X_train, y_train = X_smote, y_smote

# Train the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output labels are kept unchanged; the numbers are computed on the held-out,
# non-resampled test rows.
print(f'Accuracy on balanced training data: {accuracy}')
print('Classification Report on balanced training data:')
print(report)

# 2021 raster used as the out-of-year evaluation set
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)

# Per-pixel table: Band_6 becomes the label column, classes 3 and 5 are dropped
df_2021 = (
    pd.DataFrame(flattened_img_2021, columns=[f'Band_{i}' for i in range(1, 7)])
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# Show how the remaining classes are distributed in 2021
print("Class counts in the 2021 data:", df_2021['class'].value_counts(), sep="\n")

# Labels and predictors
y_2021 = df_2021['class']
X_2021 = df_2021.drop(columns=['class'])

# Apply the scaler fitted on the training data, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the 2021 predictions
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report on 2021 data:',
    report_2021,
    sep='\n',
)


Class counts before sampling:
class
1.0    176194
2.0     41478
Name: count, dtype: int64
Class counts after SMOTE:
class
2.0    176194
1.0    176194
Name: count, dtype: int64
Class counts after trimming to 20,000 entries:
class
2.0    10004
1.0     9996
Name: count, dtype: int64
Accuracy on balanced training data: 0.872
Classification Report on balanced training data:
              precision    recall  f1-score   support

         1.0       0.87      0.87      0.87      2017
         2.0       0.87      0.87      0.87      1983

    accuracy                           0.87      4000
   macro avg       0.87      0.87      0.87      4000
weighted avg       0.87      0.87      0.87      4000

Class counts in the 2021 data:
class
2.0    18589
1.0    17695
Name: count, dtype: int64
Accuracy on 2021 data: 0.5824605886892295
Classification Report on 2021 data:
              precision    recall  f1-score   support

         1.0       0.54      0.90      0.68     17695
         2.0       0.75  

In [None]:
#split

import rasterio
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Function to load and flatten image data
def load_image_data(file_path):
    """Read a raster file and return its pixels as a 2-D table.

    Args:
        file_path: Path of a raster that ``rasterio.open`` can read.

    Returns:
        numpy.ndarray with one row per pixel and one column per band,
        i.e. shape (height * width, bands).
    """
    # Open the dataset as a context manager so the file handle is released
    # as soon as the pixel values are in memory (also when read() raises).
    with rasterio.open(file_path) as img:
        array = img.read()
    # Collapse the spatial axes into one, then transpose so bands are columns.
    flattened_img = array.reshape(array.shape[0], -1).T  # Shape (height * width, bands)
    return flattened_img

# Yearly training rasters (2015-2020) on the mounted Drive
file_paths = [
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2015.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2016.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2017.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2018.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2019.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2020.tif'
]

# Turn every raster into a per-pixel table; columns are numbered from the
# band count of the file that was just read
dfs = []
for file_path in file_paths:
    flattened_img = load_image_data(file_path)
    dfs.append(
        pd.DataFrame(
            flattened_img,
            columns=[f'Band_{i}' for i in range(1, flattened_img.shape[1] + 1)],
        )
    )

# Stack the years, treat Band_6 as the label column and drop classes 3 and 5
df = (
    pd.concat(dfs, ignore_index=True)
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# Labels and predictors
y = df['class']
X = df.drop(columns=['class'])

# Fit the scaler on the predictors and standardize them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Draw one stratified sample of 200,000 rows; stratification keeps the class
# proportions of the full data in the sample
splitter = StratifiedShuffleSplit(n_splits=1, test_size=200000, random_state=42)
_, sample_indices = next(splitter.split(X_scaled, y))
X_sampled = X_scaled[sample_indices]
y_sampled = y.iloc[sample_indices]

# 80/20 hold-out split of the sample
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42
)

# Random forest with default hyper-parameters and a fixed seed
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')


Accuracy: 0.91205
Classification Report:
              precision    recall  f1-score   support

         1.0       0.93      0.97      0.95     16235
         2.0       0.82      0.68      0.75      3765

    accuracy                           0.91     20000
   macro avg       0.87      0.82      0.85     20000
weighted avg       0.91      0.91      0.91     20000



In [None]:
# 2021 raster used as the out-of-year evaluation set
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)

# Per-pixel table: Band_6 becomes the label column, classes 3 and 5 are dropped
df_2021 = (
    pd.DataFrame(flattened_img_2021, columns=[f'Band_{i}' for i in range(1, 7)])
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# Show how the remaining classes are distributed in 2021
print("Class counts in the 2021 data:", df_2021['class'].value_counts(), sep="\n")

# Labels and predictors
y_2021 = df_2021['class']
X_2021 = df_2021.drop(columns=['class'])

# `scaler` and `rf` are not created here; they come from the training cell
# that was executed before this one.
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the 2021 predictions
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report on 2021 data:',
    report_2021,
    sep='\n',
)

Class counts in the 2021 data:
class
2.0    18589
1.0    17695
Name: count, dtype: int64
Accuracy on 2021 data: 0.5602469408003528
Classification Report on 2021 data:
              precision    recall  f1-score   support

         1.0       0.53      0.97      0.68     17695
         2.0       0.86      0.17      0.28     18589

    accuracy                           0.56     36284
   macro avg       0.69      0.57      0.48     36284
weighted avg       0.70      0.56      0.48     36284



In [None]:
# Per-class row counts of the current `df` (all years, classes 3 and 5 removed)
print(df.loc[:, 'class'].value_counts())

class
1.0    176194
2.0     41478
Name: count, dtype: int64


In [None]:
#random
# Mount Google Drive at /content/drive. When the drive is already mounted the
# call only reports that fact (see the cell output) and changes nothing.
from google.colab import drive
drive.mount('/content/drive')

import rasterio
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV

# Function to load and flatten image data
def load_image_data(file_path):
    """Read a raster file and return its pixels as a 2-D table.

    Args:
        file_path: Path of a raster that ``rasterio.open`` can read.

    Returns:
        numpy.ndarray with one row per pixel and one column per band,
        i.e. shape (height * width, bands).
    """
    # Open the dataset as a context manager so the file handle is released
    # as soon as the pixel values are in memory (also when read() raises).
    with rasterio.open(file_path) as img:
        array = img.read()
    # Collapse the spatial axes into one, then transpose so bands are columns.
    flattened_img = array.reshape(array.shape[0], -1).T  # Shape (height * width, bands)
    return flattened_img

# Yearly training rasters (2015-2020) on the mounted Drive
file_paths = [
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2015.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2016.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2017.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2018.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2019.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2020.tif'
]

# Turn every raster into a per-pixel table with six band columns
dfs = []
for file_path in file_paths:
    flattened_img = load_image_data(file_path)
    dfs.append(pd.DataFrame(flattened_img, columns=[f'Band_{i}' for i in range(1, 7)]))

# Stack the years, treat Band_6 as the label column and drop classes 3 and 5
df = (
    pd.concat(dfs, ignore_index=True)
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# One sub-frame per remaining class
df_class_1 = df.loc[df['class'].eq(1)]
df_class_2 = df.loc[df['class'].eq(2)]

# Seeded random draw: 11,000 rows of class 1 and 10,000 rows of class 2
df_class_1_sampled = df_class_1.sample(n=11000, random_state=42)
df_class_2_sampled = df_class_2.sample(n=10000, random_state=42)
df_sampled = pd.concat([df_class_1_sampled, df_class_2_sampled])

# Labels and predictors
y = df_sampled['class']
X = df_sampled.drop(columns=['class'])

# Fit the scaler on the predictors and standardize them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Random forest with default hyper-parameters and a fixed seed
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Accuracy: 0.8664285714285714
Classification Report:
              precision    recall  f1-score   support

         1.0       0.86      0.89      0.88      2206
         2.0       0.88      0.84      0.86      1994

    accuracy                           0.87      4200
   macro avg       0.87      0.87      0.87      4200
weighted avg       0.87      0.87      0.87      4200



In [None]:
# 2021 raster used as the out-of-year evaluation set
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)

# Per-pixel table: Band_6 becomes the label column, classes 3 and 5 are dropped
df_2021 = (
    pd.DataFrame(flattened_img_2021, columns=[f'Band_{i}' for i in range(1, 7)])
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# One sub-frame per class, then stacked again (class 1 rows first)
df_2021_class_1 = df_2021.loc[df_2021['class'].eq(1)]
df_2021_class_2 = df_2021.loc[df_2021['class'].eq(2)]
df_2021_combined = pd.concat([df_2021_class_1, df_2021_class_2])

# Labels and predictors
y_2021 = df_2021_combined['class']
X_2021 = df_2021_combined.drop(columns=['class'])

# Apply the scaler fitted on the training sample, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the 2021 predictions
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report for 2021 data:',
    report_2021,
    sep='\n',
)


Accuracy on 2021 data: 0.6212104508874435
Classification Report for 2021 data:
              precision    recall  f1-score   support

         1.0       0.57      0.90      0.70     17695
         2.0       0.79      0.36      0.49     18589

    accuracy                           0.62     36284
   macro avg       0.68      0.63      0.59     36284
weighted avg       0.68      0.62      0.59     36284



In [None]:
#random
# Mount Google Drive at /content/drive. When the drive is already mounted the
# call only reports that fact (see the cell output) and changes nothing.
from google.colab import drive
drive.mount('/content/drive')

import rasterio
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV

# Combine all data into a single DataFrame
# NOTE: `dfs` is not built in this cell; it must already hold the per-year
# tables produced by an earlier cell's loading loop.
df = pd.concat(dfs, ignore_index=True)
df = df.rename(columns={'Band_6': 'class'})
df = df[~df['class'].isin([3, 5])]

# Filter the data to only include class 1 and class 2
df_class_1 = df[df['class'] == 1]
df_class_2 = df[df['class'] == 2]

# Randomly sample the desired number of entries from each class
df_class_1_sampled = df_class_1.sample(n=3000, random_state=42)
df_class_2_sampled = df_class_2.sample(n=4000, random_state=42)

# Combine the sampled entries
df_sampled = pd.concat([df_class_1_sampled, df_class_2_sampled])

# Split the DataFrame into features and labels
X = df_sampled.drop(columns='class')
y = df_sampled['class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the RandomForestClassifier
# random_state is fixed like in every other experiment of this notebook;
# without it the bootstrap samples and feature draws differ between runs and
# the reported metrics cannot be reproduced.
rf = RandomForestClassifier(n_estimators=100, min_samples_split=10,
                             min_samples_leaf=1, max_features='sqrt',
                             max_depth=100, bootstrap=True,
                             random_state=42) # Pass parameters as keyword arguments
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Accuracy: 0.8492857142857143
Classification Report:
              precision    recall  f1-score   support

         1.0       0.81      0.85      0.83       605
         2.0       0.88      0.85      0.86       795

    accuracy                           0.85      1400
   macro avg       0.85      0.85      0.85      1400
weighted avg       0.85      0.85      0.85      1400



In [None]:
# 2021 raster used as the out-of-year evaluation set
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)

# Per-pixel table: Band_6 becomes the label column, classes 3 and 5 are dropped
df_2021 = (
    pd.DataFrame(flattened_img_2021, columns=[f'Band_{i}' for i in range(1, 7)])
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# One sub-frame per class, then stacked again (class 1 rows first)
df_2021_class_1 = df_2021.loc[df_2021['class'].eq(1)]
df_2021_class_2 = df_2021.loc[df_2021['class'].eq(2)]
df_2021_combined = pd.concat([df_2021_class_1, df_2021_class_2])

# Labels and predictors
y_2021 = df_2021_combined['class']
X_2021 = df_2021_combined.drop(columns=['class'])

# Apply the scaler fitted on the training sample, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the 2021 predictions
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report for 2021 data:',
    report_2021,
    sep='\n',
)


Accuracy on 2021 data: 0.6446091941351559
Classification Report for 2021 data:
              precision    recall  f1-score   support

         1.0       0.59      0.86      0.70     17695
         2.0       0.77      0.44      0.56     18589

    accuracy                           0.64     36284
   macro avg       0.68      0.65      0.63     36284
weighted avg       0.68      0.64      0.63     36284



In [None]:
#random
# Mount Google Drive at /content/drive. When the drive is already mounted the
# call only reports that fact (see the cell output) and changes nothing.
from google.colab import drive
drive.mount('/content/drive')

import rasterio
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV

# Function to load and flatten image data
def load_image_data(file_path):
    """Read a raster file and return its pixels as a 2-D table.

    Args:
        file_path: Path of a raster that ``rasterio.open`` can read.

    Returns:
        numpy.ndarray with one row per pixel and one column per band,
        i.e. shape (height * width, bands).
    """
    # Open the dataset as a context manager so the file handle is released
    # as soon as the pixel values are in memory (also when read() raises).
    with rasterio.open(file_path) as img:
        array = img.read()
    # Collapse the spatial axes into one, then transpose so bands are columns.
    flattened_img = array.reshape(array.shape[0], -1).T  # Shape (height * width, bands)
    return flattened_img

# Yearly training rasters (2015-2020) on the mounted Drive
file_paths = [
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2015.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2016.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2017.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2018.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2019.tif',
    'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2020.tif'
]

# Turn every raster into a per-pixel table with six band columns
dfs = []
for file_path in file_paths:
    flattened_img = load_image_data(file_path)
    dfs.append(pd.DataFrame(flattened_img, columns=[f'Band_{i}' for i in range(1, 7)]))

# Stack the years, treat Band_6 as the label column and drop classes 3 and 5
df = (
    pd.concat(dfs, ignore_index=True)
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# 2021 raster used as the out-of-year evaluation set
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)

# Per-pixel table: Band_6 becomes the label column, classes 3 and 5 are dropped.
# The surviving rows keep their original index values.
df_2021 = (
    pd.DataFrame(flattened_img_2021, columns=[f'Band_{i}' for i in range(1, 7)])
    .rename(columns={'Band_6': 'class'})
    .loc[lambda frame: ~frame['class'].isin([3, 5])]
)

# Show how the remaining classes are distributed in 2021
print("Class counts in the 2021 data:", df_2021['class'].value_counts(), sep="\n")

# Labels and predictors
y_2021 = df_2021['class']
X_2021 = df_2021.drop(columns=['class'])

# `scaler` and `rf` are not created in this cell or the loading cell before
# it; they must still be in the kernel from an earlier training cell.
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the 2021 predictions
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report on 2021 data:',
    report_2021,
    sep='\n',
)

# Copy of the 2021 table with the model output attached as an extra column
df_2021_with_predictions = df_2021.assign(Predicted_Class=y_2021_pred)

print("DataFrame with test data and predicted class values:")
print(df_2021_with_predictions.head())

Class counts in the 2021 data:
class
2.0    18589
1.0    17695
Name: count, dtype: int64
Accuracy on 2021 data: 0.6446091941351559
Classification Report on 2021 data:
              precision    recall  f1-score   support

         1.0       0.59      0.86      0.70     17695
         2.0       0.77      0.44      0.56     18589

    accuracy                           0.64     36284
   macro avg       0.68      0.65      0.63     36284
weighted avg       0.68      0.64      0.63     36284

DataFrame with test data and predicted class values:
   Band_1  Band_2  Band_3   Band_4   Band_5  class  Predicted_Class
0  9205.0  9186.0  9109.0  16885.0  14454.0    2.0              2.0
1  9156.0  9081.0  8532.0  16668.0  14378.0    2.0              2.0
2  9244.0  9346.0  8969.0  17091.0  14299.0    2.0              2.0
3  9304.0  9312.0  8913.0  16030.0  14446.0    2.0              2.0
4  9342.0  9296.0  8922.0  17583.0  14961.0    2.0              2.0


In [None]:
import rasterio
from rasterio.transform import from_origin

# Load the 2021 data again to get the original image dimensions and properties
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'

# Copy the metadata out inside a context manager so the dataset handle is
# closed once the values have been read.
with rasterio.open(file_path_2021) as original_img_2021:
    # Get the original image dimensions
    height, width = original_img_2021.height, original_img_2021.width
    transform = original_img_2021.transform
    crs = original_img_2021.crs

# df_2021_with_predictions only contains the pixels that survived the class
# filter, but its index still holds each pixel's position in the flattened,
# row-major image (the frame was built from the flattened raster and then
# filtered without resetting the index). Writing the predictions to
# consecutive pixels would shift every value that follows a filtered pixel,
# so each prediction is placed at its own recorded position instead.
pixel_positions = df_2021_with_predictions.index.to_numpy()
if pixel_positions.size and (
    pixel_positions.min() < 0 or pixel_positions.max() >= height * width
):
    raise ValueError(
        'df_2021_with_predictions index does not match the raster size '
        f'({height} x {width}); rebuild it from the 2021 raster first'
    )

# Pixels without a prediction (the filtered classes) keep the fill value 0
predicted_flat = np.zeros(height * width)
predicted_flat[pixel_positions] = df_2021_with_predictions['Predicted_Class'].to_numpy()
predicted_classes_reshaped = predicted_flat.reshape(height, width)

# Save the reshaped predicted classes to a new TIFF file
output_file_path = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/predicted_classes_2021.tif'
with rasterio.open(
    output_file_path,
    'w',
    driver='GTiff',
    height=height,
    width=width,
    count=1,  # Number of bands
    dtype=predicted_classes_reshaped.dtype,
    crs=crs,
    transform=transform
) as dst:
    dst.write(predicted_classes_reshaped, 1)

print(f'Predicted classes saved to {output_file_path}')

Predicted classes saved to drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/predicted_classes_2021.tif


In [None]:
import rasterio
from rasterio.transform import from_origin

# Rebuild the 2021 prediction raster and save it as a single-band GeoTIFF on
# the same grid as the source image.
#
# Requires `df_2021_with_predictions` to exist in the kernel already; it is
# built by the cell that evaluates the model on the full 2021 data.

# Read the image dimensions and georeferencing, closing the dataset afterwards
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
with rasterio.open(file_path_2021) as original_img_2021:
    height, width = original_img_2021.height, original_img_2021.width
    transform = original_img_2021.transform
    crs = original_img_2021.crs

expected_size = height * width

# The frame holds fewer rows than the raster has pixels because some pixels
# were filtered out before prediction. Its index still carries each row's flat
# (row-major) pixel position, so predictions are scattered to those positions
# and the filtered pixels keep the fill value 0. Padding or truncating the end
# of the prediction vector instead would misplace every prediction that follows
# a filtered pixel.
# NOTE(review): this assumes the frame's index was never reset - confirm.
pixel_positions = df_2021_with_predictions.index.to_numpy()
if pixel_positions.size and (
    pixel_positions.min() < 0 or pixel_positions.max() >= expected_size
):
    raise ValueError(
        'df_2021_with_predictions index does not fit the 2021 raster '
        f'({expected_size} pixels)'
    )

predicted_values = df_2021_with_predictions['Predicted_Class'].to_numpy()
predicted_classes = np.zeros(expected_size, dtype=predicted_values.dtype)
predicted_classes[pixel_positions] = predicted_values

predicted_classes_reshaped = predicted_classes.reshape((height, width))

# Save the reshaped predicted classes to a new TIFF file
output_file_path = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/predicted_classes_2021.tif'
with rasterio.open(
    output_file_path,
    'w',
    driver='GTiff',
    height=height,
    width=width,
    count=1,  # Number of bands
    dtype=predicted_classes_reshaped.dtype,
    crs=crs,
    transform=transform
) as dst:
    dst.write(predicted_classes_reshaped, 1)

print(f'Predicted classes saved to {output_file_path}')


Predicted classes saved to drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/predicted_classes_2021.tif


In [None]:
# Evaluate the trained random forest on a fixed-seed sample of the 2021 image
# (1000 rows of class 1, 800 rows of class 2).

# Read the 2021 image as one row per pixel: five band columns plus 'class'
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)
df_2021 = pd.DataFrame(
    flattened_img_2021,
    columns=[f'Band_{i}' for i in range(1, 6)] + ['class'],
)

# Discard rows labelled 3 or 5
df_2021 = df_2021.loc[~df_2021['class'].isin([3, 5])]

# One frame per class that is evaluated
df_2021_class_1 = df_2021.loc[df_2021['class'].eq(1)]
df_2021_class_2 = df_2021.loc[df_2021['class'].eq(2)]

# Reproducible draws from each class, stacked into one evaluation frame
df_2021_class_1_sampled = df_2021_class_1.sample(n=1000, random_state=42)
df_2021_class_2_sampled = df_2021_class_2.sample(n=800, random_state=42)
df_2021_sampled = pd.concat([df_2021_class_1_sampled, df_2021_class_2_sampled])

# Labels and features
y_2021 = df_2021_sampled['class']
X_2021 = df_2021_sampled.drop(columns=['class'])

# Reuse the scaler that is already fitted, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the predictions against the sampled labels
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 sampled data: {accuracy_2021}',
    'Classification Report for 2021 sampled data:',
    report_2021,
    sep='\n',
)

Accuracy on 2021 sampled data: 0.655
Classification Report for 2021 sampled data:
              precision    recall  f1-score   support

         1.0       0.65      0.84      0.73      1000
         2.0       0.68      0.42      0.52       800

    accuracy                           0.66      1800
   macro avg       0.66      0.63      0.63      1800
weighted avg       0.66      0.66      0.64      1800



In [None]:
# Evaluate the trained random forest on a fixed-seed sample of the 2021 image
# (1000 rows of class 1, 1100 rows of class 2).

# Read the 2021 image as one row per pixel: five band columns plus 'class'
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)
df_2021 = pd.DataFrame(
    flattened_img_2021,
    columns=[f'Band_{i}' for i in range(1, 6)] + ['class'],
)

# Discard rows labelled 3 or 5
df_2021 = df_2021.loc[~df_2021['class'].isin([3, 5])]

# One frame per class that is evaluated
df_2021_class_1 = df_2021.loc[df_2021['class'].eq(1)]
df_2021_class_2 = df_2021.loc[df_2021['class'].eq(2)]

# Reproducible draws from each class, stacked into one evaluation frame
df_2021_class_1_sampled = df_2021_class_1.sample(n=1000, random_state=42)
df_2021_class_2_sampled = df_2021_class_2.sample(n=1100, random_state=42)
df_2021_sampled = pd.concat([df_2021_class_1_sampled, df_2021_class_2_sampled])

# Labels and features
y_2021 = df_2021_sampled['class']
X_2021 = df_2021_sampled.drop(columns=['class'])

# Reuse the scaler that is already fitted, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the predictions against the sampled labels
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 sampled data: {accuracy_2021}',
    'Classification Report for 2021 sampled data:',
    report_2021,
    sep='\n',
)

Accuracy on 2021 sampled data: 0.6185714285714285
Classification Report for 2021 sampled data:
              precision    recall  f1-score   support

         1.0       0.57      0.84      0.68      1000
         2.0       0.74      0.42      0.53      1100

    accuracy                           0.62      2100
   macro avg       0.66      0.63      0.61      2100
weighted avg       0.66      0.62      0.60      2100



In [None]:
from sklearn.svm import SVC
# Fit a linear-kernel SVM on the training split (fit() returns the estimator)
svm = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict the held-out split and score it
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(
    f'Accuracy: {accuracy}',
    'Classification Report:',
    report,
    sep='\n',
)

Accuracy: 0.7471428571428571
Classification Report:
              precision    recall  f1-score   support

         1.0       0.70      0.73      0.71       605
         2.0       0.79      0.76      0.77       795

    accuracy                           0.75      1400
   macro avg       0.74      0.75      0.74      1400
weighted avg       0.75      0.75      0.75      1400



In [None]:

# Evaluate the trained random forest on every remaining 2021 pixel and keep the
# predictions next to the inputs in `df_2021_with_predictions`.

# Read the 2021 image as one row per pixel: five band columns plus 'class'
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)
df_2021 = pd.DataFrame(
    flattened_img_2021,
    columns=[f'Band_{i}' for i in range(1, 6)] + ['class'],
)

# Discard rows labelled 3 or 5; the surviving rows keep their original index
df_2021 = df_2021.loc[~df_2021['class'].isin([3, 5])]

# Show how many rows remain per class
print("Class counts in the 2021 data:")
print(df_2021['class'].value_counts())

# Labels and features
y_2021 = df_2021['class']
X_2021 = df_2021.drop(columns=['class'])

# Reuse the scaler that is already fitted, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the predictions against the 2021 labels
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 data: {accuracy_2021}',
    'Classification Report on 2021 data:',
    report_2021,
    sep='\n',
)

# New frame: the filtered 2021 rows plus a column holding the model's output
df_2021_with_predictions = df_2021.assign(Predicted_Class=y_2021_pred)

print("DataFrame with test data and predicted class values:")
print(df_2021_with_predictions.head())

Class counts in the 2021 data:
class
2.0    18589
1.0    17695
Name: count, dtype: int64
Accuracy on 2021 data: 0.6446091941351559
Classification Report on 2021 data:
              precision    recall  f1-score   support

         1.0       0.59      0.86      0.70     17695
         2.0       0.77      0.44      0.56     18589

    accuracy                           0.64     36284
   macro avg       0.68      0.65      0.63     36284
weighted avg       0.68      0.64      0.63     36284

DataFrame with test data and predicted class values:
   Band_1  Band_2  Band_3   Band_4   Band_5  class  Predicted_Class
0  9205.0  9186.0  9109.0  16885.0  14454.0    2.0              2.0
1  9156.0  9081.0  8532.0  16668.0  14378.0    2.0              2.0
2  9244.0  9346.0  8969.0  17091.0  14299.0    2.0              2.0
3  9304.0  9312.0  8913.0  16030.0  14446.0    2.0              2.0
4  9342.0  9296.0  8922.0  17583.0  14961.0    2.0              2.0


In [None]:
# Evaluate the trained random forest on a fixed-seed sample of the 2021 image
# (1000 rows of class 1, 1100 rows of class 2).

# Read the 2021 image as one row per pixel: five band columns plus 'class'
file_path_2021 = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data/train_2021.tif'
flattened_img_2021 = load_image_data(file_path_2021)
df_2021 = pd.DataFrame(
    flattened_img_2021,
    columns=[f'Band_{i}' for i in range(1, 6)] + ['class'],
)

# Discard rows labelled 3 or 5
df_2021 = df_2021.loc[~df_2021['class'].isin([3, 5])]

# One frame per class that is evaluated
df_2021_class_1 = df_2021.loc[df_2021['class'].eq(1)]
df_2021_class_2 = df_2021.loc[df_2021['class'].eq(2)]

# Reproducible draws from each class, stacked into one evaluation frame
df_2021_class_1_sampled = df_2021_class_1.sample(n=1000, random_state=42)
df_2021_class_2_sampled = df_2021_class_2.sample(n=1100, random_state=42)
df_2021_sampled = pd.concat([df_2021_class_1_sampled, df_2021_class_2_sampled])

# Labels and features
y_2021 = df_2021_sampled['class']
X_2021 = df_2021_sampled.drop(columns=['class'])

# Reuse the scaler that is already fitted, then predict with the forest
X_2021_scaled = scaler.transform(X_2021)
y_2021_pred = rf.predict(X_2021_scaled)

# Score the predictions against the sampled labels
accuracy_2021 = accuracy_score(y_2021, y_2021_pred)
report_2021 = classification_report(y_2021, y_2021_pred)

print(
    f'Accuracy on 2021 sampled data: {accuracy_2021}',
    'Classification Report for 2021 sampled data:',
    report_2021,
    sep='\n',
)

Accuracy on 2021 sampled data: 0.6185714285714285
Classification Report for 2021 sampled data:
              precision    recall  f1-score   support

         1.0       0.57      0.84      0.68      1000
         2.0       0.74      0.42      0.53      1100

    accuracy                           0.62      2100
   macro avg       0.66      0.63      0.61      2100
weighted avg       0.66      0.62      0.60      2100



In [None]:
import pandas as pd
import rasterio

# Build a per-pixel time series table from the 2015-2020 training rasters:
# one row per pixel, one column per (band or class, year) pair.

train_dir = 'drive/MyDrive/Train_Data-20240801T122201Z-001/Train_Data'
train_years = [2015, 2016, 2017, 2018, 2019, 2020]
pixel_columns = [f'Band_{i}' for i in range(1, 6)] + ['class']


def read_raster_bands(path):
    """Read every band of the raster at ``path`` and close the file afterwards.

    Returns whatever ``DatasetReader.read()`` returns for the dataset (all
    bands in one array). The context manager guarantees the file handle is
    released even if reading fails.
    """
    with rasterio.open(path) as src:
        return src.read()


# Load the TIFF files and read the data (one array per year)
train_2015, train_2016, train_2017, train_2018, train_2019, train_2020 = (
    read_raster_bands(f'{train_dir}/train_{year}.tif') for year in train_years
)

# Flatten the images to one row per pixel and one column per band. The band
# count is taken from each array instead of being hard-coded, so a raster with
# an unexpected number of bands fails loudly when the DataFrame is built rather
# than being silently reshaped into the wrong layout.
(
    flattened_img_2015,
    flattened_img_2016,
    flattened_img_2017,
    flattened_img_2018,
    flattened_img_2019,
    flattened_img_2020,
) = (
    bands.reshape(bands.shape[0], -1).T
    for bands in (train_2015, train_2016, train_2017, train_2018, train_2019, train_2020)
)

# Create dataframes, each tagged with the year it came from
df_2015, df_2016, df_2017, df_2018, df_2019, df_2020 = (
    pd.DataFrame(flattened, columns=pixel_columns).assign(year=year)
    for year, flattened in zip(
        train_years,
        (
            flattened_img_2015,
            flattened_img_2016,
            flattened_img_2017,
            flattened_img_2018,
            flattened_img_2019,
            flattened_img_2020,
        ),
    )
)

# Add pixel_id column: the row position, which identifies the same pixel in
# every year only if all rasters contain the same number of pixels.
num_pixels = len(df_2015)
for year, year_df in zip(train_years, (df_2015, df_2016, df_2017, df_2018, df_2019, df_2020)):
    if len(year_df) != num_pixels:
        raise ValueError(
            f'train_{year}.tif has {len(year_df)} pixels, expected {num_pixels}'
        )
    year_df['pixel_id'] = range(num_pixels)

# Concatenate dataframes
df_train = pd.concat([df_2015, df_2016, df_2017, df_2018, df_2019, df_2020], ignore_index=True)

# Pivot the dataframe to create a time series for each pixel
time_series_df = df_train.pivot_table(index='pixel_id', columns='year', values=['Band_1', 'Band_2', 'Band_3', 'Band_4', 'Band_5', 'class'])

# Flatten the MultiIndex columns
time_series_df.columns = ['{}_{}'.format(col[0], col[1]) for col in time_series_df.columns]

# Make `pixel_id` a regular column again (reassign instead of mutating in place)
time_series_df = time_series_df.reset_index()

print(time_series_df)

       pixel_id  Band_1_2015  Band_1_2016  Band_1_2017  Band_1_2018  \
0             0      10418.5       9980.0       9844.5       9149.0   
1             1      10441.0      10029.0       9709.5       9120.0   
2             2      10249.0      10095.0       9712.5       9175.0   
3             3      10182.0      10081.0       9718.5       9251.0   
4             4      10179.0      10046.0       9767.0       9359.0   
...         ...          ...          ...          ...          ...   
36287     36287       8664.0       8367.0       9298.0       8413.0   
36288     36288       8684.0       8362.5       9266.0       8398.0   
36289     36289       8700.0       8362.5       9284.0       8357.0   
36290     36290       8802.0       8356.5       9308.0       8600.0   
36291     36291       8835.0       8365.5       9438.0       8383.0   

       Band_1_2019  Band_1_2020  Band_2_2015  Band_2_2016  Band_2_2017  ...  \
0          10379.0       9915.0      10462.0       9565.0       9780