In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in files]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training table from disk; the bare `df` on the last line makes the
# notebook render a preview of the frame.
train_csv_path = "/content/hacktrain.csv"
df = pd.read_csv(train_csv_path)
df  # ignore the warnings

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.5950,658.668,-1882.030,-1924.36,997.904,-1739.990,630.087,...,,-1043.160,-1942.490,267.138,,,211.328,-2203.020,-1180.19,433.906
1,1,2,water,634.2400,593.705,-1625.790,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.000,-1360.56,524.075
2,3,4,water,58.0174,-1599.160,,-1052.63,,-1564.630,,...,-1025.880,368.622,,-1227.800,304.621,,369.214,-2202.120,,-1343.550
3,4,5,water,72.5180,,380.436,-1256.93,515.805,-1413.180,-802.942,...,-1813.950,155.624,,-924.073,432.150,282.833,298.320,-2197.360,,-826.727
4,7,8,water,1136.4400,,,1647.83,1935.800,,2158.980,...,1535.000,1959.430,-279.317,-384.915,-113.406,1020.720,1660.650,-116.801,-568.05,-1357.140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10537,10538,impervious,1207.7000,984.620,,1166.25,937.478,1072.700,823.896,...,1117.740,1176.600,1044.110,,369.082,465.843,362.882,979.795,,433.659
7996,10538,10539,impervious,2170.3500,1419.720,1361.000,1478.71,983.911,1262.110,1422.860,...,984.634,2128.970,1379.660,,762.633,485.204,446.724,771.747,1589.06,506.936
7997,10541,10542,impervious,1895.6800,1454.740,,1033.56,1930.380,1057.150,1471.600,...,888.408,2093.020,1232.110,1190.830,1441.460,1170.880,1095.000,1818.650,2501.72,1247.770
7998,10542,10543,impervious,3465.7400,1283.320,413.412,4391.05,1146.820,4473.050,1614.750,...,5833.760,4047.320,4515.800,433.177,277.296,744.143,,3759.710,,388.346


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib

# ---- Step 1: Select features and target ----
# Work on a copy so the `df` loaded earlier is left untouched. Mutating it in
# place made this cell non-idempotent: once `class` has been overwritten with
# integers, any later LabelEncoder fit on `df` loses the class names.
train_df = df.copy()

# Only the NDVI time-series columns (suffix "_N") are used as predictors.
# `ID` and the "Unnamed: ..." columns are row identifiers, not measurements.
# In the preview of `df` the rows appear grouped by class, so keeping any of
# those identifiers as a feature would let the model read the label off the
# row position instead of learning from NDVI.
ndvi_cols = [col for col in train_df.columns if col.endswith('_N')]

# Label encoding (class names -> integers 0..n_classes-1, as XGBoost expects)
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train_df['class']),
    index=train_df.index,
    name='class',
)

X = train_df[ndvi_cols]

# Save original column names (the inference cell uses them to order test columns)
feature_names = X.columns.tolist()

# ---- Step 2: Train-test split ----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

# ---- Step 2b: Impute missing NDVI values ----
# Fit the imputer on the training split only and merely apply it to the
# hold-out split; fitting on the full table would leak hold-out rows into the
# preprocessing and make the evaluation below optimistic.
imputer = KNNImputer(n_neighbors=5, weights='distance')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=feature_names, index=X_test.index)

# ---- Step 3: Model + Feature Selection ----
# `use_label_encoder` is no longer passed: XGBoost reports it as an unused
# parameter, and the labels are already integer-encoded above.
base_model = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42
)

# Keep the features whose importance is at least the median importance.
selector = SelectFromModel(base_model, threshold='median')
selector.fit(X_train, y_train)

X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)

# ---- Step 4: Hyperparameter tuning ----
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Random search over 20 parameter combinations with stratified 5-fold CV.
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train_sel, y_train)

# ---- Step 5: Evaluate ----
best_model = search.best_estimator_
y_pred = best_model.predict(X_test_sel)

print(f"Best Params: {search.best_params_}")
print("Classification Report:")
print(classification_report(
    y_test, y_pred,
    # str() keeps the report printable even if the classes are not strings.
    target_names=[str(name) for name in label_encoder.classes_]
))

# ---- Step 6: Save everything ----
joblib.dump(imputer, 'knn_imputer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(selector, 'feature_selector.pkl')
joblib.dump(best_model, 'xgb_model.pkl')
joblib.dump(selector.get_support(indices=True), 'selected_indices.pkl')
joblib.dump(feature_names, 'all_features.pkl')


Parameters: { "use_label_encoder" } are not used.



Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "use_label_encoder" } are not used.



Best Params: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Classification Report:
              precision    recall  f1-score   support

        farm       0.90      0.95      0.93       168
      forest       1.00      1.00      1.00      1232
       grass       1.00      0.82      0.90        39
  impervious       0.90      0.91      0.90       134
     orchard       1.00      0.50      0.67         6
       water       0.95      0.95      0.95        21

    accuracy                           0.98      1600
   macro avg       0.96      0.86      0.89      1600
weighted avg       0.98      0.98      0.98      1600



['all_features.pkl']

In [None]:
# Load the test set; keep the IDs for the submission and remove them from the features.
test_data = pd.read_csv("/content/hacktest.csv")
ID = test_data['ID']
test_data = test_data.drop(columns=['ID'])

# Load saved parts
imputer = joblib.load('knn_imputer.pkl')
label_encoder = joblib.load('label_encoder.pkl')
selector = joblib.load('feature_selector.pkl')
model = joblib.load('xgb_model.pkl')
all_features = joblib.load('all_features.pkl')

# Preprocess test data
# Impute exactly the columns the imputer was fitted on, in the fitted order.
# Taking the "_N" columns from the test file instead makes transform() fail as
# soon as the test file orders its columns differently. The "_N" scan is kept
# only as a fallback for an imputer fitted without feature names.
ndvi_cols = list(getattr(
    imputer, 'feature_names_in_',
    [col for col in test_data.columns if col.endswith('_N')]))
test_data[ndvi_cols] = imputer.transform(test_data[ndvi_cols])
test_data = test_data[all_features]  # Ensure same columns and order as in training

# Apply the fitted feature selection, predict, and map integers back to class names.
X_test_sel = selector.transform(test_data)
y_pred = model.predict(X_test_sel)
y_decoded = label_encoder.inverse_transform(y_pred)

# Create submission
submission = pd.DataFrame({
    'ID': ID,
    'class': y_decoded
})
submission.to_csv("submission2.csv", index=False)


In [None]:
# Count the missing values in the 20140101_N column. The original referenced
# `.isnull` without calling it, which only displays the bound method.
df['20140101_N'].isnull().sum()

In [None]:
# Number of missing values in each column of the training frame.
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
ID,0
class,0
20150720_N,560
20150602_N,1200
20150517_N,800
20150501_N,960
20150415_N,480
20150330_N,1120
20150314_N,720


In [None]:
from sklearn.impute import KNNImputer

# Collect the NDVI feature columns, recognised by their "_N" suffix.
ndvi_cols = list(filter(lambda name: name.endswith('_N'), df.columns))

# KNN imputer using 5 equally weighted neighbours ('distance' weighting is an alternative).
imputer = KNNImputer(n_neighbors=5, weights='uniform')

# Fill the gaps in the NDVI columns only, then show the null count per column.
ndvi_filled = imputer.fit_transform(df[ndvi_cols])
df[ndvi_cols] = ndvi_filled
df.isna().sum()


Unnamed: 0,0
Unnamed: 0,0
ID,0
class,0
20150720_N,0
20150602_N,0
20150517_N,0
20150501_N,0
20150415_N,0
20150330_N,0
20150314_N,0


In [None]:
# Simple mean imputation: replace each missing value with its column's mean.
# This step has a lot of scope for improvement.
# Author's note: the training data is inherently noisy and the test dataset is not.
numeric_means = df.mean(numeric_only=True)
df.fillna(numeric_means, inplace=True)
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
ID,0
class,0
20150720_N,0
20150602_N,0
20150517_N,0
20150501_N,0
20150415_N,0
20150330_N,0
20150314_N,0




              precision    recall  f1-score   support

        farm       0.90      0.88      0.89       168
      forest       0.99      1.00      1.00      1232
       grass       0.82      0.69      0.75        39
  impervious       0.84      0.91      0.87       134
     orchard       0.75      0.50      0.60         6
       water       0.90      0.86      0.88        21

    accuracy                           0.97      1600
   macro avg       0.87      0.81      0.83      1600
weighted avg       0.97      0.97      0.97      1600



In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

# ---- Load and preprocess training data ----
# Work on a copy so `df` is not modified; the in-place edits made this cell
# depend on how often (and after which other cells) it had been run.
train_df = df.copy()

# Only the NDVI time-series columns (suffix "_N") are used as predictors.
# `ID` and the "Unnamed: ..." columns are row identifiers; in the preview of
# `df` the rows appear grouped by class, so an identifier left in the feature
# matrix would leak the label.
ndvi_cols = [col for col in train_df.columns if col.endswith('_N')]

# Encode target
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train_df['class']),
    index=train_df.index,
    name='class',
)

# Split features and target
X = train_df[ndvi_cols]
feature_columns = X.columns.tolist()

# Save column order for future use
joblib.dump(feature_columns, 'feature_columns.pkl')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing NDVI values. The imputer is fitted on the training split only
# and then applied to the hold-out split, so hold-out rows cannot influence the
# preprocessing that is later evaluated on them.
imputer = KNNImputer(n_neighbors=5, weights='distance')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=feature_columns, index=X_train.index)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=feature_columns, index=X_test.index)

# Train Random Forest (class_weight='balanced' re-weights the rare classes)
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Cross-validation (cross_val_score fits clones, the model fitted above is untouched)
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Test evaluation
y_pred = model.predict(X_test)
print(classification_report(
    y_test, y_pred,
    labels=list(range(len(label_encoder.classes_))),
    # str() keeps the report printable even if the classes are not strings
    # (e.g. when `class` had already been integer-encoded by another cell).
    target_names=[str(name) for name in label_encoder.classes_]
))

# ---- Save model and preprocessors ----
joblib.dump(imputer, 'knn_imputer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(model, 'rf_model.pkl')


Cross-validation accuracy: 0.9753 ± 0.0074
              precision    recall  f1-score   support

        farm       0.88      0.98      0.93       168
      forest       1.00      1.00      1.00      1232
       grass       1.00      0.74      0.85        39
  impervious       0.91      0.89      0.90       134
     orchard       1.00      0.33      0.50         6
       water       0.95      0.86      0.90        21

    accuracy                           0.98      1600
   macro avg       0.96      0.80      0.85      1600
weighted avg       0.98      0.98      0.98      1600



['rf_model.pkl']

In [None]:

# Print the 20150720_N column of the test set to inspect its values.
ndvi_20150720 = test_data['20150720_N']
print(ndvi_20150720)

0       7466.4200
1       7235.2600
2       7425.0800
3       7119.1200
4       7519.5500
          ...    
2840   -1673.7400
2841     -96.8233
2842   -2364.6000
2843   -3004.6300
2844   -2975.1000
Name: 20150720_N, Length: 2845, dtype: float64


In [None]:
import pandas as pd
import joblib

# Step 1: Load test data (keep the IDs for the submission, remove them from the features)
test_data = pd.read_csv("/content/hacktest.csv")
ID = test_data['ID']
test_data = test_data.drop(columns=['ID'])

# Step 2: Load saved model artifacts
model = joblib.load('rf_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')
imputer = joblib.load('knn_imputer.pkl')
feature_columns = joblib.load('feature_columns.pkl')  # Columns used during training

# Step 3: Apply the same preprocessing as in training.
# The training features were passed through the KNN imputer, so the test
# features go through the saved imputer as well instead of reaching the model
# with their raw missing values. The imputer is applied to the columns it was
# fitted on; the "_N" scan is only a fallback for an imputer fitted without
# feature names.
impute_cols = list(getattr(
    imputer, 'feature_names_in_',
    [col for col in test_data.columns if col.endswith('_N')]))
test_data[impute_cols] = imputer.transform(test_data[impute_cols])

# Step 4: Ensure test data matches training feature structure
test_data = test_data[feature_columns]

# Step 5: Make predictions and map the integer codes back to class names
y_test = model.predict(test_data)
y_decoded = label_encoder.inverse_transform(y_test)

# Step 6: Create submission file
result = pd.DataFrame({
    'ID': ID,
    'class': y_decoded
})
result.to_csv("submission0.csv", index=False)


In [None]:
# Load the test set; keep the IDs for the submission and remove them from the features.
test_data = pd.read_csv("/content/hacktest.csv")
ID = test_data['ID']
test_data = test_data.drop(columns=['ID'])
# A bare `test_data.shape` in the middle of a cell displays nothing, so print it.
print(test_data.shape)

# Pass the model exactly the columns it was trained on, in the trained order.
# Predicting on every column of the test file fails whenever the file carries
# a column the model never saw. Models fitted without feature names expose no
# `feature_names_in_`; in that case the frame is used unchanged.
trained_columns = getattr(model, 'feature_names_in_', None)
if trained_columns is not None:
    test_data = test_data[list(trained_columns)]

# Predict and map the integer codes back to class names.
y_test = model.predict(test_data)
y_decoded = label_encoder.inverse_transform(y_test)
result = pd.DataFrame({
    'ID': ID,
    'class': y_decoded
})
result.to_csv("submission1.csv", index=False) #this file will appear under the output section of the right navbar. You need to submit this csv file

(2845, 28)

In [None]:
# Display the integer-encoded predictions that the submission cells stored in
# `y_test` via `model.predict(test_data)`.
y_test

array([1, 1, 1, ..., 5, 5, 5])

In [None]:

# Display the predicted class names, i.e. the result of
# `label_encoder.inverse_transform(...)` from the submission cells.
y_decoded

array(['forest', 'forest', 'forest', ..., 'water', 'water', 'water'],
      dtype=object)

In [None]:
# Pair every test ID with its decoded class label.
submission_columns = {'ID': ID, 'class': y_decoded}
result = pd.DataFrame(submission_columns)

In [None]:
# Write the submission without the index column. Per the original note, the
# CSV appears under the output section of the right navbar and is the file to submit.
result.to_csv(path_or_buf="submission1.csv", index=False)