# NDVI-Based Land Cover Classification Hackathon - Final Submission
Ashish Pandey

Roll No.-244159002

ashishpandey@iitg.ac.in

In [158]:
# Importing Required Libraries

import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [159]:
# Loading data 
train_df = pd.read_csv('hacktrain.csv')
test_df = pd.read_csv('hacktest.csv')

In [160]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [161]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  7440 non-null   float64
 4   20150602_N  6800 non-null   float64
 5   20150517_N  7200 non-null   float64
 6   20150501_N  7040 non-null   float64
 7   20150415_N  7520 non-null   float64
 8   20150330_N  6880 non-null   float64
 9   20150314_N  7280 non-null   float64
 10  20150226_N  6640 non-null   float64
 11  20150210_N  7360 non-null   float64
 12  20150125_N  6960 non-null   float64
 13  20150109_N  7120 non-null   float64
 14  20141117_N  6720 non-null   float64
 15  20141101_N  7600 non-null   float64
 16  20141016_N  6560 non-null   float64
 17  20140930_N  7200 non-null   float64
 18  20140813_N  7440 non-null   float64
 19  20140626_N  6400 non-null  

In [162]:
# Checking and filling null values 

train_df.isnull().sum()

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64

In [163]:
train_df.fillna(df.mean(numeric_only=True), inplace=True)
df.isnull().sum()

Unnamed: 0    0
ID            0
20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
dtype: int64

In [164]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  8000 non-null   float64
 4   20150602_N  8000 non-null   float64
 5   20150517_N  8000 non-null   float64
 6   20150501_N  8000 non-null   float64
 7   20150415_N  8000 non-null   float64
 8   20150330_N  8000 non-null   float64
 9   20150314_N  8000 non-null   float64
 10  20150226_N  8000 non-null   float64
 11  20150210_N  8000 non-null   float64
 12  20150125_N  8000 non-null   float64
 13  20150109_N  8000 non-null   float64
 14  20141117_N  8000 non-null   float64
 15  20141101_N  8000 non-null   float64
 16  20141016_N  8000 non-null   float64
 17  20140930_N  8000 non-null   float64
 18  20140813_N  8000 non-null   float64
 19  20140626_N  8000 non-null  

In [165]:
# Converting class from object to numerical value

le = LabelEncoder()
train_df["class"] = le.fit_transform(train_df["class"])

In [166]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   int64  
 3   20150720_N  8000 non-null   float64
 4   20150602_N  8000 non-null   float64
 5   20150517_N  8000 non-null   float64
 6   20150501_N  8000 non-null   float64
 7   20150415_N  8000 non-null   float64
 8   20150330_N  8000 non-null   float64
 9   20150314_N  8000 non-null   float64
 10  20150226_N  8000 non-null   float64
 11  20150210_N  8000 non-null   float64
 12  20150125_N  8000 non-null   float64
 13  20150109_N  8000 non-null   float64
 14  20141117_N  8000 non-null   float64
 15  20141101_N  8000 non-null   float64
 16  20141016_N  8000 non-null   float64
 17  20140930_N  8000 non-null   float64
 18  20140813_N  8000 non-null   float64
 19  20140626_N  8000 non-null  

In [167]:
# Dropping unreleated / not useful data 
train_df.drop(columns=["ID"], inplace=True)

In [168]:
train_df.head(2)

Unnamed: 0.1,Unnamed: 0,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,5,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,4547.764233,...,4372.211461,-1043.16,-1942.49,267.138,2004.551484,2348.475869,211.328,-2203.02,-1180.19,433.906
1,1,5,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,-1670.59,...,4372.211461,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075


In [169]:
# Setup random seed 
np.random.seed(42)

In [170]:
# Separate features and target

x = train_df.drop("class", axis=1)
y = train_df["class"]

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [171]:
# Feature scaling

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [172]:
# Instentiate Logestic Regression

model = LogisticRegression(solver='saga', max_iter=1000)
model.fit(x_train_scaled, y_train)

In [173]:
# Evaluating model
model.score(x_test_scaled, y_test)

  ret = a @ b
  ret = a @ b
  ret = a @ b


0.96375

In [174]:
# Make predictions on the test set

y_pred = model.predict(x_test_scaled)
y_pred

  ret = a @ b
  ret = a @ b
  ret = a @ b


array([1, 1, 1, ..., 1, 1, 0], shape=(1600,))

In [176]:
print("Classification Report:\n", classification_report(
    y_test, 
    y_pred, 
    labels=list(range(len(le.classes_))), 
    target_names=le.classes_
))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

        farm       0.86      0.90      0.88       161
      forest       0.99      1.00      0.99      1231
       grass       0.95      0.81      0.88        43
  impervious       0.88      0.84      0.86       141
     orchard       0.33      0.17      0.22         6
       water       0.93      0.78      0.85        18

    accuracy                           0.96      1600
   macro avg       0.82      0.75      0.78      1600
weighted avg       0.96      0.96      0.96      1600

Confusion Matrix:
 [[ 145    3    0   12    1    0]
 [   2 1228    0    0    0    1]
 [   3    1   35    4    0    0]
 [  15    4    2  119    1    0]
 [   4    0    0    1    1    0]
 [   0    4    0    0    0   14]]


In [183]:
# Making predictions on test data 
test_data = pd.read_csv("hacktest.csv")
test_data.head()

Unnamed: 0.1,Unnamed: 0,ID,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,7466.42,413.162,5761.0,5625.45,489.403,3923.84,3097.11,6766.42,...,801.184,927.115,4704.14,6378.42,340.949,2695.57,527.268,4736.75,601.843,6639.76
1,1,2,7235.26,6037.35,1027.56,6085.14,1618.05,6668.54,2513.99,1051.69,...,5533.47,5103.04,5216.12,4885.27,4366.79,1234.14,3298.11,6942.68,1070.44,842.101
2,2,3,7425.08,6969.98,1177.94,7408.93,861.061,7644.43,814.458,1504.29,...,1981.39,6204.54,7021.69,5704.41,4897.45,1789.99,2206.1,6928.93,1036.56,831.441
3,3,4,7119.12,1731.62,6311.93,6441.61,465.979,7128.42,1649.12,6935.22,...,959.344,5794.15,1045.57,5572.9,586.287,685.906,1287.0,6734.72,824.584,6883.61
4,4,5,7519.55,8130.26,1482.54,7879.53,1001.21,7937.6,4122.53,1094.51,...,7636.07,6996.76,7413.43,4596.13,4511.7,1413.52,3283.94,7937.68,1857.8,1336.92


In [185]:
# Save ID column separately
id = test_data['ID']

# Drop ID column from features
test_data.drop(['ID'], axis=1, inplace=True)

In [186]:
# Scale test data using the previously fitted scaler
test_data_scaled = scaler.transform(test_data)

In [187]:
# Predict using trained model
y_test = model.predict(test_data_scaled)

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [188]:
# Decode numeric predictions back to original class labels
y_decoded = le.inverse_transform(y_test)

In [190]:

# Create submission dataframe
result = pd.DataFrame({
    'ID': id,
    'class': y_decoded
})

In [191]:
result

Unnamed: 0,ID,class
0,1,forest
1,2,forest
2,3,forest
3,4,forest
4,5,forest
...,...,...
2840,2841,water
2841,2842,water
2842,2843,water
2843,2844,water


In [192]:
# Save to CSV
result.to_csv("submission.csv", index=False)

In [193]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2845 entries, 0 to 2844
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      2845 non-null   int64 
 1   class   2845 non-null   object
dtypes: int64(1), object(1)
memory usage: 44.6+ KB
