In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file below it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from scipy.stats import linregress
import warnings
# Silence only forward-compatibility notices. A blanket 'ignore' would also hide
# ConvergenceWarning and numeric RuntimeWarning, which point at real modelling
# or data problems and should stay visible in the cell output.
warnings.filterwarnings('ignore', category=FutureWarning)

In [7]:
# Read the training and test CSVs from the competition input folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv',
        '/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv',
    )
)

In [8]:
# Show (rows, columns) of the training frame as a quick sanity check on the load.
train_df.shape

(8000, 30)

In [9]:
# Show (rows, columns) of the test frame for comparison with the training frame.
test_df.shape

(2845, 29)

In [10]:
# Preview the first rows to inspect the column layout and spot missing readings.
train_df.head()

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [11]:
# Collect the names of all columns whose name contains the literal marker '_N'
# (the dated NDVI reading columns), preserving their order in train_df.
ndvi_cols = train_df.columns[train_df.columns.str.contains('_N', regex=False)].tolist()

In [12]:
# Report the total number of missing NDVI cells in each split.
for missing_label, split_frame in (
    ("Missing NDVI values in Train:", train_df),
    ("Missing NDVI values in Test :", test_df),
):
    print(missing_label, split_frame[ndvi_cols].isna().to_numpy().sum())

Missing NDVI values in Train: 25040
Missing NDVI values in Test : 0


In [14]:
# Learn per-column medians from the training rows only, then push BOTH frames
# through the same fitted imputer so train and test share identical preprocessing.
imputer = SimpleImputer(strategy='median')
train_df[ndvi_cols] = imputer.fit_transform(train_df[ndvi_cols])
# Leaves values unchanged while the test file has no gaps, but prevents NaN
# features (and a failing predict call) if a test file does contain missing readings.
test_df[ndvi_cols] = imputer.transform(test_df[ndvi_cols])

In [15]:
def extract_ndvi_features(df, ndvi_columns):
    """Summarise each row's NDVI readings into six per-row statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``ndvi_columns``.
    ndvi_columns : list of str
        Names of the NDVI reading columns, in the order the trend is fitted over.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (with a fresh 0..n-1 index) and the columns
        ``ndvi_mean``, ``ndvi_std`` (numpy default, ddof=0), ``ndvi_min``,
        ``ndvi_max``, ``ndvi_range`` and ``ndvi_trend``.

    Notes
    -----
    ``ndvi_trend`` is the ordinary-least-squares slope of the readings against
    their column position 0, 1, 2, ...; dates are not parsed, so the sign and
    scale of the slope follow the order of ``ndvi_columns``. With fewer than
    two columns no slope exists and ``ndvi_trend`` is NaN. Missing readings are
    not skipped: a NaN in a row propagates into that row's statistics.
    """
    ndvi_values = df[ndvi_columns].values

    row_mean = ndvi_values.mean(axis=1)
    row_min = ndvi_values.min(axis=1)
    row_max = ndvi_values.max(axis=1)

    # Closed-form OLS slope for all rows at once:
    #   slope = sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)
    # x is the same for every row, so it is built and centred a single time.
    position = np.arange(len(ndvi_columns), dtype=float)
    centred_position = position - position.mean()
    position_spread = centred_position @ centred_position

    if position_spread > 0:
        centred_values = ndvi_values - row_mean[:, None]
        trend = centred_values @ centred_position / position_spread
    else:
        # A single reading per row has no defined slope.
        trend = np.full(len(ndvi_values), np.nan)

    return pd.DataFrame({
        'ndvi_mean': row_mean,
        'ndvi_std': ndvi_values.std(axis=1),
        'ndvi_min': row_min,
        'ndvi_max': row_max,
        'ndvi_range': row_max - row_min,
        'ndvi_trend': trend,
    })

# Run the same feature extraction over the training and test frames, in that order.
X_train, X_test = (
    extract_ndvi_features(split_frame, ndvi_cols)
    for split_frame in (train_df, test_df)
)

In [16]:
# Fit the encoder on the training labels, then turn those labels into integer codes.
# The fitted encoder is kept so predictions can be mapped back to class names later.
label_encoder = LabelEncoder().fit(train_df['class'])
y_train = label_encoder.transform(train_df['class'])

print("Classes found:", label_encoder.classes_)

Classes found: ['farm' 'forest' 'grass' 'impervious' 'orchard' 'water']


In [17]:
# Standardise the features, then fit a logistic-regression classifier.
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver a multiclass target is already
# fitted as a multinomial (softmax) model by default.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)),
])

In [18]:
# 5-fold cross-validated accuracy of the scaler + logistic-regression pipeline.
# NOTE(review): X_train is derived from train_df after it was median-imputed using
# all training rows, so each validation fold contributed to its own imputation
# values; the scores may be slightly optimistic. Confirm if that matters here.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("5-Fold Cross Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

5-Fold Cross Validation Accuracy Scores: [0.855    0.875625 0.865625 0.845625 0.8525  ]
Mean CV Accuracy: 0.858875


In [19]:
# Fit the pipeline on the full training set. cross_val_score works on clones of
# the estimator, so `pipeline` itself is still unfitted until this call.
pipeline.fit(X_train, y_train)

In [22]:
# Predict integer class codes for the test features, then map the codes back to
# the original class names with the encoder fitted on the training labels.
y_test_pred = pipeline.predict(X=X_test)
y_test_pred_labels = label_encoder.inverse_transform(y=y_test_pred)