In [35]:
import numpy as np
import pandas as pd


import os
for dirname, _, filenames in os.walk('/kaggle/input/model-data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/model-data/hacktest.csv
/kaggle/input/model-data/hacktrain.csv


In [36]:
import warnings

warnings.filterwarnings('ignore')

In [37]:
df = pd.read_csv("/kaggle/input/model-data/hacktrain.csv")

In [38]:
df.isnull().sum()

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64

In [39]:
df.fillna(df.mean(numeric_only=True), inplace=True) 
df.isnull().sum()

Unnamed: 0    0
ID            0
class         0
20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
dtype: int64

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

X = df.drop(columns=['Unnamed: 0', 'ID', 'class'])
y = df['class']


le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)
print(classification_report(y_val, y_pred, target_names=le.classes_))


df.drop(columns=['ID'], inplace=True)


label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])


X = df.drop(columns=['class'])
y = df['class']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10
)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_
))


              precision    recall  f1-score   support

        farm       0.70      0.44      0.54       161
      forest       0.90      0.98      0.94      1231
       grass       0.73      0.26      0.38        43
  impervious       0.86      0.76      0.80       141
     orchard       0.00      0.00      0.00         6
       water       0.65      0.61      0.63        18

    accuracy                           0.88      1600
   macro avg       0.64      0.51      0.55      1600
weighted avg       0.86      0.88      0.86      1600

              precision    recall  f1-score   support

        farm       0.76      0.68      0.72       168
      forest       0.96      0.99      0.97      1232
       grass       0.00      0.00      0.00        39
  impervious       0.64      0.84      0.73       134
     orchard       0.00      0.00      0.00         6
       water       0.00      0.00      0.00        21

    accuracy                           0.90      1600
   macro avg       0.39

In [41]:
test_data = pd.read_csv("/kaggle/input/model-data/hacktest.csv")
test_data.shape

(2845, 29)

In [42]:
ID=test_data['ID']
test_data.drop(['ID'],axis=1,inplace=True)

In [43]:
y_test = model.predict(test_data)

In [44]:
y_test

array([1, 1, 1, ..., 3, 3, 3])

In [45]:
y_decoded = label_encoder.inverse_transform(y_test)
y_decoded

array(['forest', 'forest', 'forest', ..., 'impervious', 'impervious',
       'impervious'], dtype=object)

In [46]:
result = pd.DataFrame({
    'ID': ID,
    'class': y_decoded
})

In [47]:
result

Unnamed: 0,ID,class
0,1,forest
1,2,forest
2,3,forest
3,4,forest
4,5,forest
...,...,...
2840,2841,impervious
2841,2842,impervious
2842,2843,impervious
2843,2844,impervious


In [48]:
result.to_csv("submission.csv", index=False)