Importálások

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

Adatok beolvasása

In [2]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_CSV = "input.csv"
VALID_CSV = "validation.csv"

# train_df holds the rows used for fitting (including the "class" column that
# later cells use as the target); valid_df holds the rows to predict.
train_df = pd.read_csv(TRAIN_CSV)
valid_df = pd.read_csv(VALID_CSV)

Adatok előkészítése

In [3]:
# Separate the features from the target: every column except "class" is a
# feature, and "class" is the label to predict.
X = train_df.drop("class", axis=1)
y = train_df["class"]

# Integer-encode each object-dtype feature column with its own LabelEncoder and
# keep the fitted encoder so the validation data can reuse the same mapping.
encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        encoders[col] = le

# Code given to validation values that the training encoder never saw.
UNSEEN_CODE = -1

# Apply the training encoders to the validation frame. Only columns that are
# still object-dtype are touched, so columns that were already encoded by a
# previous run of this cell are skipped.
for col in valid_df.columns:
    if valid_df[col].dtype == 'object' and col in encoders:
        le = encoders[col]
        # LabelEncoder.transform raises ValueError for labels that are not in
        # classes_, so only rows with known labels are passed to it; all other
        # rows keep UNSEEN_CODE instead of aborting the notebook.
        seen = valid_df[col].isin(le.classes_).to_numpy()
        codes = np.full(len(valid_df), UNSEEN_CODE, dtype=np.int64)
        codes[seen] = le.transform(valid_df.loc[seen, col])
        n_unseen = int((~seen).sum())
        if n_unseen:
            # Make the fallback visible rather than silent.
            print(
                f"Figyelem: a(z) '{col}' oszlopban {n_unseen} ismeretlen kategória "
                f"található a validációs adatokban; kódjuk: {UNSEEN_CODE}"
            )
        valid_df[col] = codes

Tanító- és teszthalmaz

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Modellek

In [5]:
def build_pipeline(impute_strategy, classifier):
    """Return a three-step Pipeline: impute, standardize, classify.

    Parameters
    ----------
    impute_strategy : str
        Value passed to ``SimpleImputer(strategy=...)``.
    classifier : estimator
        Final estimator, registered under the step name ``'clf'`` so that
        grid keys of the form ``clf__<param>`` address it.
    """
    steps = [
        ('imputer', SimpleImputer(strategy=impute_strategy)),
        ('scaler', StandardScaler()),
        ('clf', classifier),
    ]
    return Pipeline(steps)


# Candidate models. Each entry pairs a pipeline ('pipeline') with the
# hyper-parameter grid ('params') that is searched for it.
models = {
    'KNN': {
        'pipeline': build_pipeline('median', KNeighborsClassifier()),
        'params': {
            'clf__n_neighbors': [3, 5, 7, 9],
            'clf__weights': ['uniform', 'distance'],
            'clf__p': [1, 2],
        },
    },
    'DecisionTree': {
        'pipeline': build_pipeline(
            'most_frequent', DecisionTreeClassifier(random_state=42)
        ),
        'params': {
            'clf__criterion': ['gini', 'entropy'],
            'clf__max_depth': [None, 5, 10, 20, 30],
            'clf__min_samples_split': [2, 5, 10, 20],
            'clf__min_samples_leaf': [1, 2, 4, 8],
        },
    },
    'RandomForest': {
        'pipeline': build_pipeline(
            'most_frequent', RandomForestClassifier(random_state=42)
        ),
        'params': {
            'clf__n_estimators': [100],
            'clf__max_depth': [None, 5, 10],
            'clf__min_samples_split': [2, 5],
            'clf__min_samples_leaf': [1, 2],
            'clf__bootstrap': [True, False],
        },
    },
}

GridSearchCV

In [6]:
# Running winner across all candidates: its name, the refitted best pipeline,
# and its mean cross-validated accuracy.
best_name, best_model, best_score = None, None, 0

for model_name, spec in models.items():
    print(f"\n== {model_name} modell tanítása ===")

    # Exhaustive search over this candidate's grid with 4-fold CV on the
    # training split, scored by accuracy.
    search = GridSearchCV(
        estimator=spec['pipeline'],
        param_grid=spec['params'],
        scoring='accuracy',
        cv=4,
        n_jobs=1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    cv_accuracy = search.best_score_
    print(f"{model_name} legjobb pontosság: {cv_accuracy:.4f}")
    print(f"Legjobb paraméterek: {search.best_params_}")

    # Strict comparison: on a tie the earlier candidate is kept.
    if cv_accuracy > best_score:
        best_name, best_model, best_score = (
            model_name,
            search.best_estimator_,
            cv_accuracy,
        )


== KNN modell tanítása ===
Fitting 4 folds for each of 16 candidates, totalling 64 fits
KNN legjobb pontosság: 0.9306
Legjobb paraméterek: {'clf__n_neighbors': 7, 'clf__p': 1, 'clf__weights': 'distance'}

== DecisionTree modell tanítása ===
Fitting 4 folds for each of 160 candidates, totalling 640 fits
DecisionTree legjobb pontosság: 0.9204
Legjobb paraméterek: {'clf__criterion': 'entropy', 'clf__max_depth': 30, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}

== RandomForest modell tanítása ===
Fitting 4 folds for each of 24 candidates, totalling 96 fits
RandomForest legjobb pontosság: 0.9499
Legjobb paraméterek: {'clf__bootstrap': True, 'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5, 'clf__n_estimators': 100}


Legjobb modell kiértékelése

In [7]:
# Summarise the selected model: its name and cross-validated accuracy first,
# then its accuracy on the held-out test split.
print(
    "\n-----------------------",
    f"Legjobb modell: {best_name}",
    f"CV pontosság: {best_score:.4f}",
    sep="\n",
)
test_acc = best_model.score(X_test, y_test)
print(
    f"Teszthalmaz pontosság: {test_acc:.4f}",
    "=======================\n",
    sep="\n",
)


-----------------------
Legjobb modell: RandomForest
CV pontosság: 0.9499
Teszthalmaz pontosság: 0.9552



Validációs adatok előrejelzése

In [8]:
# Predict with the selected pipeline on exactly the feature columns it was
# trained on, in training order. Selecting by X.columns ignores any extra
# columns in the validation frame and raises a KeyError naming a missing one,
# instead of relying on validation.csv matching the training layout.
val_pred = best_model.predict(valid_df[X.columns])

Mentés

In [9]:
# Write the predictions to pred.txt, one per line; "%s" formats each
# prediction via its string representation.
np.savetxt(
    fname='pred.txt',
    X=val_pred,
    fmt="%s",
)