In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_labels = train_data.iloc[:, -1]
train_labelss = train_data["category"]
train_data = train_data.iloc[:, :-1]

In [4]:
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)

In [5]:
outlier_detector = IsolationForest()
outlier_detector.fit(train_data)
outlier_mask = outlier_detector.predict(train_data)>0

In [6]:
clusterer = KMeans(n_clusters=20)
clusterer.fit(train_data[outlier_mask])
train_clusters = clusterer.predict(train_data)



In [7]:
pca = PCA(n_components=100)
train_data_pca = pca.fit_transform(train_data[outlier_mask])
test_data_pca = pca.transform(test_data.iloc[:, :])

In [8]:
classifier = RandomForestClassifier()
classifier.fit(train_data[outlier_mask], train_labels[outlier_mask], sample_weight=train_clusters[outlier_mask])


In [9]:
test_clusters = clusterer.predict(test_data)
test_predictions = classifier.predict(test_data)

In [10]:
logreg_clf = LogisticRegression()
ensemble_clf = VotingClassifier(
    estimators=[('logreg', logreg_clf), ('rf', classifier)],
    voting='soft')

In [11]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_scores = []

X_train = train_data.drop(columns=["ID"])

for train, value in kf.split(X_train):
    foldX = X_train.iloc[train]
    foldY = train_labelss.iloc[train]
    
    valFoldX = X_train.iloc[value]
    valFoldY = train_labelss.iloc[value]
    
    ensemble_clf.fit(foldX, foldY)
    
    y_preds = ensemble_clf.predict(valFoldX)
    kscore = accuracy_score(valFoldY, y_preds)
    kf_scores.append(kscore)
    
print("Cross-validation accuracy: {:.2f}%".format(np.mean(kf_scores) * 100))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation accuracy: 76.81%


In [12]:
# X_train = train_data.drop(columns=["ID"])
ensemble_clf.fit(X_train, train_labels)
X_test = test_data.drop(columns=["ID"])
y_test_pred = ensemble_clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
predicted_labels = label_encoder.inverse_transform(y_test_pred)

In [14]:
predicted_df = pd.DataFrame({'ID': test_data['ID'], 'category': predicted_labels})
predicted_df.to_csv('submission.csv', index=False) 