In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [45]:
def visualize_classifier(classifier, X, y):
    # Define the minimum and maximum values for X and Y
    # that will be used in the mesh grid
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0

    # Define the step size to use in plotting the mesh grid 
    mesh_step_size = 0.01

    # Define the mesh grid of X and Y values
    x_vals, y_vals = np.meshgrid(np.arange(min_x, max_x, mesh_step_size), np.arange(min_y, max_y, mesh_step_size))

    # Run the classifier on the mesh grid
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])

    # Reshape the output array
    output = output.reshape(x_vals.shape)

    # Create a plot
    plt.figure()

    # Choose a color scheme for the plot 
    plt.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray)

    # Overlay the training points on the plot 
    plt.scatter(X[:, 0], X[:, 1], c=y, s=75, edgecolors='black', linewidth=1, cmap=plt.cm.Paired)

    # Specify the boundaries of the plot
    plt.xlim(x_vals.min(), x_vals.max())
    plt.ylim(y_vals.min(), y_vals.max())

    # Specify the ticks on the X and Y axes
    plt.xticks((np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0)))
    plt.yticks((np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0)))

    plt.show()

In [46]:
# 입력 데이터
train = pd.read_csv("C:\\Users\\customer\\Desktop\\Dacon\\train.csv")
test = pd.read_csv("C:\\Users\\customer\\Desktop\\Dacon\\test.csv")
sample_submission = pd.read_csv("C:\\Users\\customer\\Desktop\\Dacon\\sample_submission.csv")


In [47]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))

In [48]:
train_x = train.drop(columns=['type', 'type_num'], axis=1)
train_y = train['type_num']
test_x = test

In [49]:
x_train, x_test, y_train, y_test = train_test_split(train_x,train_y, test_size = 0.01)


In [70]:
params = {"n_estimators":100,  "max_depth":4, "random_state":0}

classifier = RandomForestClassifier(**params)
extra_classifier =ExtraTreesClassifier(**params)

classifier.fit(train_x,train_y)
y_train_pred = classifier.predict(train_x)
y_test_pred = classifier.predict(x_test)



In [72]:
print("\n"+ "#"*80)
print("\nClassifier Performance on Training Dataset\n")
print(classification_report(train_y,classifier.predict(train_x)))
print("#"*80 + "\n")


################################################################################

Classifier Performance on Training Dataset



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00      2160
           2       0.90      0.88      0.89      6506
           3       0.00      0.00      0.00       500
           4       0.69      0.65      0.67      2562
           5       0.65      0.95      0.77     14618
           6       0.69      0.32      0.44     13500
           7       0.79      0.97      0.87     37347
           8       0.00      0.00      0.00      4654
           9       0.77      0.81      0.79     49680
          10       0.00      0.00      0.00       127
          11       0.88      0.90      0.89     13750
          12       0.00      0.00      0.00      6580
          13       0.00      0.00      0.00        13
          14       0.79      0.25      0.37      7132
          15       0.00      0.00      0.00      3257
          16       0.76      0.98      0.85     14630
          17       0.00      0.00      0.00      1154
          18       0.00    

In [77]:
y_pred = classifier.predict_proba(test_x)

In [80]:
print(y_pred)

[[1.92005917e-04 1.68992603e-03 7.11054876e-03 ... 3.89242325e-03
  8.30914094e-04 4.74251211e-04]
 [2.14062284e-03 1.19916341e-04 4.96691692e-03 ... 4.66529135e-04
  6.87487352e-04 4.17023357e-02]
 [2.02586573e-03 3.88495192e-02 2.30321245e-04 ... 8.50854214e-03
  6.67470496e-05 7.37826744e-04]
 ...
 [5.00132477e-03 5.71607904e-04 1.65025283e-03 ... 5.63780049e-04
  3.44342054e-04 1.42916069e-01]
 [1.45089786e-04 0.00000000e+00 3.84444083e-04 ... 1.96251023e-06
  8.25314538e-05 1.93966368e-02]
 [5.54469868e-03 7.36188570e-04 0.00000000e+00 ... 4.47006962e-06
  1.64986124e-04 3.16332504e-03]]


In [79]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_ver1.csv', index=True)

ValueError: Shape of passed values is (10009, 19), indices imply (10009, 20)