In [4]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [5]:
def visualize_classifier(classifier, X, y):
    """Plot a classifier's predicted regions with the samples drawn on top.

    Parameters
    ----------
    classifier : object exposing ``predict``; it is called with an (n, 2) array
        of grid coordinates.
    X : 2-D array. Only columns 0 and 1 are read; they become the horizontal
        and vertical plot axes.
    y : per-sample values used to colour the scatter points.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    feat_a, feat_b = X[:, 0], X[:, 1]

    # Grid extent: the data range padded by one unit on every side.
    pad = 1.0
    x_lo, x_hi = feat_a.min() - pad, feat_a.max() + pad
    y_lo, y_hi = feat_b.min() - pad, feat_b.max() + pad

    # Spacing of the grid nodes on which the classifier is evaluated.
    step = 0.01
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))

    # Predict for every grid node, then fold the flat result back into the
    # grid's 2-D shape so it can be drawn as a colour mesh.
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    regions = classifier.predict(grid_points).reshape(grid_x.shape)

    plt.figure()

    # Background: predicted regions in grayscale.
    plt.pcolormesh(grid_x, grid_y, regions, cmap=plt.cm.gray)

    # Foreground: the supplied samples, coloured by y.
    plt.scatter(feat_a, feat_b, c=y, s=75, edgecolors='black', linewidth=1, cmap=plt.cm.Paired)

    # Clip the axes to the evaluated grid.
    plt.xlim(grid_x.min(), grid_x.max())
    plt.ylim(grid_y.min(), grid_y.max())

    # Unit-spaced ticks starting from the integer part of (min - 1).
    plt.xticks(np.arange(int(feat_a.min() - 1), int(feat_a.max() + 1), 1.0))
    plt.yticks(np.arange(int(feat_b.min() - 1), int(feat_b.max() + 1), 1.0))

    plt.show()

In [174]:
# Input data.
# The directory is configurable through the DACON_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used.
DATA_DIR = Path(os.environ.get("DACON_DATA_DIR", "C:\\Users\\customer\\Desktop\\Dacon"))

# train/test use their first CSV column as the index; sample_submission keeps
# its first column as a regular column.
train = pd.read_csv(DATA_DIR / "train.csv", index_col=0)
test = pd.read_csv(DATA_DIR / "test.csv", index_col=0)
sample_submission = pd.read_csv(DATA_DIR / "sample_submission.csv")


In [164]:
# Class-name <-> integer-label lookups, built from the probability columns of
# the sample submission. The 'id' column is skipped explicitly: it is a regular
# column whenever sample_submission is read without index_col=0, and including
# it would shift every label by one and add an extra, non-existent class.
class_columns = [column for column in sample_submission.columns if column != 'id']
column_number = {column: i for i, column in enumerate(class_columns)}
column_name = {i: column for column, i in column_number.items()}


def to_number(x, dic):
    """Return the value stored under key ``x`` in ``dic`` (KeyError if absent)."""
    return dic[x]


# Encode the string target as an integer label; a type that is missing from
# the lookup raises KeyError instead of silently becoming NaN.
train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))


In [165]:
# Show both lookup tables: name -> label first, then label -> name.
for lookup in (column_number, column_name):
    print(lookup)

{'STAR_WHITE_DWARF': 0, 'STAR_CATY_VAR': 1, 'STAR_BROWN_DWARF': 2, 'SERENDIPITY_RED': 3, 'REDDEN_STD': 4, 'STAR_BHB': 5, 'GALAXY': 6, 'SERENDIPITY_DISTANT': 7, 'QSO': 8, 'SKY': 9, 'STAR_RED_DWARF': 10, 'ROSAT_D': 11, 'STAR_PN': 12, 'SERENDIPITY_FIRST': 13, 'STAR_CARBON': 14, 'SPECTROPHOTO_STD': 15, 'STAR_SUB_DWARF': 16, 'SERENDIPITY_MANUAL': 17, 'SERENDIPITY_BLUE': 18}
{0: 'STAR_WHITE_DWARF', 1: 'STAR_CATY_VAR', 2: 'STAR_BROWN_DWARF', 3: 'SERENDIPITY_RED', 4: 'REDDEN_STD', 5: 'STAR_BHB', 6: 'GALAXY', 7: 'SERENDIPITY_DISTANT', 8: 'QSO', 9: 'SKY', 10: 'STAR_RED_DWARF', 11: 'ROSAT_D', 12: 'STAR_PN', 13: 'SERENDIPITY_FIRST', 14: 'STAR_CARBON', 15: 'SPECTROPHOTO_STD', 16: 'STAR_SUB_DWARF', 17: 'SERENDIPITY_MANUAL', 18: 'SERENDIPITY_BLUE'}


In [166]:
# Features: every training column except the raw label, its integer encoding
# and fiberID.
train_x = train.drop(
    columns=[
        'type',
        'type_num',
        'fiberID',
    ]
)

# Target: the integer-encoded class label.
train_y = train['type_num']

# Only fiberID is removed from the test frame.
test_x = test.drop(columns=['fiberID'])

In [167]:
# Collect, column by column, the row labels whose value is <= 10 or >= 35.
# Each column contributes two lists (low-side hits, then high-side hits), so
# drop_index ends up as a list of lists that a later cell flattens.
columns = train_x.columns.tolist()
drop_index = []
for feature in columns:
    too_low = train_x[feature] <= 10
    too_high = train_x[feature] >= 35
    drop_index.append(train_x[too_low].index.tolist())
    drop_index.append(train_x[too_high].index.tolist())

In [168]:
# Flatten the per-column lists into one list of unique row labels.
# A nested generator replaces sum(drop_index, []), which builds a new list on
# every addition. Entries that are already plain labels are passed through, so
# re-running this cell on an already-flat list no longer raises TypeError.
drop_index = list(dict.fromkeys(
    label
    for group in drop_index
    for label in (group if isinstance(group, list) else [group])
))

In [169]:
# Remove the flagged rows by index membership rather than DataFrame.drop:
# drop() raises KeyError once the labels are already gone, which made this
# cell fail when executed a second time. Filtering with isin keeps the same
# rows in the same order and is safe to re-run.
train_x = train_x.loc[~train_x.index.isin(drop_index)]
train_y = train_y.loc[~train_y.index.isin(drop_index)]

In [170]:
# Hyper-parameters shared by both tree ensembles.
params = dict(n_estimators=100, max_depth=4, random_state=0)

# The extra-trees model is constructed with the same settings but is not
# fitted in this cell; only the random forest is trained.
extra_classifier = ExtraTreesClassifier(**params)
classifier = RandomForestClassifier(**params).fit(train_x, train_y)

# Hard class predictions for the training rows and for the test rows.
y_train_pred = classifier.predict(train_x)
y_test_pred = classifier.predict(test_x)



In [171]:
# Report for the fitted RandomForestClassifier (`classifier`). The heading used
# to say "Extra Classifier", but extra_classifier is never fitted in the cells
# shown here and is not what is being evaluated.
print("\n"+ "#"*80)
print("\n Random Forest Classifier Performance on Training Dataset\n")
# y_train_pred already holds classifier.predict(train_x), so it is reused
# instead of predicting again. zero_division=0 keeps the 0.00 scores for
# classes that are never predicted without emitting UndefinedMetricWarning.
print(classification_report(train_y, y_train_pred, zero_division=0))
print("#"*80 + "\n")


################################################################################

 Extra Classifier Performance on Training Dataset



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2159
           1       0.82      0.92      0.87      6505
           2       0.00      0.00      0.00       498
           3       0.70      0.73      0.72      2544
           4       0.66      0.95      0.78     14613
           5       0.68      0.35      0.46     13493
           6       0.80      0.97      0.88     37030
           7       0.00      0.00      0.00      4650
           8       0.79      0.80      0.80     49555
           9       0.00      0.00      0.00       123
          10       0.90      0.91      0.90     13725
          11       0.00      0.00      0.00      6454
          12       0.00      0.00      0.00        12
          13       0.72      0.35      0.47      7094
          14       0.00      0.00      0.00      3253
          15       0.76      0.98      0.85     14629
          16       0.00      0.00      0.00      1154
          17       0.00    

In [172]:
# Per-class probabilities for every test row. Per the scikit-learn docs the
# columns follow classifier.classes_ (the labels seen during fit) — TODO
# confirm every class is still present in train_y after the outlier filter.
y_pred = classifier.predict_proba(test_x)


In [175]:
# predict_proba yields one column per label in classifier.classes_, so the
# column names are derived from the fitted labels instead of assuming that
# y_pred has exactly one column per entry of column_number.
predicted_columns = [column_name[label] for label in classifier.classes_]

# Final column order: the class columns of the lookup, never the 'id' key.
submission_columns = [column for column in column_number if column != 'id']

# Use the ids whether sample_submission keeps them as a column or as its index.
if 'id' in sample_submission.columns:
    submission_ids = sample_submission['id']
else:
    submission_ids = sample_submission.index

submission = (
    pd.DataFrame(data=y_pred, columns=predicted_columns, index=submission_ids)
    # A class that is absent from the fitted labels gets probability 0 rather
    # than causing a shape mismatch.
    .reindex(columns=submission_columns, fill_value=0.0)
)
print(submission)

        STAR_WHITE_DWARF  STAR_CATY_VAR  STAR_BROWN_DWARF  SERENDIPITY_RED  \
id                                                                           
199991          0.000433       0.006696      5.801280e-03         0.056025   
199992          0.001789       0.000090      3.792557e-03         0.026292   
199993          0.001891       0.036035      1.677775e-04         0.000225   
199994          0.017849       0.003911      4.315086e-05         0.000572   
199995          0.000395       0.000177      6.234793e-03         0.058923   
...                  ...            ...               ...              ...   
209995          0.053442       0.010890      3.021961e-05         0.000167   
209996          0.037194       0.010351      3.760238e-05         0.000184   
209997          0.006027       0.000741      2.753307e-04         0.003832   
209998          0.000095       0.000000      6.084555e-04         0.001965   
209999          0.005313       0.000912      4.671587e-07       

In [176]:
# Write the submission to disk; index=True keeps the frame's index (the ids)
# as the first CSV column.
submission.to_csv('submission_ver6.csv', index=True)