In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pycaret.classification import *

In [3]:
train  = pd.read_csv('competition_data/train.csv')
test  = pd.read_csv('competition_data/test.csv')
submission = pd.read_csv('competition_data/sample_submission.csv')

In [4]:
print(train.shape)
print(test.shape)
print(submission.shape)

(15000, 70)
(35452, 69)
(35452, 2)


In [5]:
clf = setup(data = train, target = 'nerdiness')

Unnamed: 0,Description,Value
0,session_id,6296
1,Target,nerdiness
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(15000, 70)"
5,Missing Values,True
6,Numeric Features,5
7,Categorical Features,64
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7705,0.8582,0.8303,0.7724,0.8002,0.5313,0.5334,0.842
rf,Random Forest Classifier,0.7678,0.855,0.8376,0.7654,0.7997,0.5248,0.5281,0.575
lightgbm,Light Gradient Boosting Machine,0.7487,0.8208,0.8145,0.7523,0.782,0.4865,0.4889,0.17
gbc,Gradient Boosting Classifier,0.7327,0.8029,0.7992,0.7392,0.768,0.454,0.456,1.088
lda,Linear Discriminant Analysis,0.7234,0.7903,0.7846,0.7341,0.7584,0.4357,0.4372,0.492
ada,Ada Boost Classifier,0.7167,0.7878,0.7777,0.7289,0.7524,0.4222,0.4236,0.282
lr,Logistic Regression,0.6492,0.6899,0.847,0.6478,0.7287,0.2569,0.263,0.826
dt,Decision Tree Classifier,0.6891,0.6861,0.7144,0.7215,0.7178,0.3717,0.3719,0.139
nb,Naive Bayes,0.4687,0.6525,0.0914,0.6393,0.0783,0.0291,0.0363,0.061
knn,K Neighbors Classifier,0.5832,0.5992,0.674,0.6122,0.6415,0.1464,0.1474,0.412


In [7]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7652,0.8472,0.8201,0.7704,0.7945,0.5214,0.5228
1,0.7771,0.8634,0.8442,0.7737,0.8074,0.5442,0.5471
2,0.7786,0.8656,0.8237,0.7865,0.8047,0.5494,0.5502
3,0.7567,0.8445,0.8383,0.7512,0.7924,0.5006,0.5051
4,0.7594,0.8471,0.8339,0.7564,0.7933,0.5072,0.5106
Mean,0.7674,0.8536,0.8321,0.7676,0.7985,0.5246,0.5271
Std,0.009,0.009,0.009,0.0126,0.0063,0.0194,0.0185


In [8]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7832,0.8709,0.8366,0.7855,0.8103,0.5579,0.5594


In [9]:
final_model = finalize_model(blended)

In [10]:
predictions = predict_model(final_model, data=test)

In [11]:
predictions

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,age,hand,religion,orientation,voted,married,familysize,ASD,Label,Score
0,0,4.0,4.0,3.0,5.0,5.0,5.0,3.0,5.0,4.0,...,19,1.0,4.0,4.0,1.0,1.0,3.0,2.0,0,0.7105
1,1,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,...,33,1.0,1.0,5.0,2.0,1.0,5.0,2.0,1,0.7997
2,2,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,...,13,1.0,4.0,5.0,2.0,1.0,3.0,2.0,1,0.8412
3,3,5.0,4.0,3.0,4.0,5.0,4.0,5.0,4.0,4.0,...,28,1.0,2.0,2.0,2.0,1.0,3.0,2.0,1,0.6562
4,4,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,5.0,...,15,1.0,4.0,5.0,2.0,1.0,2.0,2.0,1,0.8441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35447,35447,4.0,5.0,5.0,3.0,3.0,4.0,5.0,5.0,4.0,...,16,3.0,10.0,4.0,2.0,1.0,3.0,2.0,1,0.8101
35448,35448,5.0,5.0,5.0,5.0,5.0,4.0,5.0,3.0,5.0,...,16,3.0,2.0,5.0,2.0,1.0,2.0,2.0,1,0.8117
35449,35449,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,31,1.0,1.0,5.0,1.0,1.0,2.0,2.0,1,0.9192
35450,35450,5.0,5.0,4.0,5.0,5.0,1.0,5.0,1.0,5.0,...,19,1.0,12.0,1.0,2.0,1.0,,2.0,0,0.7322


In [12]:
submission['nerdiness'] = predictions['Score']

In [13]:
submission.to_csv('submission_vimlab.csv', index = False)

# 모델링 

In [None]:
train = train.drop(['index', 'country'],axis = 1)
test =test.drop(['index', 'country'],axis = 1)

In [None]:
#train을 target과 feature로 나눠줍니다.
train_x=train.drop(['nerdiness'], axis=1)
train_y=train['nerdiness']

In [None]:
lgbm_clf = LGBMClassifier(
            n_estimators=1000
        )
lgbm_clf.fit(train_x, train_y)

lgbm_pred = lgbm_clf.predict(test)

# 제출 파일 생성

In [None]:
submission = pd.read_csv('competition_data/sample_submission.csv')

submission

In [None]:
submission["nerdiness"] = lgbm_pred

In [None]:
submission

In [None]:
submission.to_csv("baseline.csv", index = False)