In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#ensemble methods
from sklearn.ensemble import HistGradientBoostingClassifier

#cross fold validation, optimizing for roc auc (my second-favorite metric)
from sklearn.model_selection import cross_val_score
from sklearn import metrics

#all available files
import os
# Walk the Kaggle input directory lazily and print the full path of every
# file found, one per line, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/playground-series-s3e23/sample_submission.csv
/kaggle/input/playground-series-s3e23/train.csv
/kaggle/input/playground-series-s3e23/test.csv


In [2]:
# Load the training data.
# The `defects` label is parsed as text and converted to an integer flag
# while reading: the string 'True' becomes 1, anything else becomes 0.
def _defect_to_int(cell):
    """Return 1 when the raw CSV cell is exactly 'True', otherwise 0."""
    return int(cell == 'True')


raw_df = pd.read_csv(
    '/kaggle/input/playground-series-s3e23/train.csv',
    header=0,
    converters={'defects': _defect_to_int},
)

# Keep the untouched frame in `raw_df`; `df` uses the `id` column as its index.
df = raw_df.set_index('id')
df.head()

Unnamed: 0_level_0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,5448.79,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,0
1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,936.71,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,0
2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,1754.01,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,0
3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,473.66,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,1
4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,365.67,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,0


In [3]:
# Feature engineering

# Hunch from the author: defect risk grows slowly with complexity (log-like
# rather than exponential). The damping transform actually applied here is a
# square root; each new column is named after its source with an 'rt' suffix.
for metric in ('v(g)', 'ev(g)', 'iv(g)'):
    df[metric + 'rt'] = df[metric] ** 0.5

# Separate the target column from the feature matrix.
x = df.drop(columns=['defects'])
y = df['defects']

In [4]:
# Fit the model on the full training set; this fitted `clf` is the one used
# later for `predict_proba` on the test data.
#
# - `max_iter` is kept as an (effectively unbounded) upper limit, so training
#   must be terminated by early stopping. It is requested explicitly instead
#   of relying on the 'auto' default, which only turns it on for larger
#   training sets.
# - `random_state` pins the randomness of the estimator (including the
#   internal train/validation split used by early stopping) so that
#   Restart & Run All reproduces the same model and scores.
clf = HistGradientBoostingClassifier(
    max_iter=10_000_000_000,
    learning_rate=0.5,
    early_stopping=True,
    random_state=42,
).fit(x, y)

In [5]:
# Score the classifier with 5-fold cross-validation.
# The metric is ROC AUC (scoring='roc_auc'), so the printed label says so;
# it was previously mislabelled as "Accuracy".
# cross_val_score fits a fresh clone of `clf` on each fold, so the already
# fitted estimator is not reused or modified by this cell.
scores = cross_val_score(clf, x, y, cv=5, scoring='roc_auc')
print(f"ROC AUC: {scores.mean()} \nStdDev: {scores.std()}")

Accuracy: 0.7861685188873909 
StdDev: 0.0025524152310859756


In [6]:
# Build the submission file: load the test set, add the same square-root
# features that were added to the training frame, and predict the
# probability of a defect for every row.

# No converter for `defects` here: the test features are passed to a model
# that was trained without that column, so the file cannot contain it.
test_df = pd.read_csv(
    '/kaggle/input/playground-series-s3e23/test.csv',
    header=0,
)

# Same derived columns (and same order) as in the training feature cell.
for metric in ('v(g)', 'ev(g)', 'iv(g)'):
    test_df[metric + 'rt'] = test_df[metric] ** 0.5

# `id` is an identifier, not a feature (it was the index of the training frame).
x_test = test_df.drop(columns=['id'])

# predict_proba returns one column per class; look up the column belonging
# to the positive label (1) instead of assuming its position.
preds = clf.predict_proba(x_test)
positive_col = list(clf.classes_).index(1)
pred_fails = preds[:, positive_col]

submission = pd.DataFrame(
    {'id': test_df['id'], 'defects': pred_fails},
    columns=['id', 'defects'],
)
submission.to_csv('submission.csv', index=False)