In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

In [8]:
# read the data from /input/train.csv
data = pd.read_csv("train.csv")

In [11]:
data.head(20)

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,0,133.171875,59.716081,0.043133,-0.703383,54.917224,70.084438,0.749798,-0.649512,0
1,1,87.09375,36.257973,0.435469,2.266057,3.417224,21.865069,7.03933,52.686251,0
2,2,112.640625,39.818393,0.379639,0.922306,2.730769,15.68969,8.193471,85.649785,0
3,3,120.679688,45.918448,-0.09849,0.011775,2.696488,20.954662,8.183874,70.332899,0
4,4,134.070312,57.720107,-0.107772,-0.573335,1.10786,11.255051,16.107748,308.753765,0
5,5,131.632812,52.56321,-0.075253,-0.495825,2.194816,15.537425,9.033439,97.032406,0
6,6,110.9375,41.556955,0.312844,0.559022,1.965719,17.191469,10.396774,118.72427,0
7,7,120.203125,49.927902,-0.08999,-0.321367,3.2801,18.37684,8.190561,77.917237,0
8,8,112.414062,46.939866,0.282551,0.151784,3.336957,21.929529,7.69333,65.186279,0
9,9,99.859375,48.089189,0.69371,0.281663,3.414716,24.18191,7.958684,65.084575,0


In [12]:
data.describe()

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
count,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0,117564.0
mean,58781.5,111.2483,46.713535,0.503498,1.886385,11.962921,26.190678,8.037488,93.881076,0.093285
std,33937.947861,24.906474,6.102941,1.127093,6.515466,26.719946,20.041937,3.84098,79.96211,0.290833
min,0.0,6.054688,24.783273,-1.730782,-1.791886,0.213211,7.370432,-2.597872,-1.976976,0.0
25%,29390.75,104.546875,43.44339,0.049761,-0.188956,2.090301,14.955405,6.742911,49.409136,0.0
50%,58781.5,116.664062,47.478932,0.186498,0.09172,2.808528,18.164924,8.442883,83.421375,0.0
75%,88172.25,126.296875,50.862718,0.39562,0.691613,4.12291,24.732218,10.003237,122.09329,0.0
max,117563.0,189.367188,93.602933,7.879628,65.385974,217.371238,109.890785,34.539844,1191.000837,1.0


In [163]:
X = data.drop(['Class', 'id'], axis=1)
y = data['Class']

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [184]:
from sklearn.preprocessing import StandardScaler

ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)

ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)

X = ss_train.fit_transform(X)

In [165]:
model = RandomForestClassifier(n_estimators=128, max_depth=3, random_state=42)

In [166]:
# fit the model
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, n_estimators=128, random_state=42)

In [167]:
test_probs = model.predict_proba(X_test)

In [168]:
test_loss = log_loss(y_test, test_probs)

In [169]:
test_loss

0.04115376328504106

In [170]:
train_loss = log_loss(y_train, model.predict_proba(X_train))

In [171]:
train_loss

0.0401390264662757

In [69]:
models = {}


# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

In [70]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

accuracy, precision, recall, logloss = {}, {}, {}, {}

for key in models.keys():
    
    # Fit the classifier
    models[key].fit(X_train, y_train)
    
    # Make predictions
    predictions = models[key].predict(X_test)
    
    # Calculate metrics
    accuracy[key] = accuracy_score(predictions, y_test)
    precision[key] = precision_score(predictions, y_test)
    recall[key] = recall_score(predictions, y_test)
    logloss[key] = log_loss(y_test, models[key].predict_proba(X_test))

In [73]:
df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall', 'Log Loss'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()
df_model['Log Loss'] = logloss.values()

df_model

Unnamed: 0,Accuracy,Precision,Recall,Log Loss
Logistic Regression,0.989538,0.915262,0.971,0.039385
Decision Trees,0.983498,0.908884,0.91388,0.569942
Random Forest,0.990516,0.926651,0.97042,0.082107
Naive Bayes,0.962361,0.933485,0.734935,0.319941
K-Nearest Neighbor,0.989623,0.918907,0.968315,0.204199


## XGBoost

In [75]:
import xgboost as xgb

In [172]:
model = xgb.XGBClassifier(n_estimators=1024, max_depth=3, learning_rate=0.02, random_state=42)

In [173]:
# fit the model
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=1024,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

In [174]:
# make predictions
test_probs = model.predict_proba(X_test)

# calculate log loss
test_loss = log_loss(y_test, test_probs)
test_loss

0.03327405758795012

In [175]:
preds = test_probs[:, 1]

In [176]:
# make training predictions
train_probs = model.predict_proba(X_train)

# calculate log loss
train_loss = log_loss(y_train, train_probs)
train_loss

0.026254011978710447

In [89]:
# read the data from sample_submission.csv
submission = pd.read_csv("sample_submission.csv")

In [90]:
submission.head()

Unnamed: 0,id,Class
0,117564,0.5
1,117565,0.5
2,117566,0.5
3,117567,0.5
4,117568,0.5


In [159]:
# Save test predictions to file
submission = pd.DataFrame({'id': X_test.id,
                       'Class': preds})
submission.to_csv('submission.csv', index=False)

AttributeError: 'numpy.ndarray' object has no attribute 'id'

In [154]:
type(X_test)

numpy.ndarray

In [155]:
# read the real test data from test.csv
test_data = pd.read_csv("test.csv")

In [156]:
test_data.head()

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve
0,117564,140.046875,54.5078,0.058862,-0.567263,2.337793,14.868335,9.59176,117.988781
1,117565,107.828125,51.578965,0.284368,-0.33843,1.574415,12.501437,11.694968,182.704822
2,117566,135.0625,49.812343,-0.087784,-0.094341,3.576923,21.243336,7.252386,59.021499
3,117567,112.8125,41.926647,0.519921,1.287762,6.669732,29.013153,5.097661,27.10524
4,117568,96.210938,35.32262,0.481286,2.44308,2.218227,17.041064,9.766006,117.131775


In [180]:
X_testtest = test_data.drop(['id'], axis=1)

In [181]:
ID_test = test_data.id

In [182]:
X_testtest = ss_train.fit_transform(X_testtest)

In [185]:
# final fit with entire training set
model.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=1024,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

In [193]:
preds = model.predict_proba(X_testtest)

In [194]:
preds=  preds[:,1]

In [195]:
# Save test predictions to file
submission = pd.DataFrame({'id': ID_test,
                       'Class': preds})
submission.to_csv('submission.csv', index=False)