In [2]:
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [3]:
# Load the competition data (train.csv) from the local input folder
competition_data = pd.read_csv(filepath_or_buffer='./input/train.csv')

In [4]:
# Separate the label from the predictors: 'Class' is the target, and the
# 'id' column is removed from the features together with it.
y = competition_data['Class']
X = competition_data.drop(columns=['id', 'Class'])

In [5]:
# Hold out 20% of the rows as the test set, then take a further 20% of the
# remaining rows as the validation set (64% / 16% / 20% overall).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

## Merging with external data

In [6]:
# Load the external dataset (pulsar.csv) that gets merged with the competition data
original_data = pd.read_csv(filepath_or_buffer='./input/pulsar.csv')

In [7]:
# Preview the first five rows of the competition data
competition_data.head(n=5)

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,0,133.171875,59.716081,0.043133,-0.703383,54.917224,70.084438,0.749798,-0.649512,0
1,1,87.09375,36.257973,0.435469,2.266057,3.417224,21.865069,7.03933,52.686251,0
2,2,112.640625,39.818393,0.379639,0.922306,2.730769,15.68969,8.193471,85.649785,0
3,3,120.679688,45.918448,-0.09849,0.011775,2.696488,20.954662,8.183874,70.332899,0
4,4,134.070312,57.720107,-0.107772,-0.573335,1.10786,11.255051,16.107748,308.753765,0


In [8]:
# Preview the first five rows of the external data
original_data.head(n=5)

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [9]:
# Tag every competition row with source = 0 so its origin is still known after the merge
competition_data['source'] = 0

In [10]:
# Check that the new 'source' column is present
competition_data.head(n=5)

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class,source
0,0,133.171875,59.716081,0.043133,-0.703383,54.917224,70.084438,0.749798,-0.649512,0,0
1,1,87.09375,36.257973,0.435469,2.266057,3.417224,21.865069,7.03933,52.686251,0,0
2,2,112.640625,39.818393,0.379639,0.922306,2.730769,15.68969,8.193471,85.649785,0,0
3,3,120.679688,45.918448,-0.09849,0.011775,2.696488,20.954662,8.183874,70.332899,0,0
4,4,134.070312,57.720107,-0.107772,-0.573335,1.10786,11.255051,16.107748,308.753765,0,0


In [11]:
# Tag every external row with source = 1 so its origin is still known after the merge
original_data['source'] = 1

In [12]:
# Remove the 'id' column before concatenating with the external data.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'id' has already been dropped in place.
competition_data.drop(columns='id', inplace=True, errors='ignore')

In [13]:
# Stack the competition rows on top of the external rows and renumber the index
merged_data = pd.concat(
    objs=[competition_data, original_data],
    axis=0,
    ignore_index=True,
)

In [14]:
# First five rows of the merged frame
merged_data.head(n=5)

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class,source
0,133.171875,59.716081,0.043133,-0.703383,54.917224,70.084438,0.749798,-0.649512,0,0
1,87.09375,36.257973,0.435469,2.266057,3.417224,21.865069,7.03933,52.686251,0,0
2,112.640625,39.818393,0.379639,0.922306,2.730769,15.68969,8.193471,85.649785,0,0
3,120.679688,45.918448,-0.09849,0.011775,2.696488,20.954662,8.183874,70.332899,0,0
4,134.070312,57.720107,-0.107772,-0.573335,1.10786,11.255051,16.107748,308.753765,0,0


In [15]:
# Last five rows of the merged frame
merged_data.tail(n=5)

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class,source
135457,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.45026,285.931022,0,1
135458,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0,1
135459,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872,2.499517,4.595173,0,1
135460,114.507812,53.9024,0.201161,-0.024789,1.946488,13.381731,10.007967,134.23891,0,1
135461,57.0625,85.79734,1.406391,0.08952,188.30602,64.712562,-1.597527,1.429475,0,1


In [16]:
# Target and features for the merged frame. Only 'Class' is removed, so the
# 'source' indicator column stays in X_merged as a feature.
y_merged = merged_data['Class']
X_merged = merged_data.drop(columns=['Class'])

In [17]:
# Same two-stage split as for the competition-only data: 20% test first,
# then 20% of the remainder as validation.
X_train_merged, X_test_merged, y_train_merged, y_test_merged = train_test_split(
    X_merged,
    y_merged,
    test_size=0.2,
    random_state=42,
)
X_train_merged, X_val_merged, y_train_merged, y_val_merged = train_test_split(
    X_train_merged,
    y_train_merged,
    test_size=0.2,
    random_state=42,
)

## Scaling

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
SS = StandardScaler()

# Fit the scaler on the training split only, so the validation and test
# splits never influence the scaling statistics.
SS.fit(X_train)

# Store the scaled copies under their own *_SS names instead of overwriting
# X_train / X_val / X_test. The baseline comparison needs the unscaled splits,
# the "Scaled dataset" cell reads X_train_SS / X_test_SS, and keeping the
# inputs untouched makes this cell safe to re-run.
X_train_SS = SS.transform(X_train)
X_val_SS = SS.transform(X_val)
X_test_SS = SS.transform(X_test)

## PCA

In [2]:
from sklearn.decomposition import PCA

In [15]:
# Principal component analysis on the training features.
# PCA is driven by variance, so the inputs are standardised first. The scaler
# is fitted here, on the training split, rather than relying on X_train having
# been scaled in place by an earlier cell. Re-standardising data that is
# already standardised is effectively a no-op, so this cell gives the same
# components whichever form X_train is in.
pca_scaler = StandardScaler().fit(X_train)
X_train_std = pca_scaler.transform(X_train)

pca = PCA(n_components=5)
pca.fit(X_train_std)

# Project every split with the scaler and PCA fitted on the training data only.
X_train_pca = pca.transform(X_train_std)
X_val_pca = pca.transform(pca_scaler.transform(X_val))
X_test_pca = pca.transform(pca_scaler.transform(X_test))

## Polynomial features

In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# preprocess the data by adding interaction features

poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
X = poly.fit_transform(X)

In [19]:
X_train_pol, X_test_pol, y_train_pol, y_test_pol = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_pol, X_val_pol, y_train_pol, y_val_pol = train_test_split(X_train_pol, y_train_pol, test_size=0.2, random_state=42)

## Comparison of datasets & features

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
# Shallow random forest shared by every comparison below; the fixed seed
# keeps the runs reproducible.
model = RandomForestClassifier(
    n_estimators=256,
    max_depth=4,
    random_state=42,
)

In [47]:
# Log loss of each experiment, keyed by experiment name
losses = dict()

### Baseline model

In [48]:
# Fit a fresh copy of the shared estimator so that `baseline` is its own
# fitted model. `model.fit(...)` returns `model` itself, so without the clone
# this name would end up pointing at whatever the last cell fitted.
baseline = clone(model).fit(X_train, y_train)
preds_baseline = baseline.predict_proba(X_test)
loss_baseline = log_loss(y_test, preds_baseline)
losses['baseline'] = loss_baseline

### Merged dataset

In [49]:
# Fit a fresh copy of the shared estimator so that `merged` is its own fitted
# model rather than another reference to `model`.
# NOTE(review): this loss is computed on X_test_merged, which is a split of the
# merged frame, so it is not measured on the same rows as the baseline loss.
merged = clone(model).fit(X_train_merged, y_train_merged)
preds_merged = merged.predict_proba(X_test_merged)
loss_merged = log_loss(y_test_merged, preds_merged)
losses['merged'] = loss_merged

### Scaled dataset

In [50]:
# Fit a fresh copy of the shared estimator on the standardised splits so that
# `scaled` is its own fitted model rather than another reference to `model`.
scaled = clone(model).fit(X_train_SS, y_train)
preds_scaled = scaled.predict_proba(X_test_SS)
loss_scaled = log_loss(y_test, preds_scaled)
losses['scaled'] = loss_scaled

### PCA dataset

In [51]:
# Fit a fresh copy of the shared estimator on the PCA projections so that
# `PCAed` is its own fitted model rather than another reference to `model`.
PCAed = clone(model).fit(X_train_pca, y_train)
preds_PCAed = PCAed.predict_proba(X_test_pca)
loss_PCAed = log_loss(y_test, preds_PCAed)
losses['PCAed'] = loss_PCAed

In [52]:
# Report the log loss of every experiment, in the order they were recorded
for key in losses:
    value = losses[key]
    print(f"The loss from {key} model is {value}.")

The loss from baseline model is 0.03783092611145073.
The loss from merged model is 0.04270389795813338.
The loss from scaled model is 0.03799545414268243.
The loss from PCAed model is 0.05997817773183873.


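Based on these results, the baseline achieves the lowest log loss, so we decided not to include the external data and not to apply any of the preprocessing steps that were compared (scaling, PCA). Note that the polynomial interaction features prepared above were not evaluated in this comparison.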