### Reading the data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('train_values.csv')
target = pd.read_csv('train_labels.csv')
df = pd.merge(data, target, how='inner')

In [3]:
df.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_disease_present
0,0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,0
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0,0
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,1
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,1
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,0


### Data Preperation

In [4]:
df.drop(labels='patient_id', axis=1, inplace=True)
df = pd.get_dummies(df, drop_first=True)

In [5]:
df_Y = df['heart_disease_present']
df_X = df.drop('heart_disease_present', axis=1)

### Selecting most Important Features

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(df_X, df_Y)
rfc.feature_importances_

array([0.05047833, 0.07      , 0.10419792, 0.08462275, 0.00719707,
       0.02549577, 0.08919488, 0.09732019, 0.03036702, 0.09533881,
       0.09726496, 0.06089681, 0.10473953, 0.08288596])

In [8]:
f_imp = pd.DataFrame(rfc.feature_importances_)
f_imp.columns = ['Importance']

In [9]:
f_imp['column_Name'] = 'A'
for i in range(0, (f_imp.shape[0]-1)):
    f_imp['column_Name'][i] = df_X.columns[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


#### Picking Top 8 Features

In [31]:
cols = []
for i in range(11):
    cols.append(f_imp.sort_values(by=['Importance'], ascending=False)['column_Name'][i])

In [32]:
import xgboost

In [33]:
xgb = xgboost.XGBClassifier()
xgb.fit(df_X[cols], df_Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Load Test Data and Make Predictions

In [34]:
test_data = pd.read_csv('test_values.csv')

In [35]:
test_X = test_data[cols]

Making Predictions

In [36]:
y_pred = xgb.predict_proba(test_X)

In [37]:
submission = pd.DataFrame()
submission['patient_id'] = test_data['patient_id']
submission['heart_disease_present'] = y_pred[:,1]
submission.to_csv('FSelwXGboost.csv', index=False)