In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import seaborn as sns
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [0]:
# Load the competition data. train_df carries the 'label' target column
# (read further down); test_df is the frame predictions are made for.
train_df = pd.read_csv('/kaggle/input/cte-ml-hack-2019/train_real.csv')
test_df = pd.read_csv('/kaggle/input/cte-ml-hack-2019/test_real.csv')

# Train-Test split
# 
# Here we transform selected features (square root / cube), drop rows outside chosen thresholds, and split train_df into train and validation dataframes
#

In [0]:
# Feature engineering, row filtering and the train/validation split.
#
# The same feature transforms MUST be applied to the training and the test
# features: the model is fitted on transformed values, so predicting on raw
# test values would feed it inputs on a completely different scale.
#
# NOTE: this cell reassigns train_df with the transformed frame, so it is not
# idempotent. Re-run the data-loading cell before re-running this one.

# Columns replaced by their square root.
SQRT_COLUMNS = ["Azimuthal_angle", "H_dist_Hydro", "H_dist_Fire", "H_dist_Road", "Incline"]
# Columns replaced by their cube.
CUBE_COLUMNS = ["Hillshade_9am"]


def transform_features(df):
    """Return a copy of ``df`` with the notebook's feature transforms applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in SQRT_COLUMNS and CUBE_COLUMNS.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    out = df.copy()
    for col in SQRT_COLUMNS:
        out[col] = np.sqrt(out[col])
    for col in CUBE_COLUMNS:
        out[col] = out[col] ** 3
    return out


train_df = transform_features(train_df)

# Row filters are applied to the training data only, on the transformed values:
# drop rows with Azimuthal_angle > 25 or H_dist_Hydro < 5. Written as negated
# comparisons so that rows with NaN in these columns are kept, exactly as a
# drop-by-index on the positive condition would keep them.
keep_rows = ~(train_df["Azimuthal_angle"] > 25) & ~(train_df["H_dist_Hydro"] < 5)
train_df = train_df[keep_rows].reset_index(drop=True)

X = train_df.drop(['Id', 'label', 'Soil'], axis=1)
y = train_df['label']

# Test features get the same transforms as the training features. test_df
# itself is not modified, so test_df['Id'] stays available for the submission.
X_test = transform_features(test_df).drop(['Id', 'Soil'], axis=1)

# Hold out 20% of the training rows for validation; fixed seed for reproducibility.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
# Preview the first few training labels produced by the split.
y_train.head()

In [0]:
# Preview the first few training feature rows produced by the split.
X_train.head()

# EDA
# 
# This is where we explore the data and its features to see whether transforming any of them would make them more usable for our model

In [0]:
# Check the features for any unusually high skewness.
# numeric_only=True restricts the computation to numeric columns; without it,
# recent pandas versions raise if the frame holds any non-numeric column.

print(train_df.skew(numeric_only=True))

In [0]:
# Correlation tells relation between two attributes.
# Correlation requires continuous data. Hence, ignore Wilderness_Area and Soil_Type as they are binary

# Index where continuous data ends
size = 10

# Leading slice of the frame, restricted to numeric/bool columns so that
# corr() sees exactly the columns it can correlate.
data = train_df.iloc[:, :size].select_dtypes(include=["number", "bool"])

data_corr = data.corr()

# Take the labels from the correlation matrix itself so that the positions
# i, j used below always refer to the same feature in both places.
cols = data_corr.columns

# Minimum absolute correlation for a pair to be reported.
threshold = 0.5

corr_list = []

# Scan the upper triangle only (j > i): each pair is visited once and the
# diagonal is never touched, so no extra "exclude 1.0" guard is needed.
# NaN correlations (e.g. constant columns) fail the comparison and are skipped.
n_features = len(cols)
for i in range(n_features):
    for j in range(i + 1, n_features):
        value = data_corr.iloc[i, j]
        if abs(value) >= threshold:
            corr_list.append([value, i, j])

# Strongest relationships first, regardless of sign.
s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

for v, i, j in s_corr_list:
    print("%s and %s = %.2f" % (cols[i], cols[j], v))

In [0]:
# One scatter plot per strongly correlated feature pair, coloured by label.
for _, first_idx, second_idx in s_corr_list:
    x_feature = cols[first_idx]
    y_feature = cols[second_idx]
    sns.pairplot(train_df, hue="label", height=6, x_vars=x_feature, y_vars=y_feature)
    plt.show()

# Basic models
# 
# We compare Random Forest, K Nearest Neighbours and Extreme Gradient Boosting models and use the one which gives us the higher accuracy
# 
#

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Random forest: 180 trees of depth at most 8, fixed seed, all CPU cores.
rf = RandomForestClassifier(
    n_estimators=180,
    max_depth=8,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# K-nearest-neighbours classifier using the 7 closest training points.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

In [0]:
# Validation accuracy of the random forest.
rf_validation_res = rf.predict(X_validation)
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the correct order matters if the metric is ever swapped.
print(accuracy_score(y_validation, rf_validation_res))

In [0]:
# Validation accuracy of the k-nearest-neighbours classifier.
knn_validation_res = knn.predict(X_validation)
# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the correct order matters if the metric is ever swapped.
print(accuracy_score(y_validation, knn_validation_res))

In [0]:
# Predict labels for the test features with the fitted random forest.
# NOTE(review): rf was fitted on transformed training features; verify that
# X_test went through the same feature transforms before relying on these predictions.
test_res = rf.predict(X_test)

In [0]:
# Start the submission frame from the test ids.
submission_df = pd.DataFrame({'Id': test_df['Id']})

In [0]:
# Attach the predictions as a plain list, so they are assigned by position.
submission_df['Predicted'] = test_res.tolist()

In [0]:
# Show the last few rows of the submission frame.
submission_df.tail()

In [0]:
# Write the submission to the working directory, without the index column.
submission_df.to_csv('ml_hack_submission.csv',index=False)

In [0]:
# List the working directory (presumably to confirm the submission file was written).
!ls