In [75]:
# Numerical / tabular libraries
import numpy as np 
import pandas as pd
# Plotting libraries
import seaborn as sns 
import matplotlib.pyplot as plt 
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn below
plt.style.use('fivethirtyeight')

import warnings 
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below
warnings.filterwarnings ('ignore')

In [76]:
# Read the training set from a path relative to the notebook's working directory
train_df =  pd.read_csv("Dataset/exoTrain.csv")
# Bare last expression: the notebook renders the frame as the cell output
train_df

In [77]:
# (number of rows, number of columns) of the loaded frame
train_df.shape

Checking Null Values

In [78]:
# Keep only the rows that contain at least one missing value
# (an empty result means no row has a missing value)
rows_with_nan = train_df.isna().any(axis = "columns")
train_df.loc[rows_with_nan]

In [79]:
# Heatmap of the boolean "is this cell missing?" mask of the whole frame
sns.heatmap(train_df.isnull())

In [80]:
# Distinct values present in the LABEL column
train_df.LABEL.unique()

Horizontal dashes in the heatmap above would indicate missing values in the corresponding columns.
Since no such dashes appear, we can conclude that there are no missing values in this data.

In [81]:
# Distinct values of LABEL, accessed by column name
# NOTE(review): same query as the previous cell (train_df.LABEL.unique())
train_df['LABEL'].unique()

In [82]:
# Index labels of every row whose LABEL equals 2, as a plain Python list
is_label_2 = train_df['LABEL'] == 2
train_df.loc[is_label_2].index.tolist()

There are two classes in total: one for stars with exoplanets and one for stars without exoplanets.
Label 2 appears at only a few indices, which indicates that it is the class of stars with exoplanets.
We can also visualise this using a countplot.

In [83]:
# Bar chart of the number of rows per LABEL value, with the count written on each bar
fig, ax = plt.subplots(figsize = (3,5))
sns.countplot(data=train_df, x = 'LABEL', ax = ax)
ax.bar_label(ax.containers[0])

There is a huge imbalance in the data which isn't good for KNN (explained later in this notebook).
We will need to balance it using some resampling technique and we will use RandomOverSampler for this data.
We'll do that after building the model with imbalanced dataset to compare the results!

Replacing the labels


To make things easier for the model, it is better to encode the labels as 0 and 1:

• Stars with exoplanets: 2 → 1

• Stars without exoplanets: 1 → 0

In [84]:
# Re-encode LABEL as a binary target: 2 -> 1 and 1 -> 0.
# Both substitutions come from one mapping applied in a single replace call.
# NOTE: running this cell a second time would turn the new 1s into 0s.
label_mapping = {2: 1, 1: 0}
train_df = train_df.replace({"LABEL" : label_mapping})

In [85]:
# Sanity check: distinct LABEL values after the re-encoding
train_df.LABEL.unique()

In [86]:
# Copy of the data without the LABEL column, used for the plots below
plot_df = train_df.drop(columns = 'LABEL')
plot_df

In [87]:
# The integers 1..3197, used as the x-axis of the light-curve plots below
# NOTE(review): this prints all 3197 values as cell output
list(range(1,3198))

In [88]:
# Flux readings of one example star (row position 3), drawn as a line
time = range(1,3198)
flux_val = plot_df.iloc[3].to_numpy()
fig, ax = plt.subplots(figsize = (15,5))
ax.plot(time, flux_val, linewidth = 1.17)


In [89]:
# Flux readings of another example star (row position 1100), drawn as a line
time = range(1,3198)
flux_val = plot_df.iloc[1100].to_numpy()
fig, ax = plt.subplots(figsize = (15,5))
ax.plot(time, flux_val, linewidth = 1.17)


In [90]:
# Box plots of FLUX.1, FLUX.2 and FLUX.3, split by class.
# The grid has exactly one column per plot, so no slot is left empty.
n_flux_plots = 3
fig, axes = plt.subplots(1, n_flux_plots, figsize = (20,10))
for i, ax in enumerate(axes, start = 1):
    sns.boxplot(data = train_df, x = 'LABEL', y = 'FLUX.' + str(i), ax = ax)

We can see that flux values greater than $0.25 \times 10^6$ are extreme outliers.

We can either drop these rows or replace their values with the upper bridge value. For this use case, we will simply drop them.

However you can try to compute on your own the upper bridge value using the formula given below:-

$UB = Q3 + 3 \times IQR$
where UB is the upper bridge, Q3 is the 75th percentile, and IQR is the interquartile range.

In [91]:
# Remove, in place, every row whose FLUX.2 reading is above 0.25e6
is_outlier = train_df['FLUX.2'] > 0.25e6
outlier_rows = train_df.loc[is_outlier].index
train_df.drop(index = outlier_rows, inplace = True)

In [92]:
# Box plot of one randomly chosen flux column, split by class.
# The column is sampled from the names actually present in the frame, so the
# lookup is always valid. np.random.randint(1000) can return 0, and the flux
# columns used in this notebook appear to be numbered from 1 (FLUX.1, FLUX.2, ...).
flux_columns = train_df.columns.drop('LABEL')
sns.boxplot(data = train_df, x = 'LABEL', y = np.random.choice(flux_columns))

# Working with KNN Classification

In [93]:
# Extracting independent (X), dependent (Y)

In [94]:
# Target: the LABEL column. Features: every other column.
y = train_df['LABEL']
x = train_df.drop(columns = 'LABEL')

In [95]:
# Splitting data in Train,Test
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. stratify = y keeps the class ratio the
# same in both parts, so the rare positive rows cannot all land on one side.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 84, stratify = y)

In [96]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean / standard deviation from the training split only,
# then apply that same transformation to the test split. Re-fitting on the test
# data would scale the two splits differently and leak test-set statistics.
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.transform(x_test)


In [97]:
# Smallest and largest value anywhere in the scaled training matrix
x_train_sc.min(), x_train_sc.max()

In [98]:
from sklearn.neighbors import KNeighborsClassifier as KNC

# Baseline model: 5 nearest neighbours, Minkowski metric with p = 2.
# fit() returns the estimator itself, so construction and training are chained.
knn_classifier = KNC(n_neighbors = 5, metric = 'minkowski', p = 2).fit(x_train_sc, y_train)

# Class predictions for the scaled test split
y_pred = knn_classifier.predict(x_test_sc)


In [99]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc

# Text summary: overall accuracy followed by per-class precision / recall / F1
print("Validation accuracy of KNN:- ", accuracy_score (y_test, y_pred))
print ()
print("Classification report:- \n", classification_report(y_test, y_pred))

# Confusion matrix (top-left panel of a 2x2 grid)
plt.figure(figsize=(15, 11))
plt.subplots_adjust(wspace = 0.3)
plt.suptitle("KNN Performance before handling the imbalance in the data", color = 'r', weight = 'bold')
plt.subplot (221)
sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, cmap="Set2", fmt = "d", linewidths=3, cbar = False,
            xticklabels=['nexo','exo'], yticklabels=['nexo', 'exo'], square = True)
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns; the heatmap draws rows along the y-axis and
# columns along the x-axis, so x is "predicted" and y is "true".
plt.xlabel("Predicted Labels", fontsize = 15, weight = 'bold', color = 'tab:pink')
plt.ylabel("True Labels", fontsize = 15, weight = 'bold', color = 'tab:pink')
plt.title("CONFUSION MATRIX", fontsize=20, color = 'm')

# ROC curve and area under the curve (top-right panel)
# Column 1 of predict_proba is the predicted probability of class 1
predicting_probabilites = knn_classifier.predict_proba(x_test_sc)[:,1]
fpr, tpr, thresholds = roc_curve (y_test, predicting_probabilites)
plt.subplot(222)
# The label is a single formatted string; it is only drawn once legend() is called
plt.plot(fpr, tpr, label = f"AUC : {auc(fpr, tpr):.3f}", color = "g")
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([1,0], [1,0], "k--")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("ROC - CURVE & AREA UNDER CURVE", fontsize=20, color = 'm')
plt.legend(loc = "lower right")
# show must be called; a bare `plt.show` only references the function
plt.show()

Handling Imbalance in the Data

In [100]:
# Display the feature frame and the target series as a tuple
x,y

In [101]:
#pip install imblearn
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows to balance the class counts.
# A fixed random_state makes the resampled set identical on every run.
ros = RandomOverSampler(random_state = 84)
x_ros, y_ros  = ros.fit_resample(x,y)

In [102]:
# Bar chart of the number of rows per class after oversampling
class_counts = y_ros.value_counts()
class_counts.plot.bar()

In [103]:
from collections import Counter

# Per-class row counts before and after random oversampling
counts_before, counts_after = Counter(y), Counter(y_ros)
print(f"Before ROS:- {counts_before} \n After ROS:- {counts_after}")

In [104]:
# Splitting data in Train,Test - Balanced
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Split the original rows first, then oversample the training part only.
# Oversampling before the split puts copies of the same minority rows on both
# sides, so the test score would partly measure memorised duplicates.
# stratify = y keeps the class ratio the same in the train and test parts.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3, random_state = 84, stratify = y)
x_train,y_train = RandomOverSampler(random_state = 84).fit_resample(x_train,y_train)

In [105]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean / standard deviation from the training split only,
# then apply that same transformation to the test split. Re-fitting on the test
# data would scale the two splits differently and leak test-set statistics.
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.transform(x_test)


In [106]:
# Smallest and largest value anywhere in the scaled training matrix
x_train_sc.min(), x_train_sc.max()

In [107]:
from sklearn.neighbors import KNeighborsClassifier as KNC

# K = 3
# NOTE(review): the classifier fitted on the imbalanced data earlier in this
# notebook uses n_neighbors = 5 — confirm the different K here is intended
knn_classifier = KNC(n_neighbors = 3, metric = 'minkowski', p = 2)

# Fitting Model
knn_classifier.fit(x_train_sc, y_train)


# Predict
y_pred = knn_classifier.predict(x_test_sc)

In [108]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc

# Text summary: overall accuracy followed by per-class precision / recall / F1
print("Validation accuracy of KNN:- ", accuracy_score (y_test, y_pred))
print ()
print("Classification report:- \n", classification_report(y_test, y_pred))

# Confusion matrix (top-left panel of a 2x2 grid)
plt.figure(figsize=(15, 11))
plt.subplots_adjust(wspace = 0.3)
plt.suptitle("KNN Performance after balancing the data", color = 'r', weight = 'bold')
plt.subplot (221)
sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, cmap="Set2", fmt = "d", linewidths=3, cbar = False,
            xticklabels=['nexo','exo'], yticklabels=['nexo', 'exo'], square = True)
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns; the heatmap draws rows along the y-axis and
# columns along the x-axis, so x is "predicted" and y is "true".
plt.xlabel("Predicted Labels", fontsize = 15, weight = 'bold', color = 'tab:pink')
plt.ylabel("True Labels", fontsize = 15, weight = 'bold', color = 'tab:pink')
plt.title("CONFUSION MATRIX", fontsize=20, color = 'm')

# ROC curve and area under the curve (top-right panel)
# Column 1 of predict_proba is the predicted probability of class 1
predicting_probabilites = knn_classifier.predict_proba(x_test_sc)[:,1]
fpr, tpr, thresholds = roc_curve (y_test, predicting_probabilites)
plt.subplot(222)
# The label is a single formatted string; it is only drawn once legend() is called
plt.plot(fpr, tpr, label = f"AUC : {auc(fpr, tpr):.3f}", color = "g")
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([1,0], [1,0], "k--")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("ROC - CURVE & AREA UNDER CURVE", fontsize=20, color = 'm')
plt.legend(loc = "lower right")
# show must be called; a bare `plt.show` only references the function
plt.show()

In [110]:
# Choosing K value with minimum error
# For every K from 1 to 10: fit a KNN on the scaled training split and record
# the fraction of test rows it misclassifies.
k_values = range(1, 11)
err_rate = []

for K in k_values:
    knn_clsfr = KNC(n_neighbors = K)
    knn_clsfr.fit(x_train_sc, y_train)
    pred = knn_clsfr.predict(x_test_sc)
    err_rate.append(np.mean(pred != y_test))

# Position and value of the lowest error (the first one when there are ties)
val = min(err_rate)
arg = err_rate.index(val)

plt.figure(figsize = (5,5))
plt.plot(k_values, err_rate,'co--', markersize = 8)
# err_rate[0] belongs to K = 1, hence the +1 when placing the marker
plt.plot(arg+1, val, marker = 'o', markersize = 8, markerfacecolor ='r', markeredgecolor = 'g')
plt.title("Error rate wrt K values with minimum K marked" )
plt.ylabel("Error Rate")
plt.xlabel("K values")
plt.show()