In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# DATA VISUALIZATION
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

#scikit learn libraries
# Linear models (the MLP neural-network classifier is imported alongside them)
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
# Tree
from sklearn.tree import DecisionTreeClassifier

# sklearn libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score,matthews_corrcoef,classification_report,roc_curve
# joblib used to be vendored as sklearn.externals.joblib, but that alias was
# removed in scikit-learn 0.23. Prefer the standalone package and fall back to
# the vendored copy only on legacy installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#let's remove the annoying warnings from our cells.
import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training and test CSV files into DataFrames.
Train, Test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ace-class-assignment/AMP_TrainSet.csv",
        "../input/ace-class-assignment/Test.csv",
    )
)

# Preview the first rows of the training data.
Train.head()

In [None]:
# Dimensions of the training and test sets, shown as a pair of shape tuples.

tuple(frame.shape for frame in (Train, Test))

In [None]:
# Number of columns that contain at least one missing (null) value.
Train.isna().any().sum()

In [None]:
# Data type of each column in the training set.
Train.dtypes

In [None]:
# Summary statistics for the columns of the training set.
Train.describe()

## Looking at the data:
* There are two categorical features: CLASS and NT_EFC195.
* There is a big gap between the 75th percentile and the maximum value for the columns FULL_Charge and FULL_AcidicMolPerc, which hints at outliers or strongly skewed distributions.
* The columns FULL_AcidicMolPerc and FULL_OOBM850104 contain both negative and positive values.

In [None]:
# Share of each class (CLASS == 1 vs CLASS == 0) in the training set.
# The ratios use descriptive names instead of `x`/`y`: a later cell stores the
# label vector in `y`, and re-running this cell must not overwrite it.

All = Train.shape[0]  # total number of training rows
Positive = Train[Train['CLASS'] == 1]
Negative = Train[Train['CLASS'] == 0]

positive_ratio = len(Positive) / All
negative_ratio = len(Negative) / All

print('Positives :', positive_ratio * 100, '%')
print('Negatives :', negative_ratio * 100, '%')

## The classes are evenly balanced, so there is no need for resampling techniques such as SMOTE (oversampling) or NearMiss (undersampling)

In [None]:
# Bar chart of the class frequencies in the training set.
labels = ['Negatives', 'Positives']

# Order the counts by class value (0, then 1). value_counts() sorts by
# frequency by default, which would put the 'Negatives' tick label under the
# positive bar whenever class 1 is the more frequent one.
classes = Train['CLASS'].value_counts().sort_index()

classes.plot(kind='bar', rot=0)
plt.title("Visualizing the data class")
plt.xticks(range(len(labels)), labels)
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

## Plot the distribution of features

In [None]:
# Per-class distribution of every feature, one subplot per feature.
# Distribution plots only make sense for numerical data, so the label CLASS
# and the categorical column NT_EFC195 are left out.
features = Train.drop(["CLASS", "NT_EFC195"], axis=1).columns

# Size the grid from the number of features (4 inches of height per row)
# instead of a fixed 12 rows, so the cell keeps working if the column count
# changes.
plt.figure(figsize=(12, 4 * len(features)))
gs = gridspec.GridSpec(len(features), 1)
for i, cn in enumerate(features):
    ax = plt.subplot(gs[i])
    # Draw both classes on the same axes and label them so the legend tells
    # the two overlaid distributions apart.
    sns.distplot(Train.loc[Train.CLASS == 1, cn], bins=50, ax=ax, label='Positives')
    sns.distplot(Train.loc[Train.CLASS == 0, cn], bins=50, ax=ax, label='Negatives')
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
    ax.legend()
plt.show()

In [None]:
# PCA (Principal Component Analysis) reduces the size of the feature space
# while retaining as much of the information as possible.
# Here all the features are projected onto 2 components for visualisation.

X = Train.drop(['CLASS'], axis=1)
y = Train['CLASS']

# PCA follows the directions of largest variance, so columns measured on a
# large scale would dominate the components. Standardise every column first.
# X itself is left untouched because a later cell splits the raw features.
X_standardized = StandardScaler().fit_transform(X.values)

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_standardized)

# Reuse X's index so that the concat below pairs every projected row with its
# own label even if the frame's index is not the default 0..n-1 range.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
    index=X.index,
)
FTrain = pd.concat([principalDf, y], axis=1)
FTrain.head()

In [None]:
# Scatter plot of the two principal components, one colour per class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

targets = [0, 1]
colors = ['r', 'g']
for class_value, class_color in zip(targets, colors):
    # Rows of the projected data that belong to the current class.
    class_rows = FTrain.loc[FTrain['CLASS'] == class_value]
    ax.scatter(
        class_rows['principal component 1'],
        class_rows['principal component 2'],
        c=class_color,
        s=50,
    )

# Legend entries follow the order in which the classes were drawn.
ax.legend(targets)
ax.grid()

In [None]:

# Data splitting

# Split the feature array and label array, keeping 80% for the training set
# and holding out 20% for testing.
# random_state makes the split identical on every run (25 is the same seed
# value the cross-validation cell uses); stratify=y keeps the class
# proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=25, stratify=y
)

# normalize: scale input vectors individually to unit norm (vector length).
X_train = normalize(X_train)
X_test = normalize(X_test)

In [None]:
# Spot-Check Algorithms

seed = 25  # shared seed so that repeated runs give the same scores

# (short name, estimator) pairs with default hyper-parameters. Estimators that
# accept a random_state get the seed; without it the scores change on every
# run.
models = [
    ('LR', LogisticRegression(random_state=seed)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('NN', MLPClassifier(random_state=seed)),
    ('SG', SGDClassifier(random_state=seed)),
]

# Evaluate each model in turn on the same shuffled 10-fold split. The splitter
# does not depend on the model, so it is built once outside the loop.
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

results = []  # per-fold accuracy arrays, in the same order as `names`
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold,
                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # model name: mean accuracy (standard deviation across folds)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))


# Model Evaluation


In [None]:

# Scoring KNN on the held-out test split.
# No earlier cell computes the KNN predictions, so the scoring below failed
# with a NameError on a fresh kernel. Fit the classifier on the training split
# and predict the test split here before scoring.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_predicted_test_labels = knn_model.predict(X_test)

# NOTE: `knn_precison_score` keeps its original (misspelled) name so that any
# code referring to it continues to work.
knn_accuracy_score  = accuracy_score(y_test, knn_predicted_test_labels)
knn_precison_score  = precision_score(y_test, knn_predicted_test_labels)
knn_recall_score    = recall_score(y_test, knn_predicted_test_labels)
knn_f1_score        = f1_score(y_test, knn_predicted_test_labels)
knn_MCC             = matthews_corrcoef(y_test, knn_predicted_test_labels)

# Display the metrics as the cell output instead of computing them silently.
pd.Series(
    {
        'accuracy': knn_accuracy_score,
        'precision': knn_precison_score,
        'recall': knn_recall_score,
        'f1': knn_f1_score,
        'MCC': knn_MCC,
    },
    name='KNN',
)

In [None]:
# Print the per-fold cross-validation accuracies next to the model they belong
# to; `names` and `results` were filled in the same order by the spot-check
# loop, so zipping them pairs each array with its model.
for model_name, fold_scores in zip(names, results):
    print(model_name, fold_scores)