In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
#import the necessary libraries you are going to use
import warnings
warnings.filterwarnings('ignore')

# -----> Put your code here below:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Working with AMP_Data_set
## `Training and testing data in Machine Learning`


In [None]:
# Load the two AMP datasets (training set first, then test set) into DataFrames.
Train, Test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/amp-data-set/AMP_TrainSet.csv",
        "../input/amp-data-set/Test.csv",
    )
)



>Getting to know more about the data

In [None]:
# Find out what kind of data we are dealing with: first the Python type of
# each dataset object, then the dtype of every column.
datasets = (Train, Test)
for dataset in datasets:
    print(type(dataset))
for dataset in datasets:
    print(dataset.dtypes)

In [None]:
# Check the dimensions of both datasets.
# Each .shape is a (rows, columns) tuple; the cell displays the pair of tuples.

Train.shape, Test.shape

# This shows how big each dataset is in terms of rows and columns.
# Comparing the column counts also hints at which dataset carries the label column.

> The data is pre-prepared, so I'll just continue to work with it, since the Train dataset is already labeled with a CLASS attribute.

In [None]:
# Summary statistics for both datasets.
# A notebook cell only renders its LAST bare expression, so two consecutive
# .describe() calls would show the Test summary only. display() renders both.
display(Train.describe(), Test.describe())

In [None]:
# Look at the first 5 rows of each dataset.
# display() is used so that the Train preview is rendered too; as bare
# expressions only the last one (Test) would be shown.
display(Train.head(), Test.head())


>Looking at the skewness of the data

In [None]:
# Knowing the skewness of the data helps decide which data preparation steps
# could improve a model.
# Compute the skewness of every Train column and draw it as a bar chart.
Train.skew().plot(kind='bar')

> Data correlation.

In [None]:
# First, review the pairwise Pearson correlation of the Train attributes.
Train.corr(method='pearson')

In [None]:
# Graphical view of the inter-correlation of the attributes:
# the Pearson correlation matrix of Train drawn as a heatmap on a 6x6 figure.
pearson_corr = Train.corr(method='pearson')
plt.figure(figsize=(6, 6))
sns.heatmap(pearson_corr)

In [None]:
# Also check how every attribute correlates with the 'CLASS' attribute
# (the 'CLASS' column of the Pearson correlation matrix).
Train.corr(method='pearson')['CLASS']

In [None]:
# Count how many rows carry each CLASS value.
# value_counts must be *called*; without the parentheses the cell only shows
# the bound-method object instead of the counts.
Train['CLASS'].value_counts()

In [None]:
# Plot the number of rows per CLASS value to see how the class attribute is
# distributed. The plot call is left as the cell's last expression: wrapping it
# in print() only writes the Axes object's text repr to the output.
Train.groupby('CLASS').size().plot(kind='bar')


### From the above bar graph, the class distribution is even, which means I am dealing with balanced data.

# Feature selection 
## Using recursive feature elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination with a logistic regression estimator,
# keeping 4 features.
array = Train.values
X = array[:, 0:11]  # first 11 columns are the candidate input features
Y = array[:, 11]    # 12th column is the target (presumably CLASS - confirm column order)
# feature extraction
model = LogisticRegression()
# n_features_to_select is passed by keyword: positional use after the estimator
# was deprecated and is rejected by recent scikit-learn releases.
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X, Y)
print("Num Features: ", fit.n_features_)
print("Selected Features:", fit.support_)
print("Feature Ranking: ", fit.ranking_)

In [None]:
# List the column names so the RFE support mask / ranking above can be mapped
# back to feature names when deciding which features to drop.
Train.columns

## Using Feature importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Feature importance from an ensemble of extremely randomised trees.
array = Train.values
X = array[:, 0:11]  # first 11 columns are the candidate input features
Y = array[:, 11]    # 12th column is the target (presumably CLASS - confirm column order)
# feature extraction
# The trees are built with random splits; fixing random_state keeps the printed
# importances (and therefore the feature choice made from them) reproducible
# across kernel restarts.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)
print(model.feature_importances_)

## I am going to use 4 features and drop the rest

In [None]:
# New_Train is the Train data restricted to the selected features:
# every column listed here is removed, all remaining columns are kept.
unselected_features = [
    'FULL_Charge',
    'FULL_AcidicMolPerc',
    'FULL_DAYM780201',
    'FULL_GEOR030101',
    'AS_MeanAmphiMoment',
    'AS_DAYM780201',
    'AS_FUKS010112',
]
New_Train = Train.drop(columns=unselected_features)

In [None]:
# Remove the same set of features from the test dataset so that it keeps
# exactly the columns the model will be trained on.
New_Test = Test.drop(
    columns=[
        'FULL_Charge',
        'FULL_AcidicMolPerc',
        'FULL_DAYM780201',
        'FULL_GEOR030101',
        'AS_MeanAmphiMoment',
        'AS_DAYM780201',
        'AS_FUKS010112',
    ]
)

In [None]:
# View the columns that remain in New_Train after dropping the unselected features.
New_Train.columns

# Rescaling data

In [None]:
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
# Separate the selected-feature training data into input columns (first 4)
# and the output column (5th).
array = New_Train.values
X, Y = array[:, :4], array[:, 4]

# Rescale every input column into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Summarise the transformed data: first 4 rows, arrays printed with 3 decimals.
set_printoptions(precision=3)
print(rescaledX[:4])
    

# Standardising data

In [None]:
from sklearn.preprocessing import StandardScaler


# Comparing models to use

In [None]:
#comparing different models to from which ill choose.
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


# load dataset

# Compare several classifiers on the selected features using k-fold
# cross-validated accuracy.
array = New_Train.values

# Split the dataset: first 4 columns are the inputs, the 5th is the target.
X = array[:, 0:4]  # X = Train.drop(columns=['CLASS'])
Y = array[:, 4]    # Y = Train['CLASS']

# Candidate models as (short name, unfitted estimator) pairs.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Per-model arrays of fold accuracies, and the matching model names.
results = []
names = []

# random_state is only meaningful when the folds are shuffled; scikit-learn
# raises a ValueError for KFold(random_state=...) with shuffle=False.
# With an integer seed the splitter yields the same folds on every use, so one
# instance is shared by all models and each model is scored on identical folds.
kfold = KFold(n_splits=40, shuffle=True, random_state=10)

# Evaluate each model in turn.
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Box plot of the cross-validation scores, one box per algorithm,
# with the algorithm names as x tick labels.
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

In [None]:
from sklearn.model_selection import train_test_split

# Now split the data: columns 0-3 of New_Train are the inputs, column 4 is the
# output; 20% of the rows are held out as the test part of the split.
selected_values = New_Train.values
X = selected_values[:, 0:4]
Y = selected_values[:, 4]
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# also train and test the model on Matthews correlation coefficient.
from sklearn.metrics import matthews_corrcoef

# Train a Gaussian Naive Bayes model on the training split, predict the
# held-out split and report the Matthews correlation coefficient as a
# percentage rounded to 2 decimals.
GS = GaussianNB()
pred = GS.fit(X_Train, Y_Train).predict(X_Test)

mcc_percent = np.round(matthews_corrcoef(Y_Test, pred) * 100, 2)
print("The result is: ", mcc_percent, " Mathew's Coef")

In [None]:
# Create a logistic regression model (liblinear solver, C=0.05, fixed seed)
# and train it on the training split. The fitted estimator is the cell output.
# NOTE(review): recent scikit-learn releases appear to deprecate the
# multi_class argument - confirm against the installed version.
model = LogisticRegression(solver='liblinear', C=0.05, multi_class='ovr', random_state=30)
model.fit(X_Train, Y_Train)

In [None]:
from sklearn.model_selection import train_test_split
# Re-create the 80/20 hold-out split of X and Y.
# The target halves are bound to Y_Train / Y_Test (capital T) because those are
# the names the fitting and scoring cells read. With a lower-case spelling this
# cell would refresh X_Train / X_Test but leave the targets coming from
# whichever cell last defined Y_Train / Y_Test.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.metrics import matthews_corrcoef

# Fit the Gaussian Naive Bayes classifier `nv` on the training split and score
# its predictions for the held-out split with the Matthews correlation
# coefficient (shown as a percentage, 2 decimals).
nv = GaussianNB()
nv.fit(X_Train, Y_Train)
pred = nv.predict(X_Test)

mcc = matthews_corrcoef(Y_Test, pred)
print("The result is: ", np.round(mcc * 100, 2), " Mathew's Coef")

In [None]:
#model = DecisionTreeClassifier()

#model.fit(X_train, y_train)

In [None]:
# Preview the first 5 rows of the original (all-features) test dataset.
Test.head()

In [None]:
# Predict on the test dataset with the fitted Gaussian Naive Bayes model and
# save the result as the submission file.
# The frame is built column-wise so the index keeps the dtype of
# New_Test.index. Stacking the index and the predictions into one frame and
# transposing it forces both columns to a common dtype, which can turn the
# index labels into floats in the CSV.
test_predictions = nv.predict(New_Test)
md = pd.DataFrame(
    {'CLASS': test_predictions},
    index=New_Test.index.rename('Index'),
)
md.to_csv('maria.csv')

In [None]:
import os
# List the current working directory - the directory the relative to_csv path
# writes into - to confirm the submission file exists.
# The listing is the cell's last expression so it is actually displayed, and it
# does not depend on a shell escape or on how deep the working directory sits.
sorted(os.listdir())