In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Any results you write to the current directory are saved as output.

#### Importing the datasets `AMP_TrainSet.csv` and `Test.csv` as the training and testing data.

In [None]:
# Read the two competition files into DataFrames: the training set and the test set.
Train = pd.read_csv("../input/ace-class-assignment/AMP_TrainSet.csv")
Test = pd.read_csv("../input/ace-class-assignment/Test.csv")
                     

#### Checking the dimensions of the datasets

In [None]:
# Display the shape of the test set followed by the shape of the training set.
Test.shape, Train.shape

#### Viewing a section of the data:

In [None]:
# Preview the first 10 rows of the test set.
Test.head(10)

In [None]:
# Preview the first 10 rows of the training set.
Train.head(10)

#### Checking the data types of each of the attributes of the `Train` and `Test` data sets.

In [None]:
# List the data type of every column in the training set.
Train.dtypes

In [None]:
# List the data type of every column in the test set.
Test.dtypes

### Descriptive Statistics


In [None]:
# Descriptive statistics of the test set.
Test.describe()

In [None]:
# Descriptive statistics of the training set.
Train.describe()

### Checking for **'na'** values in both the Test and Train datasets

In [None]:
# Count the missing values in each column of the test set. The bare boolean
# mask returned by isna() has one cell per value and cannot be checked by eye,
# so it is reduced to a per-column total.
Test.isna().sum()

In [None]:
# Count the missing values in each column of the training set. The bare boolean
# mask returned by isna() has one cell per value and cannot be checked by eye,
# so it is reduced to a per-column total.
Train.isna().sum()

### Classification Distribution
#### Classification distribution is done to know how balanced class values are.

In [None]:
# Count the training rows per CLASS value and draw the counts as a bar chart.
class_counts = Train.groupby('CLASS').size()
class_counts.plot.bar()


### **Correlation between Attributes**
This is used to test for the correlation between two variables and whether the change of one influences the change of the other. The Pearson's Correlation coefficient is commonly used to achieve this. It assumes a normal distribution of attributes involved.
A correlation of 1 shows very strong positive correlation while a correlation of -1 shows a very strong negative correlation.

In [None]:
# Pairwise Pearson correlation between the columns of the training set.
Train.corr(method='pearson')

#### A heat map is used to well depict the correlation between attributes.



In [None]:
# Draw the Pearson correlation matrix of the training set as a heat map.
import matplotlib.pyplot as plt
import seaborn as sns

pearson_corr = Train.corr(method='pearson')
plt.figure(figsize=(6, 6))
sns.heatmap(pearson_corr)

#### To check the correlation of the Attributes in regards to the class.

In [None]:
# Pearson correlation of every column with the CLASS column.
Train.corr(method='pearson')['CLASS']

### **Determining the Skew of Univariate Distributions**
#### Normal skew assumes a Gaussian distribution, which is a normal bell curve that is shifted or squashed in one direction or another.
#### The skew() function can be used to calculate the skew of each attribute from the Pandas library.
#### If the skewness is above 1 or below -1, the data is highly skewed. If it lies between 0.5 and 1 (or between -1 and -0.5), the data is moderately skewed. If it lies between -0.5 and 0.5, the data is approximately symmetric; a skewness of exactly zero indicates a perfectly symmetric distribution.
#### Positively skewed data can be transformed using the `log`, `cube root` and `square root` functions.

In [None]:
# Compute the skewness of each training-set column and show it as a bar chart.
column_skew = Train.skew()
column_skew.plot.bar()

### Visualization 
#### The `seaborn` library is used to generate plots to help us better understand the data.

In [None]:
# Plot one histogram per column to show the distribution of each feature.
# DataFrame.hist opens its own figure, so the size is passed to it directly;
# a separate plt.figure(...) call beforehand only leaves an extra empty figure.
Train.hist(figsize=(17, 17))
plt.tight_layout()  # keep subplot titles and tick labels from overlapping
plt.show()

### Box and Whisker Plots
#### Boxplots summarize the distribution of each attribute drawing a line for the median (mid value) and a box that spans from the first quartile (25th percentile) to the third quartile (75th percentile).

In [None]:
# One box-and-whisker plot per column, laid out on a 5x5 grid with
# independent x and y axes for every panel.
Train.plot.box(subplots=True, layout=(5, 5), sharex=False, sharey=False)
# Stretch the subplot area so the individual panels are large enough to read.
plt.subplots_adjust(top=3, right=2, bottom=1)
plt.show()

### Density Plots

#### The Density plots are another alternative way of getting a quick idea of the distribution of each attribute. They are more or less a histogram with a smooth curve drawn through the top of each bin.


In [None]:
# One density curve per column on a 4x4 grid, each with its own x axis.
Train.plot(kind='density', subplots=True, layout=(4,4), sharex=False)
# plt.show must be called; the bare name only evaluates to the function
# object and never renders the figure.
plt.show()

### Multivariate Plots
#### Multivariate plots show the interactions between multiple variables. Such plots include:
* Scatter plot matrix
* correlation matrix plot

### Scatter Plot
#### A scatter plot shows the relationship between two variables as dots in two dimensions, one axis for each attribute.

In [None]:

# Draw seaborn's pairplot for the training data.
sns.pairplot(Train)

In [None]:
# Correlation matrix of the training set, drawn with matshow on a fixed
# colour scale from -1 to 1.
correlations = Train.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# Create exactly one tick per row/column of the matrix and label each tick
# with the matching column of `correlations`. A hard-coded tick count goes
# out of step with the labels as soon as the number of columns differs.
ticks = np.arange(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(correlations.columns, rotation=90)  # vertical so names do not overlap
ax.set_yticklabels(correlations.columns)
plt.show()

In [None]:
# Scatter-plot grid of every pair of training columns, coloured by CLASS.
# Only one grid is built: a second PairGrid that is created but never
# assigned or mapped just renders as an additional empty figure.
g = sns.PairGrid(Train, hue="CLASS")
g.map(plt.scatter)
g.add_legend()  # show which colour corresponds to which CLASS value


### Data Preparation for Machine Learning
#### Since most machine learning algorithms make a lot of assumptions about the data we provide them with, it is very essential to prepare the data in such a way that best exposes the structure of the problem you need to solve using these algorithms. Different types of data however require different transforms during pre-processing.
#### The normally followed procedure involves:
* Splitting the dataset into input.output (I/O)variables for machine learning.
* Applying a pre-processing transform to the input variables.
* Data summarization to reveal the change.

#### The fit and multiple transform method from sci-kit learn is normally preferred during data pre-processing.
#### The `fit()` function is used to prepare the parameters of the data. The `transform()` function on the same dataset is used to prepare it for modeling and later on the test data or the validation dataset for any new data generated in the future. This is useful in representing the data using plots.

### Rescaling/Normalizing Data
#### When the attributes of a dataset have widely varying scales, many machine learning algorithms benefit from rescaling the data so that all attributes share the same scale. Attributes are normally rescaled to the range between 0 and 1.


In [None]:
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

array = Train.values
# Separate the array into input and output components. The output column is
# located by its name ('CLASS') instead of a hard-coded position, so the label
# can never be mistaken for a feature column.
# NOTE(review): assumes every column other than CLASS is an input feature — confirm.
label_idx = Train.columns.get_loc('CLASS')
X = np.delete(array, label_idx, axis=1)
Y = array[:, label_idx]

# Rescale each feature to the range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Summarize the transformed data: first five rows, 3-digit precision.
set_printoptions(precision=3)
print(rescaledX[0:5,:])

### Data Standardization
#### Standardization transforms attributes with a Gaussian (normal) distribution so that each has a mean of 0 and a standard deviation of 1. The `StandardScaler` class from scikit-learn is used to perform this.

In [None]:
from sklearn.preprocessing import StandardScaler

array2 = Train.values
# Separate the array into input and output components. The output column is
# located by its name ('CLASS') instead of a hard-coded position, so the label
# can never be mistaken for a feature column.
# NOTE(review): assumes every column other than CLASS is an input feature — confirm.
label_idx = Train.columns.get_loc('CLASS')
X = np.delete(array2, label_idx, axis=1)
Y = array2[:, label_idx]

# Fit the scaler on the features, then apply it to the same features.
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

# Summarize the transformed data: first five rows, 3-digit precision.
# np.set_printoptions is used so this cell does not rely on a name imported
# by a different cell.
np.set_printoptions(precision=3)
print(rescaledX[0:5,:])

### Data Normalization
#### Not to be confused with rescaling, where data is made to fit between the values 0 and 1. This type of normalization rescales each row/observation to have a length of 1. It is often useful for sparse datasets with attributes of varying scales, when using algorithms such as k-Nearest Neighbors and neural networks. The `Normalizer` class from scikit-learn is employed for this task.

In [None]:
from sklearn.preprocessing import Normalizer

array_3 = Train.values
# Separate the array into input and output components. The output column is
# located by its name ('CLASS') instead of a hard-coded position, so the label
# can never be mistaken for a feature column.
# NOTE(review): assumes every column other than CLASS is an input feature — confirm.
label_idx = Train.columns.get_loc('CLASS')
X = np.delete(array_3, label_idx, axis=1)
Y = array_3[:, label_idx]

scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)

# Summarize the transformed data: first five rows, 3-digit precision.
# np.set_printoptions is used so this cell does not rely on a name imported
# by a different cell.
np.set_printoptions(precision=3)
print(normalizedX[0:5,:])
# Print the container type so it is clear what kind of object is being handled.
print(type(normalizedX))

### Binarizing Data
#### This is a useful technique especially when data has different probabilities. The data is normally transformed by creating a binary threshold where all values above this threshold are marked as `1` while those below are marked as `0`. The package used for this is Binarizer from scikit-learn. 

In [None]:
# Data binarization
from sklearn.preprocessing import Binarizer

# Inputs are the first 11 columns of the training array; column 11 is the output.
array_4 = Train.values
X, Y = array_4[:, :11], array_4[:, 11]

# Threshold every input value at 0.0.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)

# Show the first five binarized rows with 3-digit precision.
set_printoptions(precision=3)
print(binaryX[:5])

### Feature Selection
#### Feature selection is a technique employed to choose the features that highly influence and contribute the overall predictive performance of the model while dropping those features that negatively affect model performance. 
* #### Feature selection reduces overfitting and ensures that redundant and unuseful data (noise) is eliminated from the model.
* #### This goes a long way in improving the accuracy/ predictive capacity of the model.
* #### Appropriate feature selection also ensures that the algorithms train faster. 

#### Statistical tests are used to select the features that have the strongest relationship to the class/output variable. The `SelectKBest` class from scikit-learn can be used with several statistical tests to select a specific number of features.
#### The most commonly employed feature selection methods are `Wrapper feature selection methods` and `Filter feature selection methods`. Wrapper methods evaluate multiple models with different subsets of input features and select the features that result in the best performing model according to a performance metric. These techniques are normally computationally expensive. One such method is Recursive Feature Elimination (RFE). Filter methods evaluate the relevance of the predictors outside of the predictive models, using statistical techniques such as the chi-squared test or the ANOVA F-test (`f_classif`, used below) to evaluate the relationship between each input and the target variable.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Inputs are the first 11 columns of the training array; column 11 is the output.
array_5 = Train.values
X, Y = array_5[:, :11], array_5[:, 11]

# Score the inputs with f_classif and keep the 6 best-scoring ones.
sel_f = SelectKBest(score_func=f_classif, k=6)
sel_f.fit(X, Y)
array_4 = sel_f.transform(X)

# Boolean mask telling which of the input columns were kept.
print(sel_f.get_support())


In [None]:
# Keep only the columns chosen by the F-test selector of the previous step,
# store them in `xtrain`, and display the result.
selected_mask = sel_f.get_support()
xtrain = X[:, selected_mask]
xtrain


### Recursive Feature Elimination
#### This is a wrapper method that works by recursively removing attributes and building a model on the remaining attributes.



In [None]:
# Recursive feature elimination with a logistic-regression estimator.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Inputs are the first 11 columns of the training array; column 11 is the output.
array_6 = Train.values
X = array_6[:,0:11]
Y = array_6[:,11]

# Feature extraction: keep 7 features. The count is passed by keyword because
# recent scikit-learn releases no longer accept it as a positional argument.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=7)

fit = rfe.fit(X, Y)
print("Num Features: ",  fit.n_features_)
print("Selected Features:",  fit.support_)
print("Feature Ranking: ",  fit.ranking_)

# Refit `model` on only the selected training columns, then predict on the
# same columns of the test set. Test is sliced to its first 11 columns before
# the mask is applied so the mask length always matches.
model.fit(X[:, fit.support_], Y)
otrfe = model.predict(Test.values[:,:11][:,fit.support_])

# Number of test rows predicted as class 1 and as class 0.
print(np.count_nonzero(otrfe==1))
print(np.count_nonzero(otrfe==0))


### Evaluating the Machine Learning Algorithm
#### We should always keep the data used for evaluation separate from the data used for training in order to guard against `overfitting`. An overfit algorithm appears to work perfectly because it has memorized the entire training dataset rather than learned to predict.
#### The evaluation is an estimate that we can use to determine how well we think the algorithm would perform in practice.
#### Once this is done, we can then re-train the final algorithm on the entire training set and get it ready for operational use. Some of the splitting methods used for machine learning algorithms are:
* k-fold cross validation
* Leave one out cross validation
* Repeated Random Test-Train Splits


#### In this case K-fold cross validation was used to split the data into k parts (7 folds). Each fold is given a chance to be the test set. The algorithm is trained and tested multiple times, which increases the reliability of the accuracy estimate. The choice of k must be such that each test partition is large enough to be a reasonable sample of the problem, while still allowing enough repetitions of the train-test evaluation to provide a fair estimate of the algorithm's performance on unseen data. 

In [None]:
# K-fold cross validation of a logistic-regression model.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# NOTE(review): the accompanying text describes 7 folds, but 97 is what this
# cell has always used; the value is left unchanged — confirm which is intended.
num_folds = 97 #number of folds to use
seed = 7 #reproducibility

# random_state only has an effect when shuffling is enabled, and current
# scikit-learn raises an error if it is given without shuffle=True.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, xtrain, Y, cv=kfold)
# Mean accuracy and its standard deviation across folds, as percentages.
print(f"Accuracy: {results.mean()*100.0:.3f}% ({results.std()*100.0:.3f}%)")

# NOTE(review): the cross-validation above scores the `xtrain` columns, while
# the model fitted below uses the columns selected by `fit.support_` — confirm
# that evaluating one feature set and predicting with another is intended.
model.fit(Train.values[:,:11][:,fit.support_],Y)
otk = model.predict(Test.values[:,:11][:,fit.support_])

# Number of test rows predicted as class 1 and as class 0.
print(np.count_nonzero(otk==1))
print(np.count_nonzero(otk==0))


#### Returning the output in a dataframe

In [None]:
# Wrap the k-fold predictions in a one-column frame named CLASS whose index
# is labelled 'index', then turn the 0.0 / 1.0 predictions into False / True.
bool_labels = {0.0: False, 1.0: True}
repo_0 = pd.DataFrame({'CLASS': otk}).rename_axis('index')
repo_0['CLASS'] = repo_0['CLASS'].map(bool_labels)
repo_0

In [None]:
# Write the k-fold prediction frame to disk in CSV format.
# NOTE(review): the output name "kcsv" has no ".csv" extension — confirm that
# this is the intended file name.
repo_0.to_csv("kcsv")

### Applying more classifiers

#### SVM


In [None]:
# Support-vector classifier evaluated with 20-fold cross validation.
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score

# Split the training data into 20 shuffled folds with a random state of 7.
# random_state only has an effect when shuffling is enabled, and current
# scikit-learn raises an error if it is given without shuffle=True.
kfold = KFold(n_splits=20, shuffle=True, random_state=7)
model = SVC()
results = cross_val_score(model, Train.values[:,0:11][:,fit.support_], Y, cv=kfold)
print(results.mean())

# Fit on the selected training columns and predict on the same test columns.
model.fit(Train.values[:,:11][:,fit.support_], Y)
otsvm = model.predict(Test.values[:,:11][:,fit.support_])

# Number of test rows predicted as class 1 and as class 0.
print(np.count_nonzero(otsvm==1))
print(np.count_nonzero(otsvm==0))



#### Returning the SVM output in a dataframe

In [None]:
# Wrap the SVM predictions in a one-column frame.
repo_1 = pd.DataFrame(otsvm)
repo_1.columns=['CLASS'] # name the single column CLASS
repo_1.index.name= 'index' # label the index
repo_1['CLASS']= repo_1['CLASS'].map({0.0:False, 1.0:True}) # 0.0 -> False, 1.0 -> True

# Write the SVM output to its own file. It previously went to "nbmcsv1",
# the same path the Naive Bayes cell writes to, so the SVM predictions were
# overwritten as soon as that cell ran.
repo_1.to_csv("svmcsv1")

# Display the frame; only the last expression of a cell is rendered.
repo_1

### Naive Bayes
#### This is a probabilistic classifier, that assumes Gaussian distribution and calculates the probability of each class and the conditional probability of the class given an input value/feature. Assuming these probabilities are all independent, they are estimated for new data and multiplied together.
#### The model gave better results when using the entire dataset (instead of a few features from the Training dataset). The computational time was not greatly affected.

In [None]:
# Gaussian Naive Bayes evaluated with 40-fold cross validation.
from sklearn.naive_bayes import GaussianNB #import naive bayes library from sklearn
from sklearn.model_selection import KFold, cross_val_score

# Split the data into 40 shuffled folds with a random state of 7.
# random_state only has an effect when shuffling is enabled, and current
# scikit-learn raises an error if it is given without shuffle=True.
kfold = KFold(n_splits=40, shuffle=True, random_state=7)
model = GaussianNB()
results = cross_val_score(model, Train.values[:,0:11], Y, cv=kfold)
print(results.mean())

# Fit on the first 11 training columns and predict on the first 11 test columns.
model.fit(Train.values[:,:11],Y)
otp = model.predict(Test.values[:,:11])

# Number of test rows predicted as class 1 and as class 0.
print(np.count_nonzero(otp==1))
print(np.count_nonzero(otp==0))

# Wrap the Naive Bayes predictions in a one-column frame.
repo_2 = pd.DataFrame(otp)
repo_2.columns=['CLASS'] # name the single column CLASS
repo_2.index.name= 'index' # label the index
repo_2['CLASS']= repo_2['CLASS'].map({0.0:False, 1.0:True}) # 0.0 -> False, 1.0 -> True

# Store the frame in a csv file.
repo_2.to_csv("nbmcsv1")

# Display the frame; only the last expression of a cell is rendered.
repo_2