In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

## Importing Libraries

In [None]:
###Importing libraries
import pandas
import scipy
import numpy
import matplotlib
import sklearn



## Loading some libraries
### These are some of the libraries I think I will need

In [None]:
from pandas import read_csv #reads a CSV file into a DataFrame
from pandas.plotting import scatter_matrix #draws a grid of pairwise scatter plots between columns
from matplotlib import pyplot #plotting interface used for the figures in this notebook
from sklearn.model_selection import train_test_split  #splits arrays into random train and test subsets
from sklearn.model_selection import cross_val_score  #scores an estimator with cross-validation, to estimate its skill on unseen data
from sklearn.model_selection import StratifiedKFold  #K-fold splitter that keeps the class proportions roughly equal in every fold
from sklearn.metrics import classification_report #text report of precision, recall and F1-score for each class
from sklearn.metrics import confusion_matrix #table of true versus predicted class counts
from sklearn.metrics import accuracy_score  #ratio of the number of correct predictions to the total number of samples
from sklearn.linear_model import LogisticRegression #linear model for classification
from sklearn.tree import DecisionTreeClassifier #predicts the target by learning simple decision rules from the features
from sklearn.neighbors import KNeighborsClassifier #classifies a sample by a vote among its nearest neighbours
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  #classifier with a linear decision boundary between classes
from sklearn.naive_bayes import GaussianNB #Gaussian Naive Bayes: estimates a per-class mean and variance for each feature
from sklearn.svm import SVC #support vector classifier: separates the classes with a maximum-margin hyperplane
from sklearn.ensemble import RandomForestClassifier  #ensemble of decision trees fitted on bootstrap samples
from sklearn.linear_model import SGDClassifier  #linear classifier trained with stochastic gradient descent
from sklearn.neighbors import NearestCentroid  #assigns each sample to the class with the nearest centroid
from sklearn.neural_network import MLPClassifier  #multi-layer perceptron (neural network) classifier

In [None]:
# Suppress all warnings so they do not clutter the cell outputs.
# Note: this also hides deprecation and convergence warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

## Loading My Dataset

##### I am going to first use my training dataset, then the test dataset last.

In [None]:
# Read the training set from the Kaggle input directory and display it.
train = pandas.read_csv('/kaggle/input/ace-class-assignment/AMP_TrainSet.csv')
train


In [None]:
# Read the test set from the Kaggle input directory and display it.
test= pandas.read_csv('/kaggle/input/ace-class-assignment/Test.csv')
test

## Inspecting my train dataset

In [None]:
# Tuple (number of rows, number of columns) of the training DataFrame
train.shape

In [None]:
# Number of null values in each column
train.isnull().sum()

In [None]:
# Number of non-null values in each column
train.count()

##### It seems I have no missing values in my data.

In [None]:
# Descriptive summary statistics (count, mean, std, min, quartiles, max) of the columns
train.describe()

In [None]:
# Prints the index range, the columns with their non-null counts and dtypes, and the memory usage
train.info()

##### This shows that my dataset has 3038 rows (instances) and 12 columns (attributes)

##### I will also take a look at how many instances i have for each class

In [None]:
# Number of rows in each CLASS group
train.groupby('CLASS').size()

In [None]:
# Same per-class row counts as above, drawn as a bar chart
class_sizes = train.groupby('CLASS').size()
class_sizes.plot.bar()
pyplot.show()

#### I have two groups of classes, each with 1519 instances

# DATA VISUALISATION

#### I will start with univariate plots to see each individual variable

In [None]:
# Univariate view: one histogram per numeric column, drawn on a 16x16-inch figure
train.hist(figsize=(16,16))
pyplot.show()

#### These histograms show that FullAcidicMolPerc is exponentially distributed, while the rest have an approximately Gaussian distribution, except for NT_EFC195 and CLASS. Histograms also help us identify outliers. From this output, I can say there are no outliers.

In [None]:
# Pearson correlation of every column with the CLASS column
train.corr(method='pearson')['CLASS']

## Multivariate 

In [None]:
# Heatmap of the pairwise Pearson correlations between all columns.
# seaborn is imported here because the notebook otherwise imports it for the
# first time in a later cell, which makes this cell fail with a NameError
# when the notebook is run top to bottom on a fresh kernel.
import seaborn as sns

pyplot.figure(figsize=(10,10))
sns.heatmap(train.corr(method='pearson'))
pyplot.show()

#### I made this plot to see which features are highly correlated with each other, because using highly correlated features together can be a problem in some models.

In [None]:
# Pairwise scatter plots: every column of the training set plotted against every other column

import seaborn as sns
sns.pairplot(train)

#### A scatter plot shows the relationship between two variables as dots in two dimensions. From this output, I can see the relationship between each pair of variables. If two features are strongly correlated with each other, one of them can be removed during feature selection.

# EVALUATING ALGORITHMS
#### I will now evaluate some algorithms and estimate their accuracy on unseen data.

### Building models

#### I will first split my train data into test and train, so that I test the effectiveness of my model using the test data.

In [None]:
# Hold-out split of the training data: 68% for fitting, 32% for validation.
array = train.values                    # raw values of the training DataFrame
X, Y = array[:, :11], array[:, 11]      # first 11 columns are the features, column 11 is the label
test_size = 0.32                        # fraction of rows held out for validation
seed = 3                                # fixes the random split; a different seed gives a different split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

## IMPORTING MODELS

#### I will now use the models I imported in the beginning from sklearn to use for my algorithm.

In [None]:


# Candidate classifiers, each paired with the short label used in the printed
# report and in the comparison plot.
#
# Changes from the earlier version of this cell:
# * Estimators that use randomness while fitting (CART, RTC, SGD, MLPC) now get
#   random_state=seed, so the scores and the resulting ranking are the same on
#   every run.
# * LogisticRegression no longer receives multi_class='ovr'. The argument is
#   deprecated in recent scikit-learn, and with the liblinear solver
#   one-vs-rest is what is used anyway.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
    ('RTC', RandomForestClassifier(random_state=seed)),
    ('SGD', SGDClassifier(random_state=seed)),
    ('NC', NearestCentroid()),
    ('MLPC', MLPClassifier(random_state=seed)),
]

# The splitter does not shuffle, so it is deterministic and a single instance
# can be shared by all models (every model is scored on the same 10 folds).
kfold = StratifiedKFold(n_splits=10)

# Evaluate each model in turn with 10-fold stratified cross-validation on the
# training split, keeping the per-fold accuracies for the comparison plot.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
    

In [None]:
# Compare algorithms: one box per model, built from its per-fold accuracies
fig, ax = pyplot.subplots()
ax.boxplot(results, labels=names)
ax.set_title('Algorithm Comparison')
pyplot.show()

#### from here, RTC is the best performing, followed by NB.

In [None]:
# Fit Gaussian Naive Bayes (NB) on the training split and report its
# Matthews correlation coefficient on the validation split.
from sklearn.metrics import matthews_corrcoef

model = GaussianNB()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

# MCC is symmetric in its two arguments, so passing (true labels, predictions)
# yields the same value as the reverse order.
print('MCC', matthews_corrcoef(Y_test, predictions))

In [None]:
# Fit a random forest (RTC) on the training split and report its
# Matthews correlation coefficient on the validation split.
from sklearn.metrics import matthews_corrcoef

# random_state makes the forest, and therefore the reported score, reproducible.
model = RandomForestClassifier(random_state=seed)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

# Arguments follow the documented (y_true, y_pred) order and reuse the
# predictions computed above instead of predicting a second time.
print('MCC', matthews_corrcoef(Y_test, predictions))

### The MCC gives a score close to 1 (MCC ranges from -1 to +1, so this corresponds to close to 100%).

#### I will now test the performance of my model first using all the features, then with some selected ones

In [None]:
# Hold-out accuracy of Gaussian Naive Bayes using all 11 feature columns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

seed = 3
test_size = 0.32
array = train.values
X, Y = array[:, :11], array[:, 11]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)  # mean accuracy on the held-out rows
print("Accuracy: ", result * 100.0)


In [None]:
# Evaluate predictions
# `predictions` is recomputed from the model fitted in the previous cell.
# Without this line the cell reports on whatever an earlier cell left in
# `predictions`, i.e. on a different model than the one being discussed.
predictions = model.predict(X_test)
print(accuracy_score(Y_test, predictions))         # fraction of correct predictions
print(confusion_matrix(Y_test, predictions))       # true vs. predicted class counts
print(classification_report(Y_test, predictions))  # precision / recall / F1 per class

### This gives me a good score of 93.6%. I will now try selecting some features based on feature importance.

### This will be my first submission. It gave me a score of 99%

In [None]:
# Refit the current `model` on the full training set and predict the test set.
Y=train.CLASS
X=train.drop("CLASS",axis=1)
OUTPUT=model.fit(X, Y).predict(test.values)

# Build the submission frame: one CLASS column, index named "Index".
OUTPUT_1=pd.DataFrame(OUTPUT)
OUTPUT_1.columns=["CLASS"]
OUTPUT_1.index.name="Index"
OUTPUT_1["CLASS"]=OUTPUT_1["CLASS"].map({0.0:False,1.0:True})  #changing 0 values to False, 1 to True
OUTPUT_1.to_csv("output") #written in CSV format to a file named "output"

# Sanity checks on the predictions.
print(OUTPUT_1["CLASS"].unique()) #the distinct predicted values, 2 are expected
print(OUTPUT_1["CLASS"].nunique()) #the number of distinct predicted values
# Count each class by comparing against the label itself. The previous
# groupby(...).size()[0] / [1] indexed a label-indexed Series by position,
# which is deprecated in recent pandas and fails when only one class is predicted.
print(int(OUTPUT_1["CLASS"].eq(False).sum()))
print(int(OUTPUT_1["CLASS"].eq(True).sum()))

In [None]:
#feature selection using feature importance
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

X = train.iloc[:,0:11]  #independent columns
y = train.iloc[:,-1]    #target column

# random_state makes the importance ranking reproducible; without it every run
# grows different random trees and can order the features differently.
# NOTE(review): this rebinds the notebook-wide name `model`, so later cells that
# reuse `model` without re-creating it will be using this ExtraTreesClassifier.
model = ExtraTreesClassifier(random_state=seed)
model.fit(X,y)
print(model.feature_importances_) #impurity-based importances exposed by tree-based classifiers

#plot the ten largest feature importances as horizontal bars for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

### I will select the features with the highest bars. The number of features selected will depend on the accuracy.

In [None]:
# Remove the six features that were not selected from both frames, then
# redo the 68/32 hold-out split on the five remaining features.
dropped_features = ['FULL_AURR980107', 'FULL_OOBM850104', 'NT_EFC195',
                    'AS_DAYM780201', 'AS_FUKS010112', 'CT_RACS820104']
newtrain = train.drop(dropped_features, axis=1)
newtest = test.drop(dropped_features, axis=1)

seed = 3
test_size = 0.32
array = newtrain.values
X, Y = array[:, :5], array[:, -1]   # five feature columns, label in the last column
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

In [None]:
# Refit `model` on the reduced training set and predict the reduced test set.
# NOTE(review): `model` is not created in this cell. When the notebook is run in
# order it is the ExtraTreesClassifier left over from the feature-importance
# cell, not GaussianNB. Confirm which model this submission is meant to use.
Y_new=newtrain.CLASS
X_new=newtrain.drop("CLASS",axis=1)
OUTPUT=model.fit(X_new, Y_new).predict(newtest.values)

# Build the submission frame: one CLASS column, index named "Index".
OUTPUT_new=pd.DataFrame(OUTPUT)
OUTPUT_new.columns=["CLASS"]
OUTPUT_new.index.name="Index"
OUTPUT_new["CLASS"]=OUTPUT_new["CLASS"].map({0.0:False,1.0:True})  #changing 0 values to False, 1 to True
OUTPUT_new.to_csv("out1") #written in CSV format to a file named "out1"

# Sanity checks on the predictions.
print(OUTPUT_new["CLASS"].unique()) #the distinct predicted values, 2 are expected
print(OUTPUT_new["CLASS"].nunique()) #the number of distinct predicted values
# Count each class by comparing against the label itself. The previous
# groupby(...).size()[0] / [1] indexed a label-indexed Series by position,
# which is deprecated in recent pandas and fails when only one class is predicted.
print(int(OUTPUT_new["CLASS"].eq(False).sum()))
print(int(OUTPUT_new["CLASS"].eq(True).sum()))

#### This gave me a score of 85%

In [None]:
# Hold-out accuracy of Gaussian Naive Bayes using the five selected feature
# columns (positions 0, 2, 3, 5 and 7 of the training frame)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

seed = 3
test_size = 0.32
array = train.values
X, Y = array[:, [0,2,3,5,7]], array[:, 11]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)  # mean accuracy on the held-out rows
print("Accuracy: ", result * 100.0)

In [None]:
# Same check with six feature columns (positions 0, 1, 2, 3, 5 and 7),
# to see whether the score improves

seed = 3
test_size = 0.32
array = train.values
X, Y = array[:, [0,1,2,3,5,7]], array[:, 11]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)  # mean accuracy on the held-out rows
print("Accuracy: ", result * 100.0)

### The accuracy decreases. Let me try 4 features instead.

In [None]:
# Same check with four feature columns (positions 1, 2, 3 and 7)
seed = 3
test_size = 0.32
array = train.values
X, Y = array[:, [1,2,3,7]], array[:, 11]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)  # mean accuracy on the held-out rows
print("Accuracy: ", result * 100.0)

### Accuracy decreases even further. So if I were to do feature selection, I would use 5 features because they give me the best accuracy.

## Let me try the RTC model

In [None]:
# Hold-out accuracy of a random forest using all 11 feature columns
from sklearn.model_selection import train_test_split


array = train.values
X = array[:,0:11]   # feature columns
Y = array[:,11]     # label column
test_size = 0.32    # fraction of rows held out for validation
seed = 3
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,
random_state=seed)
# random_state makes the forest deterministic, so the accuracy below (and any
# comparison against the other models) is the same on every run.
model = RandomForestClassifier(random_state=seed)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)  # mean accuracy on the held-out rows
print("Accuracy: ",  (result*100.0))

#### this actually gives me a better score than the one from GaussianNB. Let me submit it

In [None]:
# Evaluate predictions
# `predictions` is recomputed from the model fitted in the previous cell.
# Without this line the cell reports on predictions left behind by a much
# earlier cell rather than on the model that was just trained.
predictions = model.predict(X_test)
print(accuracy_score(Y_test, predictions))         # fraction of correct predictions
print(confusion_matrix(Y_test, predictions))       # true vs. predicted class counts
print(classification_report(Y_test, predictions))  # precision / recall / F1 per class

In [None]:
# Refit the current `model` on the full training set and predict the test set.
Y_new2=train.CLASS
X_new2=train.drop("CLASS",axis=1)
OUTPUT2=model.fit(X_new2, Y_new2).predict(test.values)

# Build the submission frame: one CLASS column, index named "Index".
OUTPUT_new1=pd.DataFrame(OUTPUT2)
OUTPUT_new1.columns=["CLASS"]
OUTPUT_new1.index.name="Index"
OUTPUT_new1["CLASS"]=OUTPUT_new1["CLASS"].map({0.0:False,1.0:True})  #changing 0 values to False, 1 to True
OUTPUT_new1.to_csv("outputRTC") #written in CSV format to a file named "outputRTC"

# Sanity checks on the predictions.
print(OUTPUT_new1["CLASS"].unique()) #the distinct predicted values, 2 are expected
print(OUTPUT_new1["CLASS"].nunique()) #the number of distinct predicted values
# Count each class by comparing against the label itself. The previous
# groupby(...).size()[0] / [1] indexed a label-indexed Series by position,
# which is deprecated in recent pandas and fails when only one class is predicted.
print(int(OUTPUT_new1["CLASS"].eq(False).sum()))
print(int(OUTPUT_new1["CLASS"].eq(True).sum()))

### This gave me a score of 83%. let me feature select the 5 

In [None]:
# Hold-out accuracy of a random forest using the five selected feature
# columns (positions 0, 2, 3, 5 and 7 of the training frame)
from sklearn.model_selection import train_test_split


array = train.values
X = array[:,[0,2,3,5,7]]  # selected feature columns
Y = array[:,11]           # label column
test_size = 0.32          # fraction of rows held out for validation
seed = 3
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,
random_state=seed)
# random_state makes the forest deterministic, so this score can be compared
# fairly with the all-features run.
model = RandomForestClassifier(random_state=seed)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)  # mean accuracy on the held-out rows
print("Accuracy: ",  (result*100.0))

##### this gives me a lower score than when all features are selected. So i will stop here.