In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

## Importing Libraries

In [None]:
###Importing libraries
import pandas
import scipy
import numpy
import matplotlib
import sklearn



## Loading some libraries
### These are some of the libraries I think I will need

In [None]:
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation/convergence warnings.
warnings.simplefilter('ignore')

## Loading My Dataset

##### I am going to first use my training dataset, then the test dataset last.

In [None]:
# Load the labelled training set and preview its first rows only
# (a bare `train` would dump the whole, truncated frame into the output).
train = pd.read_csv('/kaggle/input/ace-class-assignment/AMP_TrainSet.csv')
train.head()


In [None]:
# Load the test file that predictions are produced for; show its size and a
# short preview instead of dumping the whole frame.
test = pd.read_csv('/kaggle/input/ace-class-assignment/Test.csv')
print(test.shape)
test.head()

## Inspecting my train dataset

In [None]:
# (number of rows, number of columns) of the training frame
train.shape

In [None]:
# Per-column count of missing (null) values in the training data
train.isna().sum()

In [None]:
# Per-column count of non-null values in the training data
train.notna().sum()

##### It seems I have no missing values in my data.

In [None]:
train.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [None]:
train.info()  # prints the index range, each column's non-null count and dtype, and the memory usage

##### This shows that my dataset has 3038 rows (instances) and 12 columns (attributes)

##### I will also take a look at how many instances I have for each class.

In [None]:
# Number of training rows in each CLASS group
class_sizes = train.groupby('CLASS').size()
class_sizes

In [None]:
# Bar chart of the number of training rows per CLASS value, with a title and
# axis labels so the figure can be read on its own.
class_axes = train.groupby('CLASS').size().plot(kind='bar')
class_axes.set_title('Instances per class')
class_axes.set_xlabel('CLASS')
class_axes.set_ylabel('Number of instances')
pyplot.show()

#### I have two groups of classes, each with 1519 instances

# DATA VISUALISATION

#### I will start with univariate plots to look at each individual variable

In [None]:
# Univariate view: one histogram per column, drawn on a 16x16-inch figure.
hist_size = (16, 16)
train.hist(figsize=hist_size)
pyplot.show()

In [None]:
# Pearson correlation of every column with the CLASS column
pearson_matrix = train.corr(method='pearson')
pearson_matrix['CLASS']

## Multivariate 

In [None]:
# Heat map of the pairwise correlation matrix of the training columns.
correlations = train.corr()
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per column of the matrix. A hard-coded 9-tick range does not match
# the number of labels passed below, which mislabels the axes and makes
# recent Matplotlib versions raise on the count mismatch.
ticks = np.arange(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Label from the matrix itself so ticks and labels always line up; the x
# labels are rotated because the column names are long.
ax.set_xticklabels(correlations.columns, rotation=90)
ax.set_yticklabels(correlations.columns)
pyplot.show()

#### I made this plot to see which features are highly correlated with each other, since using highly correlated features together can be a problem in some models.

# EVALUATING ALGORITHMS
#### I will now evaluate some algorithms and estimate their accuracy on unseen data.

### Building models

In [None]:
# Split the training frame into features (first 11 columns) and label
# (column 11), then hold out 32% of the rows with a fixed seed.
array = train.values
X, Y = array[:, :11], array[:, 11]
test_size, seed = 0.32, 3
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

In [None]:
# Number of rows held out for validation
X_test.shape[0]

In [None]:


# Candidate classifiers to spot-check, keyed by a short display name.
models = [
    # multi_class is left at its default: with the liblinear solver
    # scikit-learn fits one-vs-rest anyway, and passing the argument
    # explicitly is deprecated in recent releases.
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    # Seeded so the tree's cross-validation scores are reproducible.
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]
# evaluate each model in turn
# The unshuffled splitter is deterministic, so one instance serves all models.
kfold = StratifiedKFold(n_splits=10)
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Compare Algorithms
# Box plot of the per-fold accuracies collected in `results`, one box per model.
pyplot.boxplot(results)
# Name the boxes through the x ticks (boxes sit at positions 1..N by default).
# This replaces boxplot's `labels=` keyword, which is deprecated in recent
# Matplotlib releases, and works on old and new versions alike.
pyplot.xticks(range(1, len(names) + 1), names)
pyplot.title('Algorithm Comparison')
pyplot.ylabel('Accuracy')
pyplot.show()

#### From these results, NB (Gaussian Naive Bayes) is the best-performing model.

In [None]:
# Fit Gaussian Naive Bayes on the training split and predict the held-out rows.
model = GaussianNB().fit(X_train, Y_train)
predictions = model.predict(X_test)
print(predictions)

# Optional extra metric, currently disabled:
#from sklearn.metrics import matthews_corrcoef
#print('MCC', matthews_corrcoef(model.predict(X_test), Y_test))

In [None]:
#with all features
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Rebuild the 68/32 split over all 11 feature columns and report the
# hold-out accuracy of Gaussian Naive Bayes as a percentage.
array = train.values
X, Y = array[:, :11], array[:, 11]
test_size, seed = 0.32, 3
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  result * 100.0)

In [None]:
# Report accuracy, the confusion matrix and the per-class report for the
# hold-out predictions, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_test, predictions))

In [None]:
# Quick reminder of the test frame's layout before predicting on it; a short
# preview is enough here.
test.head()

In [None]:
# Refit the current `model` on the whole training frame (every column except
# CLASS) and write its predictions for the test file to "output".
# NOTE(review): assumes `test` holds the same feature columns, in the same
# order, as `train` without CLASS — confirm.
Y = train.CLASS
X = train.drop("CLASS", axis=1)
OUTPUT = model.fit(X, Y).predict(test.values)
OUTPUT_1 = pd.DataFrame({"CLASS": OUTPUT})
OUTPUT_1.index.name = "Index"
# Convert the numeric predictions (0 / 1) to boolean labels.
OUTPUT_1["CLASS"] = OUTPUT_1["CLASS"].map({0.0: False, 1.0: True})
OUTPUT_1.to_csv("output")
print(OUTPUT_1["CLASS"].unique())
print(OUTPUT_1["CLASS"].nunique())
# Count each label by direct comparison rather than by positional indexing
# into the group sizes: this prints 0 (instead of failing) when a label was
# never predicted and does not depend on how a bool index treats integer keys.
print(int(OUTPUT_1["CLASS"].eq(False).sum()))
print(int(OUTPUT_1["CLASS"].eq(True).sum()))

In [None]:
#feature selection using feature importance
X = train.iloc[:,0:11]  #independent columns
y = train.iloc[:,-1]    #target column
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Seeded: the forest is randomized, and without a fixed seed the importance
# ranking (which the later hand-picked column lists rely on) changes per run.
model = ExtraTreesClassifier(random_state=seed)
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
importance_axes = feat_importances.nlargest(10).plot(kind='barh')
importance_axes.set_title('Top 10 feature importances (ExtraTrees)')
importance_axes.set_xlabel('Importance')
plt.show()

In [None]:
# Column labels of the training frame (the feature columns plus CLASS)
train.columns

In [None]:
# Reduced training frame: the six listed feature columns are removed.
newtrain = train.drop(
    columns=[
        'FULL_AURR980107',
        'FULL_OOBM850104',
        'NT_EFC195',
        'AS_DAYM780201',
        'AS_FUKS010112',
        'CT_RACS820104',
    ]
)

In [None]:
# Reduced test frame: the same six feature columns are removed as for training.
newtest = test.drop(
    columns=[
        'FULL_AURR980107',
        'FULL_OOBM850104',
        'NT_EFC195',
        'AS_DAYM780201',
        'AS_FUKS010112',
        'CT_RACS820104',
    ]
)

In [None]:
# Preview the reduced training frame (first rows only, not the whole frame).
newtrain.head()

In [None]:
# Split the reduced frame: the first five columns are the features, the last
# column is the label; 32% of the rows are held out with a fixed seed.
array = newtrain.values
X, Y = array[:, :5], array[:, -1]
test_size, seed = 0.32, 3
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

In [None]:
# Candidate classifiers to spot-check on the reduced feature set.
models = [
    # multi_class is left at its default: with the liblinear solver
    # scikit-learn fits one-vs-rest anyway, and passing the argument
    # explicitly is deprecated in recent releases.
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    # Seeded so the tree's cross-validation scores are reproducible.
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]
# evaluate each model in turn
results = []
names = []
for name, model in models:
    # Shuffled folds need a fixed random_state: without it the folds differ
    # on every run and between models, so the scores are neither reproducible
    # nor comparable.
    kfold = StratifiedKFold(n_splits=23, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Hold-out accuracy (as a percentage) of Gaussian Naive Bayes on the current split.
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  result * 100.0)

In [None]:
# Evaluate predictions
# Predict with the model fitted in the previous cell first. Without this line
# the metrics below are computed from the `predictions` array produced much
# earlier by the all-features model, not from the model being evaluated here.
predictions = model.predict(X_test)
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))

In [None]:
# Refit the current `model` on the whole reduced training frame (every column
# except CLASS) and write its predictions for the reduced test frame to "out1".
# NOTE(review): assumes `newtest` holds the same feature columns, in the same
# order, as `newtrain` without CLASS — confirm.
Y = newtrain.CLASS
X = newtrain.drop("CLASS", axis=1)
OUTPUT = model.fit(X, Y).predict(newtest.values)
OUTPUT_1 = pd.DataFrame({"CLASS": OUTPUT})
OUTPUT_1.index.name = "Index"
# Convert the numeric predictions (0 / 1) to boolean labels.
OUTPUT_1["CLASS"] = OUTPUT_1["CLASS"].map({0.0: False, 1.0: True})
OUTPUT_1.to_csv("out1")
print(OUTPUT_1["CLASS"].unique())
print(OUTPUT_1["CLASS"].nunique())
# Count each label by direct comparison rather than by positional indexing
# into the group sizes: this prints 0 (instead of failing) when a label was
# never predicted and does not depend on how a bool index treats integer keys.
print(int(OUTPUT_1["CLASS"].eq(False).sum()))
print(int(OUTPUT_1["CLASS"].eq(True).sum()))

In [None]:
#checking accuracy using data with selected features from feature importance
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import matthews_corrcoef

# Hold-out check of Gaussian Naive Bayes on five hand-picked feature columns
# (positions 0, 1, 3, 5 and 7); reports accuracy (%) and the Matthews
# correlation coefficient.
array = train.values
X, Y = array[:, [0, 1, 3, 5, 7]], array[:, 11]
test_size, seed = 0.32, 3
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
prediction = model.predict(X_test)
# `matrix` holds the MCC score (a single number, despite the name).
matrix = matthews_corrcoef(Y_test, prediction)
print("Accuracy: ",  result * 100.0)
print(matrix)

# making predictions on the test data

#output=model.predict(test.values)
#output1=pd.DataFrame(output)
#output1.columns=['CLASS']
#output1.index.name='Index'
#output1['CLASS']=output1['CLASS'].map({0.0:False, 1.0:True})

#print(output1['CLASS'].unique())
#print(output1['CLASS'].nunique())

#print(output1.groupby('CLASS').size()[0].sum())
#print(output1.groupby('CLASS').size()[1].sum())



In [None]:
# Train Gaussian Naive Bayes on the five selected feature columns of the full
# training set, then predict the class of every row of the test file.
selected_columns = [0, 1, 3, 5, 7]

# Taken from `train` explicitly so the cell does not depend on whatever
# `array` an earlier cell happened to leave behind.
array = train.values
array2 = np.array(test)

X = array2[:, selected_columns]     # test features to predict on
x_t = array[:, selected_columns]    # training features
y_t = array[:, 11]                  # training labels (CLASS column)

# The former `test = array3[...]` statement is gone: `array3` is never
# defined, so it raised NameError, and it would also have replaced the test
# DataFrame with an array.
model = GaussianNB()
model.fit(x_t, y_t)
prediction = model.predict(X)

#matrix=matthews_corrcoef(Y_t,prediction)
#print("Accuracy: ",  (result*100.0))
#print(matrix)

## FEATURE SELECTION

In [None]:
#feature selection using Logistic regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination with logistic regression as the estimator:
# keep the 5 best of the 11 feature columns and print the selection.
array_1 = train.values
X = array_1[:,0:11]
Y = array_1[:,11]
# feature extraction
model = LogisticRegression()
# n_features_to_select must be passed by keyword; current scikit-learn
# releases reject it as a positional argument.
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, Y)
print("Num Features: ",  fit.n_features_)
print("Selected Features:",  fit.support_)
print("Feature Ranking: ",  fit.ranking_)

In [None]:
# Hold-out accuracy of Gaussian Naive Bayes on feature columns 0, 2, 3, 5 and 7.
# NOTE(review): presumably these are the columns chosen by the RFE cell above —
# verify against its printed support mask.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

array = train.values
X, Y = array[:, [0, 2, 3, 5, 7]], array[:, 11]
test_size, seed = 0.32, 3
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  result * 100.0)




In [None]:
# Baseline for comparison: hold-out accuracy of Gaussian Naive Bayes when all
# 11 feature columns are used.
array = train.values
X, Y = array[:, :11], array[:, 11]
test_size, seed = 0.32, 3
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  result * 100.0)


In [None]:
# First five rows of the training frame (head() defaults to 5 rows)
train.head()