# Importing Libraries

In [None]:
import numpy as np #numpy for arithmatic operation
import pandas as pd # pandas for dataset operations
import matplotlib.pyplot as plt # visualization library
%matplotlib inline              
import seaborn as sns           #scientific visualization
from sklearn.model_selection import train_test_split # dataset split for training and test
from sklearn.tree import DecisionTreeClassifier # Decision tree model library
from sklearn.neighbors import KNeighborsClassifier # KNN model Library
from sklearn.ensemble import RandomForestClassifier # random forest model library
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix #performance evaluation metrics
from sklearn.metrics import f1_score #performance evaluation metrics for F1 score
# Feature dimensionality reduction by PCA
from sklearn.decomposition import PCA


# A. Downloading the datasets

## 1. Importing the dataset

In [None]:
# Load the Covertype CSV from its project-relative path into a DataFrame.
df=pd.read_csv('dataset/data/covertype_csv.csv')

## 2.	Describe the dataset and the classification task, more information about the dataset can be found in UCI repository. 

In [None]:
# Preview the first few rows to see the column names and typical values.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count, used to check for missing values.
df.info()

The dataset contains 581,012 rows and 55 columns and is used for a classification task. It does not contain any missing values. The dataset is multivariate: it holds cartographic variables with both categorical and numeric values. The `class` column (cover type) is the target variable, which makes this a multiclass classification task.

# B. Data Exploration:


## 3. Display the number of instances.
## 4.	Display the number of attributes.
## 5.    Display the number of classes.

In [None]:
# Report the number of instances, attributes and distinct classes.
# nunique() yields the class count; unique() would print the array of labels instead.
print(f"The data has {df.shape[0]} records, {df.shape[1]} attributes, and {df['class'].nunique()} clasess")

##  6.	For each class label, display the code of the class label and the name of that class.


In [None]:
# Cover-type codes and their names as listed in the UCI Covertype description.
# Assumes the 'class' column holds the integer codes 1-7 — TODO confirm.
cover_type_names = {
    1: 'Spruce/Fir',
    2: 'Lodgepole Pine',
    3: 'Ponderosa Pine',
    4: 'Cottonwood/Willow',
    5: 'Aspen',
    6: 'Douglas-fir',
    7: 'Krummholz',
}
# One row per class code: its name and the number of instances carrying it.
class_summary = df.groupby('class').size().to_frame('instances')
class_summary.insert(0, 'name', class_summary.index.map(cover_type_names))
class_summary

##  7.	Summarise the class distribution using a suitable graph.


In [None]:
# The target is a discrete label, so show the number of instances per class
# as a count plot (one bar per class). distplot draws a density estimate,
# which is meaningless for class codes, and it is deprecated in seaborn.
sns.countplot(x='class', data=df)

##  8. Display a statistical summary for all the attributes.


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

#   C. Data Preprocessing


## 9. Check whether the selected dataset has any data quality issues and choose suitable strategies to deal with any issue (if exists).

In [None]:

# Inspect data quality visually: draw a 50-bin histogram for every column so
# skewed or otherwise unusual attribute distributions stand out.
# NOTE(review): the original note said most attributes look roughly normal
# while some do not — confirm against the plots.
df.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

##  10.	Convert the multiclass classification problem into a binary classification problem.

In [None]:
# Collapse the multiclass target into a binary one, touching ONLY the 'class'
# column: codes up to 3 become 0 and codes from 4 upwards become 1.
# (Assigning through df.loc[row_mask] without a column label overwrites every
# column of the matching rows, which would wipe out all feature values.)
# NOTE: run this cell once; re-running it on the already binarised column
# would map every row to 0.
df['class'] = (df['class'] >= 4).astype(int)

In [None]:
# Sanity check: number of instances per class after the binary conversion.
df.groupby('class').size()

In [None]:
# Features: every column except the last one
# (assumes the target 'class' is the last column — TODO confirm).
X=df.iloc[:, :-1]
X.head()

In [None]:
# Target variable: the last column, taken as a 1-D Series.
# (Slicing with -1: would give a one-column DataFrame, which scikit-learn
# estimators accept only with a column-vector warning.)
# Assumes the target 'class' is the last column — TODO confirm.
y=df.iloc[:, -1]
y.head()

## 11. Use a features selection technique to select those features in your data that contribute most to the prediction.

*Take a small sample of the data, since the full dataset is too large and causes the machine to hang.*


In [None]:
# Down-sample to 0.1% of the rows. Sampling is seeded for reproducibility and
# done without replacement so no row is duplicated (duplicates could later
# land in both the training and the evaluation splits).
df = df.sample(frac=0.001, random_state=42)
# X and y were extracted before this cell, so restrict them to the sampled
# rows as well; otherwise the sample would have no effect on the modelling.
X = X.loc[df.index]
y = y.loc[df.index]

In [None]:
# (rows, columns) of the down-sampled frame
df.shape

In [None]:
# PCA model that keeps 8 principal components (seeded for reproducibility)
pca = PCA(n_components=8, random_state=42)
# fit the PCA model on the features
# NOTE(review): the features are not scaled before PCA here — confirm this is intended.
pca.fit(X)

In [None]:
# Replace the features with their projection onto the fitted principal components
X = pca.transform(X)

## 12. Divide your dataset into training, validation and testing datasets.


In [None]:
# Target proportions of the whole dataset:
#   training   = 50%
#   testing    = 30%
#   validation = 20%
random_seed = 123
# First split: half of the rows for training, the other half held out.
# Both splits are seeded and stratified so they are reproducible and keep
# the class ratio of the target.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y,
    train_size=0.5,
    random_state=random_seed,
    stratify=y)
# Second split: 60% of the held-out half becomes the test set (30% overall),
# the remaining 40% becomes the validation set (20% overall).
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout,
    train_size=0.6,
    random_state=random_seed,
    stratify=y_holdout)
    

# D. classification

## 13.	Build classification models.



### a.	Use three different learning algorithms to generate three classification models. You should choose one learning algorithm from each of the following categories:




In [None]:
# Display names of the models; paired by position with the `classifiers` list.
model_names = ['Decision_Tree', 'Nearest_Neighbors', 'Random_Forest']

In [None]:
# One estimator per entry of model_names, in the same order.
classifiers = [
    DecisionTreeClassifier(random_state=24),
    KNeighborsClassifier(n_neighbors=3),
    RandomForestClassifier(random_state=24),
]

In [None]:
# Fit every classifier on the training split and keep its predictions for the
# test and validation splits, in the same order as `classifiers`.
predictions_test = []
predictions_vald = []

# Estimators expect a 1-D target; flattening avoids the column-vector warning
# when y_train is a one-column DataFrame and is a no-op for a Series.
y_train_1d = np.ravel(y_train)

for clf in classifiers:
    clf.fit(X_train, y_train_1d)
    predictions_test.append(clf.predict(X_test))
    predictions_vald.append(clf.predict(X_val))




In [None]:
# Accuracy of every model on the test split
accuracy_test = []

for name, pred in zip(model_names, predictions_test):
    score = accuracy_score(y_test, pred)
    # collect into the list declared above (the name `accuracy` is not defined at this point)
    accuracy_test.append(score)
    print(f'{name} test accuracy is {score}')


In [None]:
# Accuracy of every model on the validation split
accuracy_vald = []

for name, pred in zip(model_names, predictions_vald):
    # validation predictions must be scored against the validation labels
    score = accuracy_score(y_val, pred)
    accuracy_vald.append(score)
    print(f'{name} validation accuracy is {score}')

#### i.	Decision Tree

In [None]:
# Decision tree classifier with a fixed random state for reproducibility
dtree = DecisionTreeClassifier(random_state=24) # using the random state for reproducibility

In [None]:
# Train the decision tree on the training split
dtree = dtree.fit(X_train, y_train)

In [None]:
# Decision tree predictions for the test split
dtree_pred = dtree.predict(X_test)

In [None]:
# Evaluate the decision tree on the test split: per-class precision, recall and F1
print(classification_report(y_test, dtree_pred))


#### ii. Nearest Neighbor Classifier, ~~Naive Bayes Classifier, Support Vector Machine~~

In [None]:
# k-nearest-neighbours classifier; uses the same k=3 as the KNN entry in
# `classifiers` so the per-model results reported in different sections agree.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Train the KNN model on the training split
knn =knn.fit(X_train, y_train)

In [None]:
# KNN predictions for the test split
knn_prediction = knn.predict(X_test)

#### iii.   ~~Bagging, Boosting,~~ Random Forest

In [None]:
# Random forest classifier with a fixed random state for reproducibility
forest= RandomForestClassifier(random_state=24)

In [None]:
# Train the random forest on the training split
forest=forest.fit(X_train, y_train)

In [None]:
# Random forest predictions for the test split
forest_prediction= forest.predict(X_test)

## 14.	For each classification model:


a.	Try to find the most accurate classifier (avoid overfitting).


In [None]:
 # Test-set accuracy of the three individually trained models.
dt_accuracy, rf_accuracy, knn_accuracy = (
    accuracy_score(y_test, test_pred)
    for test_pred in (dtree_pred, forest_prediction, knn_prediction)
)

# Print them in the order: decision tree, random forest, KNN.
for acc_value in (dt_accuracy, rf_accuracy, knn_accuracy):
    print(acc_value)

# E. Evaluation

## 15.	Evaluate your classification models on the validation and the testing datasets.


In [51]:
# Classification report and confusion matrix of every model on the test split.
for model_label, test_pred in zip(model_names, predictions_test):
    print("Model: ", model_label)
    test_report = classification_report(y_test, test_pred)
    print("classification report testing dataset: ", test_report)
    test_matrix = confusion_matrix(y_test, test_pred)
    print("Confusion matrix for testing dataset: ", test_matrix)
    

Model:  Decision_Tree
Confusion matrix for testing dataset:  [0 0 0 ... 0 0 0]
Confusion matrix for validation dataset:  [0 0 0 ... 0 0 0]


In [None]:
# Classification report and confusion matrix of every model on the validation
# split. The stored validation predictions are scored against y_val, the
# validation labels produced by the split.
for name, pred in zip(model_names, predictions_vald):
    print("Model: ", name)
    print("classification report validation dataset: ", classification_report(y_val, pred))
    print("Confusion matrix for validation dataset: ", confusion_matrix(y_val, pred))

### a.	For each classification model, print out a confusion matrix for the validation and testing datasets.


In [None]:
# Decision tree predictions on the test and validation splits, used by the
# evaluation cells that follow.
dtree_test_pred = dtree.predict(X_test)
dtree_vald_pred = dtree.predict(X_val)

In [None]:
# Confusion matrices of the decision tree on the test and validation splits.
# Each set of predictions is compared with the labels of its own split.
cm1 = confusion_matrix(y_test, dtree_test_pred)
cm2 = confusion_matrix(y_val, dtree_vald_pred)
print("Decision tree confusion matrix (testing dataset):")
print(cm1)
print("Decision tree confusion matrix (validation dataset):")
print(cm2)

### b. Use the following evaluation measures to evaluate the performance of the generated classification models:
###    *i. Accuracy	ii. Error rate	iii. F -measure*

- #### *Testing Dataset*

##### i.Accuracy

In [None]:
# Test-set predictions being evaluated in this section. The decision tree is
# used here; swap in knn_prediction or forest_prediction to evaluate another model.
y_pred = dtree_test_pred
model_accuracy = accuracy_score(y_test, y_pred)
print(model_accuracy)

##### ii. Error rate

In [None]:
# Error rate is the complement of accuracy: the share of misclassified test instances.
error_rate = 1 - model_accuracy
print(error_rate)

##### iii. F -measure

In [None]:
# F-measure of the decision tree on the test split (binary target, positive label 1).
print('F1 score:', f1_score(y_test, dtree_test_pred))

- #### Validation Dataset


##### i. accuracy


In [None]:
# Validation-set predictions being evaluated in this section (decision tree),
# scored against the validation labels rather than the test labels.
prediction = dtree_vald_pred
classifier_accuracy = accuracy_score(y_val, prediction)
print('Accuracy:', classifier_accuracy)

##### ii. error rate


In [None]:
# Validation error rate: the complement of the validation accuracy.
error_rate_val=1-classifier_accuracy
print('error_rate:',error_rate_val)

##### iii. F-measure

In [None]:
# F-measure of the decision tree on the validation split, scored against the
# validation labels.
print('F1 score:', f1_score(y_val, dtree_vald_pred))

## 16. Compare between the performances of all the classification models using suitable chart (The type of chart should be different from the type of the chart that is used in the data exploration stage).


In [None]:
# Echo the three test accuracies (decision tree, random forest, KNN) before plotting.
for model_score in (dt_accuracy, rf_accuracy, knn_accuracy):
    print(model_score)

In [None]:
# Bar chart comparing the test accuracy of the three models.
models = ['dt_accuracy', 'rf_accuracy', 'knn_accuracy']
# Plot the accuracies computed earlier in the notebook instead of hard-coded
# numbers, so the chart always reflects the current run.
accuracy = [dt_accuracy, rf_accuracy, knn_accuracy]
plt.bar(models, accuracy)
plt.title('Accuracy of Models')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.show()