<a href="https://colab.research.google.com/github/cagoodri2/cagoodri2.github.io/blob/master/Sample_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Corissa Goodrich

Project: Machine Learning Analysis - Classifying Meteorite Type

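The following code is a sample of the complete project. The cells that load the data and create `meteor`, `X_train`, `X_test`, `y_train`, and `y_test` are not included in this sample, so it does not run on its own.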

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) #removal of continual future warning

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px #for geographic visualization
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# SAMPLE FEATURE ENGINEERING
# The original dataset has 422 unique values for meteorite classification (`recclass`).
# For the purposes of this project they are collapsed into four labels stored in the
# `types` column: 'stone', 'iron', 'a_chon' and 'chon'.

# Classes labelled 'a_chon'. These are compared with `recclass` by exact match (isin).
# 'Lodranite' was previously misspelled 'Iodranite' (capital I) and so could never match.
a_chon = ['Lodranite', 'Acapulcoite', 'Winonaite', 'Martian','Shergottite','Chassignite','ALH 84001 opx','Nakhlites',
          'Aubrite','Ureilite', 'HED', 'Eucrite', 'Diogenite','Howardite', 'Angrite','Brachinite',
          'Lunar','Feldspathic Breccias', 'Basaltic','Polymict']

# Substring rules on `recclass`, applied in this order.
# 'Pallasite' was previously misspelled 'Palasite', which is not a substring of
# "Pallasite", so that rule never labelled any row.
recclass_rules = [
    ('Pallasite', 'stone'),
    ('Mesosiderite', 'stone'),
    ('Iron', 'iron'),
]
for recclass_pattern, type_label in recclass_rules:
    meteor.loc[meteor["recclass"].str.contains(recclass_pattern), ['types']] = type_label

# Exact-match rule; runs after the substring rules, so it wins if both apply to a row.
meteor.loc[meteor["recclass"].isin(a_chon), ['types']] = 'a_chon'

# Any row whose `types` value still contains a space is labelled 'chon'.
# NOTE(review): presumably `types` is initialised before this cell with a placeholder
# that contains a space -- confirm against the full project.
meteor.loc[meteor["types"].str.contains(" "), ['types']] = 'chon'

meteor.sample(n=25) #ran x 10 to check categorization



In [None]:
#SAMPLE ADABOOST WITH STANDARD PARAMETERS
# Default hyperparameters. random_state is pinned so that re-running the notebook
# reproduces the same fitted model; `adb` is also the estimator handed to
# GridSearchCV further down.
adb = AdaBoostClassifier(random_state=42)
adb = adb.fit(X_train, y_train)

# Predictions on both splits, used by the report and metric cells that follow.
pred_train = adb.predict(X_train)
pred_test = adb.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the training and testing splits.
# zero_division=1 makes a metric whose denominator is zero show as 1 instead of warning.
report_splits = (
    ("\nClassification Report for Training Dataset:", y_train, pred_train),
    ("\nClassification Report for Testing Dataset:", y_test, pred_test),
)
for split_heading, split_true, split_pred in report_splits:
    print(split_heading)
    print(metrics.classification_report(split_true, split_pred, zero_division=1))

In [None]:
# scikit-learn metric functions take (y_true, y_pred) in that order. The arguments were
# previously swapped: accuracy and micro-averaged F1 give the same number either way, but
# confusion_matrix puts its first argument on the rows, so both matrices came out
# transposed (rows = predicted instead of rows = actual).
a_train = metrics.accuracy_score(y_train, pred_train)
a_test = metrics.accuracy_score(y_test, pred_test)
c_train = metrics.confusion_matrix(y_train, pred_train)  # rows = actual, columns = predicted
c_test = metrics.confusion_matrix(y_test, pred_test)     # rows = actual, columns = predicted
f1 = metrics.f1_score(y_test, pred_test, average ='micro') #multiclass target variable, avg set to micro


print("\nAccuracy Score for Training Set: ", round(a_train,2))
print("\nAccuracy Score for Testing Set: ", round(a_test,2))
# A large positive gap between train and test accuracy points to overfitting.
print('\nDifference between scores: ', round((a_train - a_test),2))
print('\nConfusion Matrix for the Training Set\n', c_train)
print('\nConfusion Matrix for Testing Set\n', c_test)
print('\nF1 Score for testing set: ', round(f1,2))

In [None]:
#SAMPLE ADABOOST WITH GRIDSEARCH
# Candidate values: 4 learning rates x 5 ensemble sizes = 20 combinations.
param = {
    "learning_rate": [0.001, 0.01, 0.1, 1.0],
    "n_estimators": [1, 2, 3, 4, 5],
}

# 10-fold cross-validated search on the training split; n_jobs=-1 runs the fits in parallel.
adb_grid = GridSearchCV(adb, param, cv=10, n_jobs=-1).fit(X_train, y_train)

# Show the winning combination as the cell output.
adb_grid.best_params_

In [None]:
# Refit AdaBoost with the hyperparameters chosen by the grid search above.
# They are read from `adb_grid.best_params_` rather than copied in by hand, so this cell
# cannot drift out of sync with the search result. random_state is pinned so that
# re-running the notebook reproduces the same fitted model.
adb_best = AdaBoostClassifier(**adb_grid.best_params_, random_state=42)
adb_best = adb_best.fit(X_train, y_train)

# Overwrites the predictions of the default model; the cells below now evaluate `adb_best`.
pred_train = adb_best.predict(X_train)
pred_test = adb_best.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the training and testing splits.
# zero_division=1 makes a metric whose denominator is zero show as 1 instead of warning.
report_splits = (
    ("\nClassification Report for Training Dataset:", y_train, pred_train),
    ("\nClassification Report for Testing Dataset:", y_test, pred_test),
)
for split_heading, split_true, split_pred in report_splits:
    print(split_heading)
    print(metrics.classification_report(split_true, split_pred, zero_division=1))

In [None]:
# scikit-learn metric functions take (y_true, y_pred) in that order. The arguments were
# previously swapped: accuracy and micro-averaged F1 give the same number either way, but
# confusion_matrix puts its first argument on the rows, so both matrices came out
# transposed (rows = predicted instead of rows = actual).
a_train = metrics.accuracy_score(y_train, pred_train)
a_test = metrics.accuracy_score(y_test, pred_test)
c_train = metrics.confusion_matrix(y_train, pred_train)  # rows = actual, columns = predicted
c_test = metrics.confusion_matrix(y_test, pred_test)     # rows = actual, columns = predicted
f1 = metrics.f1_score(y_test, pred_test, average ='micro') #multiclass target variable, avg set to micro


print("\nAccuracy Score for Training Set: ", round(a_train,2))
print("\nAccuracy Score for Testing Set: ", round(a_test,2))
# A large positive gap between train and test accuracy points to overfitting.
print('\nDifference between scores: ', round((a_train - a_test),2))
print('\nConfusion Matrix for the Training Set\n', c_train)
print('\nConfusion Matrix for Testing Set\n', c_test)
print('\nF1 Score for testing set: ', round(f1,2))

In [None]:
#SEABORN HEATMAP CONFUSION MATRIX
# The heatmap draws matrix rows on the y-axis and columns on the x-axis, and
# confusion_matrix puts its first argument on the rows. The matrix is rebuilt here in
# (y_true, y_pred) order so that the 'Actual' (y) and 'Predicted' (x) labels are correct.
cm_test = metrics.confusion_matrix(y_test, pred_test)

fig, ax = plt.subplots()
# Each cell is shown as a share of all test samples (count / total), formatted as a percentage.
sns.heatmap(cm_test / np.sum(cm_test), annot=True, cmap='Blues', fmt='.2%', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Test-set confusion matrix (share of all test samples)')
plt.show()