# K nearest neighbors Project 
### University of Denver Data Science Tools 2 
## Isabel Osgood 

### research question: Create a classifier for the different types of stars using k-nearest neighbors

In [1]:
import pandas as pd 
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
#uploading the data
stars = pd.read_csv("stars.csv")
stars.head()
stars.loc[0:4]

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


#### Data preprocessing 

In [3]:
from sklearn.preprocessing import MinMaxScaler

#normalize numeric data 
scaler = MinMaxScaler()
scaler = scaler.fit(stars[stars.columns[0:4]])
stars[stars.columns[0:4]] = scaler.transform(stars[stars.columns[0:4]])
    
stars.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,0.029663,2.731275e-09,8.3e-05,0.876798,0,Red,M
1,0.02898,4.94455e-10,7.5e-05,0.891807,0,Red,M
2,0.017367,2.590003e-10,4.8e-05,0.957473,0,Red,M
3,0.022622,1.412729e-10,7.8e-05,0.893371,0,Red,M
4,0.0,6.828189e-11,4.9e-05,1.0,0,Red,M


#### Data Splitting

In [8]:
from sklearn.model_selection import train_test_split

y = stars['Star type']
X = stars.drop(['Star color', 'Star type', 'Spectral Class'], axis=1) #exclude non-numeric features (assignment specific)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### Model Building 

In [5]:
#model construction 
knn = KNeighborsClassifier(n_neighbors=len(set(y)))

knn.fit(X_train, y_train)
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 6,
 'p': 2,
 'weights': 'uniform'}

In [6]:
from sklearn.model_selection import GridSearchCV
param_grid = {'leaf_size': [10, 20, 30, 40, 50],
             'p': [1,2], 
             'weights': ['uniform', 'distance']}

clf = GridSearchCV(knn, param_grid)
clf = clf.fit(X_train, y_train)
clf.best_estimator_

KNeighborsClassifier(leaf_size=10, n_neighbors=6, p=1, weights='distance')

#### Model Evaluation 

In [7]:
from sklearn.metrics import accuracy_score
knn_best = clf.best_estimator_
print("Training Accuracy: ")
print(knn_best.score(X_train, y_train))
print("Test Accuracy:")
print(knn_best.score(X_test, y_test))

Training Accuracy: 
1.0
Test Accuracy:
1.0


#### Conclusion

In [None]:
The KNN model was very succesful at classifying stars basiced on numeric metrics 