## EDA 

This data is from the ***Journal of Statistics Education***. <br> 
Link: http://jse.amstat.org/jse_data_archive.htm

In [1]:
import matplotlib.pyplot as plt
# adjusts style to emulate ggplot 
# a popular plotting package for R
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import seaborn as sns 

from scipy.stats import randint

# sklearn packages 
from sklearn import preprocessing 
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read in the fish measurement data.
# header=None makes pandas treat the first line as data rather than column
# names, so the columns receive integer labels (0, 1, 2, ...).
fish_df = pd.read_csv("fish.csv", header=None)
# Preview the first five rows
fish_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bream,242.0,23.2,25.4,30.0,38.4,13.4
1,Bream,290.0,24.0,26.3,31.2,40.0,13.8
2,Bream,340.0,23.9,26.5,31.1,39.8,15.1
3,Bream,363.0,26.3,29.0,33.5,38.0,13.3
4,Bream,430.0,26.5,29.0,34.0,36.6,15.1


In [3]:
# Summarize the column dtypes and non-null counts of the loaded data
fish_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 7 columns):
0    85 non-null object
1    85 non-null float64
2    85 non-null float64
3    85 non-null float64
4    85 non-null float64
5    85 non-null float64
6    85 non-null float64
dtypes: float64(6), object(1)
memory usage: 4.8+ KB


In [4]:
# List the distinct fish species labels found in column 0
fish_df[0].unique()

array(['Bream', 'Roach', 'Smelt', 'Pike'], dtype=object)

In [5]:
# Per-column tally of missing (NaN) entries
fish_df.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

## Encode Fish Species Numerically 

In [6]:
# Turn every text (object-dtype) column into integer class codes
# using a single sklearn LabelEncoder instance
le = preprocessing.LabelEncoder()

# Walk the columns; anything that is not object-dtype is left untouched
for col in fish_df:
    if fish_df[col].dtype != 'object':
        continue
    # Learn the set of labels and map them to integers in one step
    fish_df[col] = le.fit_transform(fish_df[col])

In [7]:
# Re-check the column dtypes after label encoding
fish_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 7 columns):
0    85 non-null int64
1    85 non-null float64
2    85 non-null float64
3    85 non-null float64
4    85 non-null float64
5    85 non-null float64
6    85 non-null float64
dtypes: float64(6), int64(1)
memory usage: 4.8 KB


## Split Into Training and Test Set

In [31]:
# Features: the measurement columns at positions 1 through 6
X = fish_df.iloc[:, 1:7]
# Target: the species column (label 0)
y = fish_df.loc[:, 0]
print(X)

         1     2     3     4     5     6
0    242.0  23.2  25.4  30.0  38.4  13.4
1    290.0  24.0  26.3  31.2  40.0  13.8
2    340.0  23.9  26.5  31.1  39.8  15.1
3    363.0  26.3  29.0  33.5  38.0  13.3
4    430.0  26.5  29.0  34.0  36.6  15.1
..     ...   ...   ...   ...   ...   ...
80   950.0  48.3  51.7  55.1  16.2  11.2
81  1250.0  52.0  56.0  59.7  17.9  11.7
82  1600.0  56.0  60.0  64.0  15.0   9.6
83  1550.0  56.0  60.0  64.0  15.0   9.6
84  1650.0  59.0  63.4  68.0  15.9  11.0

[85 rows x 6 columns]


In [32]:
# Instantiate MinMaxScaler and use it to rescale each feature column of X
# to the [0, 1] range. The scaler is fitted on the full feature matrix X.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_samples = scaler.fit_transform(X)

In [33]:
# Create a PCA model that keeps 2 principal components: pca
pca = PCA(n_components=2)

In [34]:
# Fit the PCA instance to the scaled samples (learns the component directions)
pca.fit(scaled_samples)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [35]:
# Project the scaled samples onto the fitted components: pca_features
pca_features = pca.transform(scaled_samples)

# Print the shape of pca_features (rows = samples, columns = components)
print(pca_features.shape)

(85, 2)


We have successfully reduced the dimensionality from 6 to 2. 