<a href="https://colab.research.google.com/github/lisabroadhead/data_science_machine-learning/blob/main/Abalone_Preprocessing_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abalone Preprocessing Exercise (Core)
- Lisa Broadhead
- June 22, 2022

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

In [8]:
file = '/content/drive/MyDrive/Colab Notebooks/coding_dojo/Machine Learning/files/abalone.data'

df = pd.read_csv(file)
df.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


### Renaming/Naming Columns

In [9]:
df.columns =['Sex',' Length', 'Diameter', 'Height', 'Whole_Weight', 'Shucked_Weight', 'Viscera_Weight', 'Shell_Weight', 'Rings']
# Reference: https://www.geeksforgeeks.org/add-column-names-to-dataframe-in-pandas/

In [10]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4176 entries, 0 to 4175
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4176 non-null   object 
 1    Length         4176 non-null   float64
 2   Diameter        4176 non-null   float64
 3   Height          4176 non-null   float64
 4   Whole_Weight    4176 non-null   float64
 5   Shucked_Weight  4176 non-null   float64
 6   Viscera_Weight  4176 non-null   float64
 7   Shell_Weight    4176 non-null   float64
 8   Rings           4176 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


### Separate your data into the features matrix (X) and target vector (y).

In [11]:
y = df['Rings']
X = df.drop(columns='Rings')

### Train/test split the data. Please use the random number 42 for consistency.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

### Use column transformers to transform the appropriate columns in the appropriate ways.

In [14]:
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [15]:
scalar = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')

In [20]:
num_tuple = (scalar, num_selector)
cat_tuple = (ohe, cat_selector)

In [24]:
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')

In [25]:
col_transformer.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f10b6e6a410>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f10b6e6a890>)])

In [26]:
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

In [27]:
X_train_df = pd.DataFrame(X_train_processed)
X_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.546422,-1.55617,-1.053558,-1.258947,-1.260633,-1.337852,-1.212341,0.0,0.0,1.0
1,0.795725,0.521917,0.706869,0.605245,0.789463,0.749584,0.409117,0.0,0.0,1.0
2,0.252013,0.319177,0.354783,0.37885,0.602065,0.040129,0.172355,1.0,0.0,0.0
3,1.172142,0.927397,0.824231,1.234461,1.277152,1.490874,0.940037,1.0,0.0,0.0
4,-1.462774,-1.4548,-1.17092,-1.233452,-1.177094,-1.205966,-1.212341,0.0,0.0,1.0
