# Task for Today
***
## Abalone Attribute Prediction

Given data about abalone, we'll try to multiple attributes of a given organism.

We will use linear regression and logistic regression models to make our predictions.

## Set Up

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
data = pd.read_csv('../input/abalone-uci/abalone_original.csv')

In [4]:
data

Unnamed: 0,sex,length,diameter,height,whole-weight,shucked-weight,viscera-weight,shell-weight,rings
0,M,91,73,19,102.8,44.9,20.2,30.0,15
1,M,70,53,18,45.1,19.9,9.7,14.0,7
2,F,106,84,27,135.4,51.3,28.3,42.0,9
3,M,88,73,25,103.2,43.1,22.8,31.0,10
4,I,66,51,16,41.0,17.9,7.9,11.0,7
...,...,...,...,...,...,...,...,...,...
4172,F,113,90,33,177.4,74.0,47.8,49.8,11
4173,M,118,88,27,193.2,87.8,42.9,52.1,10
4174,M,120,95,41,235.2,105.1,57.5,61.6,9
4175,F,125,97,30,218.9,106.2,52.2,59.2,10


In [5]:
# No missing values
# 1 categorical (object) columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   int64  
 2   diameter        4177 non-null   int64  
 3   height          4177 non-null   int64  
 4   whole-weight    4177 non-null   float64
 5   shucked-weight  4177 non-null   float64
 6   viscera-weight  4177 non-null   float64
 7   shell-weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 293.8+ KB


## Preprocessing + Training

In [30]:
#Before Scaling
X_train.mean(), X_train.var()

(diameter           81.910708
 height             28.029422
 whole-weight      166.967670
 shucked-weight     72.400616
 viscera-weight     36.491687
 shell-weight       48.136880
 rings               9.961683
 F                   0.322614
 I                   0.317140
 M                   0.360246
 dtype: float64,
 diameter           384.842469
 height              72.325623
 whole-weight      9520.505891
 shucked-weight    1957.362032
 viscera-weight     478.880208
 shell-weight       766.536709
 rings               10.478340
 F                    0.218609
 I                    0.216636
 M                    0.230548
 dtype: float64)

In [36]:
# input data -> target label -> task type (Regression or Classification)
def preprocess_and_train(df, target, task):
    df = df.copy()
    
    #onehot encode SEX column ONLY IF being used as input data
    if target != 'sex':
        dummies = pd.get_dummies(df.sex)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop('sex', axis=1)
    
    # Make Target and Input data
    y = df[target].copy()
    X = df.drop(target, axis=1).copy()
    
    # Train Test and Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    # Scale X
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = pd.DataFrame(sc.transform(X_train), columns = X_train.columns)   
    X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns)
    
    #Define my Model
    if task == 'Regression':
        model = LinearRegression()
    elif task == 'Classification':
        model = LogisticRegression()
    
    #Fit the Model
    model.fit(X_train, y_train)
    
    #Return the Test Results
    return model.score(X_test, y_test)

In [32]:
X_train, X_test, y_train, y_test = preprocess_and_train(data, target='length', task="Regression")

In [33]:
#After Scaling
X_train.mean(), X_train.var()

(diameter          5.484646e-17
 height           -1.767127e-16
 whole-weight     -9.290474e-17
 shucked-weight    1.207838e-17
 viscera-weight    2.411877e-17
 shell-weight     -6.555747e-17
 rings            -9.677359e-17
 F                 9.024598e-17
 I                -1.485108e-16
 M                -1.783649e-16
 dtype: float64,
 diameter          1.000342
 height            1.000342
 whole-weight      1.000342
 shucked-weight    1.000342
 viscera-weight    1.000342
 shell-weight      1.000342
 rings             1.000342
 F                 1.000342
 I                 1.000342
 M                 1.000342
 dtype: float64)

In [28]:
X_train

Unnamed: 0,diameter,height,whole-weight,shucked-weight,viscera-weight,shell-weight,rings,F,I,M
112,64,16,66.5,29.7,12.7,21.0,9,0,1,0
2979,71,31,196.2,93.0,40.3,50.1,8,0,0,1
1715,97,30,244.3,113.9,54.7,66.0,9,0,0,1
3548,74,24,118.8,56.0,22.0,27.5,7,1,0,0
3911,54,20,43.2,16.6,7.4,15.0,10,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1593,76,27,123.0,52.2,31.8,35.0,8,0,1,0
4060,81,26,131.6,65.4,28.9,34.8,8,0,1,0
1346,90,36,159.9,67.2,37.1,47.4,8,0,0,1
3454,92,33,213.0,99.7,42.9,56.3,8,1,0,0


## Predicting Characteristics of Abalone

In [39]:
pd.set_option('max_columns', None)
data

Unnamed: 0,sex,length,diameter,height,whole-weight,shucked-weight,viscera-weight,shell-weight,rings
0,M,91,73,19,102.8,44.9,20.2,30.0,15
1,M,70,53,18,45.1,19.9,9.7,14.0,7
2,F,106,84,27,135.4,51.3,28.3,42.0,9
3,M,88,73,25,103.2,43.1,22.8,31.0,10
4,I,66,51,16,41.0,17.9,7.9,11.0,7
...,...,...,...,...,...,...,...,...,...
4172,F,113,90,33,177.4,74.0,47.8,49.8,11
4173,M,118,88,27,193.2,87.8,42.9,52.1,10
4174,M,120,95,41,235.2,105.1,57.5,61.6,9
4175,F,125,97,30,218.9,106.2,52.2,59.2,10


In [43]:
results = preprocess_and_train(data, 'sex', 'Classification')

print("sex Classification Test Accuracy: {:.2f}".format(results))

sex Classification Test Accuracy: 0.56


In [44]:
results = preprocess_and_train(data, 'rings', 'Classification')

print("rings Classification Test Accuracy: {:.2f}".format(results))

rings Classification Test Accuracy: 0.28


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [45]:
for header in data.columns:
    if header != 'sex':
        results = preprocess_and_train(data, header, 'Regression')

        print("{} Regression R^2 Value: {:.2f}".format(header, results))

length Regression R^2 Value: 0.98
diameter Regression R^2 Value: 0.98
height Regression R^2 Value: 0.83
whole-weight Regression R^2 Value: 0.99
shucked-weight Regression R^2 Value: 0.97
viscera-weight Regression R^2 Value: 0.94
shell-weight Regression R^2 Value: 0.96
rings Regression R^2 Value: 0.53
