# Import Python Libraries and Modules:

In [2]:
#Import Python standard-library modules
import os

#Import Python libraries: Numpy and Pandas
import numpy as np
import pandas as pd

#Import libraries and modules for data Visualization
from matplotlib import pyplot
from pandas.plotting import scatter_matrix

# 1. Load Dataset

In [6]:
# Location of the abalone CSV file. If the ABALONE_CSV environment variable is
# set it overrides the default, so the notebook can run on machines other than
# the one it was written on; otherwise the original local path is used.
filename = os.environ.get(
    "ABALONE_CSV",
    "C:/Users/Aleena/Desktop/UNT/big data/FINAL PROJECT/abalone.csv",
)
# Column names are supplied explicitly via names= rather than read from the file.
col_names = [
    'Sex', 'Length', 'Diameter', 'Height',
    'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
    'Rings',
]
# index_col=False tells pandas not to use the first column ('Sex') as the index.
df = pd.read_csv(filename, names=col_names, index_col=False)


# 2. Preprocess Dataset

In [10]:
# Look for missing data: tally the NaN entries found in each column
null_counts = df.isnull().sum()
print(null_counts)

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64


In [12]:
# Derive the 'age' target column as Rings + 1.5, then remove 'Rings' so it
# cannot be used as an input feature.
# The guard keeps this cell idempotent: once 'Rings' has been dropped,
# re-running the cell is a no-op instead of raising KeyError: 'Rings'.
if 'Rings' in df.columns:
    df['age'] = df['Rings'] + 1.5
    df = df.drop(columns='Rings')

## 4. Separate Dataset into Input and Output Arrays

In [27]:
#Import scikit-learn modules for the algorithms/models: Linear regression and Decision Tree regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
#Import scikit-learn metric: mean squared error
from sklearn.metrics import mean_squared_error
#Import scikit-learn module to split the dataset into train/test sub-datasets
from sklearn.model_selection import train_test_split

#Import scikit-learn modules for K-fold cross-validation - algorithm/model evaluation and validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [19]:
# Replace the text labels in 'Sex' with integer category codes so the column
# can be used as a numeric model input.
# NOTE(review): the prediction examples later in this notebook use 0 for Female and 2 for Male.
sex_categorical = pd.Categorical(df['Sex'])
df['Sex'] = sex_categorical.codes
# Show the resulting column dtypes
print(df.dtypes)

Sex                  int8
Length            float64
Diameter          float64
Height            float64
Whole weight      float64
Shucked weight    float64
Viscera weight    float64
Shell weight      float64
age               float64
dtype: object


In [21]:
# Convert the DataFrame into a 2-D NumPy array so it can be sliced by position
array = df.to_numpy()
# Input X: every row, columns 0 through 7 (the eight feature columns)
X = array[:, :8]
# Output Y: every row, column index 8 (the last column, 'age')
Y = array[:, 8]

In [23]:
## 5. Split Input/Output Arrays into Training/Testing Datasets

In [25]:
# Hold out 33% of the records for testing; the remaining 67% are used for training
test_size = 0.33
# Records are assigned to the two subsets at random;
# a fixed seed makes that random assignment repeatable
seed = 7
# Split inputs and outputs in one call so their rows stay aligned
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

## 6 Build and Train Model

In [29]:
# Create the linear-regression estimator and fit it on the training subset
model = LinearRegression().fit(X_train, Y_train)
# Print the fitted parameters: the intercept first, then the coefficients
for fitted_param in (model.intercept_, model.coef_):
    print(fitted_param)

4.148108689309409
[  0.06240791   0.3626927   12.64808616   9.4881838    9.80737186
 -20.09472745 -10.52582156   6.14673301]


In [34]:
# Create the Decision Tree regressor (CART) with a fixed random_state
# and fit it on the training subset
model_tree = DecisionTreeRegressor(random_state=42).fit(X_train, Y_train)
# Bare last expression: the cell still displays the fitted estimator,
# as the original trailing fit() call did
model_tree

## 7 Calculate R-Squared

In [36]:
# R-squared of the linear-regression model, measured on the held-out test subset
R_squared = model.score(X=X_test, y=Y_test)
print(R_squared)

0.5223858580716234


In [38]:
# R-squared of the decision-tree model, measured on the held-out test subset
R_squared = model_tree.score(X=X_test, y=Y_test)
print(R_squared)

0.14985936465153316


## 8 Prediction

1. Sex: Male (2)
2. Length (mm): 0.44
3. Diameter (mm): 0.3
4. Height (mm): 0.15
5. Whole weight (grams): 0.406
6. Shucked weight (grams): 0.194
7. Viscera weight (grams): 0.151
8. Shell weight (grams): 0.12

In [43]:
# One sample row, in the same column order as X:
# Sex, Length, Diameter, Height, Whole weight, Shucked weight, Viscera weight, Shell weight
male_sample = [2, 0.44, 0.3, 0.15, 0.406, 0.194, 0.151, 0.12]
# predict() expects a 2-D input, hence the extra pair of brackets
model.predict([male_sample])

array([8.88178747])

In [45]:
# One sample row, in the same column order as X:
# Sex, Length, Diameter, Height, Whole weight, Shucked weight, Viscera weight, Shell weight
male_sample = [2, 0.44, 0.3, 0.15, 0.406, 0.194, 0.151, 0.12]
# predict() expects a 2-D input, hence the extra pair of brackets
model_tree.predict([male_sample])

array([8.5])

1. Sex: Female (0)
2. Length (mm): 0.45
3. Diameter (mm): 0.38
4. Height (mm): 0.11
5. Whole weight (grams): 0.768
6. Shucked weight (grams): 0.08
7. Viscera weight (grams): 0.21
8. Shell weight (grams): 0.1

In [50]:
# One sample row, in the same column order as X:
# Sex, Length, Diameter, Height, Whole weight, Shucked weight, Viscera weight, Shell weight
female_sample = [0, 0.45, 0.38, 0.11, 0.768, 0.08, 0.21, 0.1]
# predict() expects a 2-D input, hence the extra pair of brackets
model.predict([female_sample])

array([14.49002753])

In [52]:
# One sample row, in the same column order as X:
# Sex, Length, Diameter, Height, Whole weight, Shucked weight, Viscera weight, Shell weight
female_sample = [0, 0.45, 0.38, 0.11, 0.768, 0.08, 0.21, 0.1]
# predict() expects a 2-D input, hence the extra pair of brackets
model_tree.predict([female_sample])

array([10.5])

## 9 Evaluate/Validate Algorithm/model - Using K-Fold Cross-Validation

In [55]:
# K-fold cross-validation of the linear-regression model

# Number of folds (K)
num_folds = 10

# Fixed seed, so the shuffled folds are the same every time this cell runs
seed = 7

# Fold generator: shuffle the records, then split them into num_folds parts
kfold = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=seed,
)

# Evaluation metric: mean squared error, reported by scikit-learn in negated form
scoring = 'neg_mean_squared_error'

# Run the cross-validation over the whole dataset (X, Y), one score per fold
results = cross_val_score(model, X, Y, scoring=scoring, cv=kfold)

# Report the average of the per-fold scores
print(results.mean())

-5.064464344408433


In [57]:
# K-fold cross-validation of the Decision Tree model

# Number of folds (K)
num_folds = 10

# Fixed seed, so the shuffled folds are the same every time this cell runs
seed = 7

# Fold generator: shuffle the records, then split them into num_folds parts
kfold = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=seed,
)

# Evaluation metric: mean squared error, reported by scikit-learn in negated form
scoring = 'neg_mean_squared_error'

# Run the cross-validation over the whole dataset (X, Y), one score per fold
results = cross_val_score(model_tree, X, Y, scoring=scoring, cv=kfold)

# Report the average of the per-fold scores
print(results.mean())

-8.906681353481806
