# Abalone Case Study

### Importing Required Libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

In [None]:
# Load the abalone dataset from a CSV file in the working directory and display it.
df = pd.read_csv("abalone.csv")
df

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# (number of rows, number of columns).
df.shape

There are 4177 rows and 9 columns in this dataset

In [None]:
# List the column names.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Check the dtypes and non-null counts of every column in one summary.
df.info()  

All columns are of float type except 'Sex' and 'Rings', and the data contains no null values.

# Exploratory Data Analysis

In [None]:
# Count the missing values in each column.
df.isnull().sum()

There are no null data present

## Data Preprocessing

In [None]:
# Summary statistics of the numeric columns.
df.describe()

The minimum height is zero, which is not physically plausible, so let's take a closer look at the Height feature.

In [None]:
# Summary statistics for the Height column only.
df['Height'].describe()

In [None]:
# Show the rows whose recorded Height is exactly zero.
df[df.Height==0]

The rows with zero height all belong to the infant sex category. We treat these zeros as missing values and fill them with the average height of that category.

In [None]:
# Average Height within each Sex category (one row per category).
means = df.groupby('Sex')[['Height']].mean()
means

The mean height of the infant category is 0.107996, so we will fill the zero (missing) heights with 0.107996.

In [None]:
# Overwrite every zero Height with the infant-category mean height (0.107996).
df['Height'] = df['Height'].mask(df['Height'].eq(0), 0.107996)

In [None]:
# Check the Height summary again: the minimum should no longer be zero.
df['Height'].describe()

In [None]:
# 'Sex' is categorical, so it is converted to integer codes with LabelEncoder.
# LabelEncoder numbers the classes in sorted order of the original labels, so the
# code -> label mapping is printed explicitly instead of being assumed.
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
lencode = LabelEncoder()
df['Sex'] = lencode.fit_transform(df['Sex'])
print(dict(enumerate(lencode.classes_)))
df

In [None]:
# Re-check the column dtypes after encoding 'Sex'.
df.dtypes

Now every column is numeric: 'Sex' has been encoded as integers, while the other columns keep their original numeric types.

In [None]:
# Summary statistics again, now including the encoded 'Sex' column.
df.describe()

## Univariate Analysis

In [None]:
# Distribution of the target feature (Rings).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# sns.histplot(df["Rings"], kde=True) is the suggested replacement — confirm
# against the installed seaborn version before switching.
sns.distplot(df["Rings"])

It seems that most abalones have between 7 and 15 rings. Since age = Rings + 1.5, most abalones are roughly 8.5 to 16.5 years old.

In [None]:
# 'Sex' is the only categorical variable; plot how many rows fall in each category.
# The column is passed by keyword (x=..., data=...), matching the boxplot call
# used later in this notebook and avoiding reliance on positional-argument handling.
sns.countplot(x='Sex', data=df)

In [None]:
# Number of rows per encoded 'Sex' value.
# LabelEncoder assigns codes in sorted order of the original labels, so
# (assuming the raw labels are F / I / M — TODO confirm via lencode.classes_):
# 0 = Female
# 1 = Infant
# 2 = Male
print(df["Sex"].value_counts())

The data is almost equally distributed across the three 'Sex' categories.

## Let's Check the relation between "Sex" and "Rings"

In [None]:
# Bar chart of the mean number of Rings for each encoded 'Sex' value.
df.groupby(["Sex"])["Rings"].mean().plot(kind="bar")   # codes follow sorted labels: 0 = Female  1 = Infant  2 = Male (assuming F/I/M labels — confirm via lencode.classes_)

In [None]:
# Box plot of Rings for each encoded 'Sex' value (seaborn is already imported at
# the top of the notebook, so it is not re-imported here).
# Codes follow the sorted original labels: 0 = Female  1 = Infant  2 = Male
# (assuming F/I/M labels — confirm via lencode.classes_).
sns.boxplot(x='Sex', y='Rings', data=df)

## Correlation

In [None]:
# Pairwise correlation between all columns.
df.corr()

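'Sex' is negatively correlated with all the other variables. Note that 'Sex' is a label-encoded nominal category, so the sign and size of these correlations depend on the arbitrary integer codes and should be interpreted with caution.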

In [None]:
# Annotated heatmap of the correlation matrix.
sns.heatmap(df.corr(), cmap = "Blues", annot = True)

In [None]:
# Histograms of every column except the first and the last one
# (presumably 'Sex' and 'Rings' — verify against df.columns).
# The trailing semicolon suppresses the textual repr of the returned axes.
df.iloc[:,1:-1].hist(figsize=(10,8));

Let's check the data distribution among all columns

In [None]:
# One density plot per column. The frame has 9 columns, so a 3x3 grid is enough;
# the legend labels each panel with its column name and the tick labels use a
# readable font size.
df.plot(kind='density', subplots=True, layout=(3, 3), sharex=False, legend=True, fontsize=8, figsize=(18, 12))
plt.show()

Splitting the target and independent variables before removing skewness

In [None]:
# Separate the target column (Rings) from the predictor columns and show the predictors.
y = df['Rings']
x = df.drop(columns=['Rings'])
x

In [None]:
# Display the target series.
y

In [None]:
# Check the skewness of each feature, most positively skewed first.
x.skew().sort_values(ascending=False)

There appears to be skewness in Height, so let's reduce it using power_transform (applied to all feature columns).

In [None]:
# Apply scikit-learn's power_transform to every feature column to reduce skewness.
# NOTE(review): called with default arguments — presumably Yeo-Johnson with
# standardisation; confirm against the installed scikit-learn version.
from sklearn.preprocessing import power_transform
x_new=power_transform(x)

In [None]:
# Inspect the type returned by power_transform.
type(x_new)

In [None]:
# Column names of the feature frame, reused below to label the transformed data.
x.columns

In [None]:
# Wrap the transformed values back into a DataFrame with the original column names.
x=pd.DataFrame(x_new, columns=x.columns)
x

In [None]:
# Check the skewness again to see whether the transform reduced it.
x.skew().sort_values(ascending=False)

In [None]:
# Keep the per-column skewness in a variable and display it.
sk=x.skew()
sk

In [None]:
# True only if every feature's absolute skewness is below 0.25.
# (Filtering the values first and then calling .all() would merely test that the
# remaining skew values are non-zero, not that all columns are within the bound.)
(x.skew().abs() < 0.25).all()

# Let's Check Outliers

In [None]:
# Box plot of every column on a 2x5 grid to spot outliers visually.
df.plot(kind='box',subplots=True,layout=(2,5),figsize=(10,10))

Every column except 'Sex' contains outliers.

# Removing Outliers

In [None]:
# Absolute z-score of every value in df. All columns are included, i.e. also the
# encoded 'Sex' column and the target 'Rings'. Used below to flag outliers.
from scipy.stats import zscore

z=np.abs(zscore(df))
z

In [None]:
# Shape of the z-score table.
z.shape

In [None]:
# Positions (row indices, column indices) of the entries whose absolute z-score
# exceeds the outlier threshold.
threshold = 3
print(np.where(z > threshold))

In [None]:
# Keep only the rows in which every column's absolute z-score is below the
# threshold defined in the previous cell.
df_new = df[(z < threshold).all(axis=1)]
df_new

In [None]:
# Compare the frame's shape before and after outlier removal.
print(df.shape, df_new.shape, sep="\n")

### Separating columns into Features and Target

In [None]:
# Build the modelling inputs from the outlier-free frame.
# The skewness correction computed earlier was applied to the unfiltered data and
# would be discarded by rebuilding x here, so the power transform is re-applied
# to the filtered features. Columns are selected by name (as in the earlier
# feature/target split) rather than by position.
features = df_new.drop('Rings', axis=1)
x = pd.DataFrame(power_transform(features), columns=features.columns, index=features.index)
y = df_new['Rings']

In [None]:
# Shape of the feature matrix.
x.shape

In [None]:
# Shape of the target vector.
y.shape

In [None]:
# Standardise the features with StandardScaler, fitted on all rows of x.
# NOTE(review): the scaler is fitted before the train/test split below, so the
# test rows influence the scaling statistics — consider fitting on the training
# split only.
from sklearn.preprocessing import StandardScaler
scale=StandardScaler()
x=scale.fit_transform(x)
x

In [None]:
# Hold out 20% of the rows as a test set, with a fixed seed for reproducibility.
# NOTE(review): the same split is repeated in the next cell; one of the two
# cells is redundant.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=150)

In [None]:
# Split the data into training (80%) and test (20%) sets with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=150)
# Report the shape of each part of the split.
for label, part in (("x_train", x_train), ("x_test", x_test), ("y_train", y_train), ("y_test", y_test)):
    print("Shape of " + label + " :", part.shape)

In [None]:
# Candidate models, all fitted on the same split and scored with regression metrics.
# NOTE(review): most of these are classifiers (each ring count is treated as a
# separate class) while SVR is a regressor — confirm this mix is intentional.
# The tree-based models are seeded so the reported scores are reproducible.
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=150),
    SVR(),
    RandomForestClassifier(random_state=150),
    GaussianNB(),
]

# Fit each model and report its error on the held-out test set.
for model in models:
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    MSE = mean_squared_error(y_test, pred)

    print("Mean_squared_error", model, MSE)
    print("RMSE", model, np.sqrt(MSE))
    print("r_score", model, r2_score(y_test, pred) * 100)
    print("\n")

In [None]:
# 3-fold cross-validation of every candidate model on the full feature matrix;
# prints the per-fold scores, their mean and their standard deviation.
from sklearn.model_selection import cross_val_score
for model in models:
    scores = cross_val_score(model, x, y, cv=3)
    print('CV score for', model)
    for value in (scores, scores.mean(), scores.std(), '\n'):
        print(value)

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the RBF-kernel SVR: regularisation strength C and kernel width gamma.
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# refit=True retrains the best configuration on the whole training split;
# verbose=3 logs the progress of every fit.
grid = GridSearchCV(SVR(), param_grid, refit=True, verbose=3)

# Run the search on the training data.
grid.fit(x_train, y_train)

In [None]:
# Report the best cross-validated score, parameters and estimator found by the search.
for label, value in (
    ("score", grid.best_score_),
    ("best parameter", grid.best_params_),
    ("best estimator", grid.best_estimator_),
):
    print(label, value)
# Predict on the test set with the best estimator.
grid_pred = grid.best_estimator_.predict(x_test)

In [None]:
# Predict the number of rings for the test set through the fitted grid-search
# object and print the predictions followed by the actual values.
pred=grid.predict(x_test)
print('Predicted number of Rings',pred)
print('Actual Number of Rings',y_test)

In [None]:
# Side-by-side table of actual vs. predicted ring counts for the test set.
a = np.asarray(y_test)
Predicted = np.asarray(pred)
df_com = pd.DataFrame({'Original': a, 'Predicted': Predicted})
df_com

## Model Saving

In [None]:
# Persist the tuned model to disk.
# The fitted best estimator from the grid search is saved (not the SVR class
# itself, which carries no learned parameters), and the file is opened in a
# context manager so the handle is always closed.
import pickle
filename = 'Predicted_Rings_data.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(grid.best_estimator_, model_file)