In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (in os.walk order) and print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the abalone CSV from the Kaggle input directory into a DataFrame.
abalone = pd.read_csv("/kaggle/input/abalone-dataset/abalone.csv")

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
abalone.info()

In [None]:
# Preview the first five rows.
abalone.head(5)

In [None]:
# Count how many rows fall into each 'Sex' category.
abalone['Sex'].value_counts()

In [None]:
# Summary statistics of the raw frame (before the Age column is derived).
abalone.describe()

In [None]:
# Derive the target column Age as Rings + 1.5 and remove Rings, so the
# frame carries only the derived target from here on.
abalone = abalone.assign(Age=abalone['Rings'] + 1.5).drop(columns='Rings')

In [None]:
# Preview the frame again now that Rings has been replaced by Age.
abalone.head()

In [None]:
# Summary statistics again, now including the derived Age column.
abalone.describe()

In [None]:
import matplotlib.pyplot as plt
# Draw one 25-bin histogram per column that DataFrame.hist can plot,
# on a single 12x12-inch figure.
abalone.hist(bins = 25, figsize = (12, 12))
plt.show()

In [None]:
# Bar chart of the number of rows per 'Sex' category, ordered by label.
sex_counts = abalone['Sex'].value_counts().sort_index()
sex_counts.plot.bar(rot = 0, grid = True)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set, stratifying on 'Sex' so both
# subsets keep the category proportions of the full data. The fixed seed
# makes the split reproducible.
strat_train_set, strat_test_set = train_test_split(
    abalone,
    test_size=0.2,
    random_state=42,
    stratify=abalone['Sex'],
)

In [None]:
# Print dtypes and non-null counts of the training split.
strat_train_set.info()

In [None]:
# Share of each 'Sex' category within the test split.
strat_test_set['Sex'].value_counts().div(len(strat_test_set))

In [None]:
# Share of each 'Sex' category in the full data, for comparison with the
# proportions observed in the stratified test split.
abalone['Sex'].value_counts().div(len(abalone))

In [None]:
# From here on `abalone` refers to a copy of the training split only, so
# exploration does not touch the test rows or modify strat_train_set.
abalone = strat_train_set.copy()

In [None]:
# Pairwise correlations between the numeric columns of the training copy
# (non-numeric columns are excluded via numeric_only=True).
corr_matrix = abalone.corr(numeric_only = True)

In [None]:
# Correlation of every numeric column with Age, strongest positive first.
corr_matrix.loc[:, 'Age'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of Age against three selected size/weight columns.
attributes = ['Age', 'Shell weight', 'Diameter', 'Length']
scatter_matrix(abalone.loc[:, attributes], figsize=(12, 12))
plt.show()

In [None]:
# Separate the training split into the target (Age) and the predictors
# (every other column). Both are new objects; strat_train_set is untouched.
abalone_labels = strat_train_set['Age'].copy()
abalone = strat_train_set.drop(columns='Age')

In [None]:
# Columns that receive a square-root transform before modelling.
weights = ['Whole weight','Shucked weight', 'Viscera weight', 'Shell weight']
# Read the untransformed values from strat_train_set rather than from
# `abalone` itself: the result is the same on the first run, but re-running
# this cell no longer takes the square root of already-transformed values.
for weight in weights:
    abalone[weight] = np.sqrt(strat_train_set[weight])

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Learn the 'Sex' categories from the training predictors, then one-hot
# encode that column. The fitted encoder is kept for reuse on the test split.
sex_encoder = OneHotEncoder()
sex_encoder.fit(abalone[['Sex']])
encoded_sex = sex_encoder.transform(abalone[['Sex']])

In [None]:
# Turn the encoder output into a dense array, drop the raw 'Sex' column it
# replaces, and place the one-hot columns in front of the remaining
# predictors to form the training matrix.
encoded_sex = encoded_sex.toarray()
abalone = abalone.drop(columns=['Sex'])
abalone_encoded = np.hstack((encoded_sex, abalone))

In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the encoded training matrix.
linear_reg = LinearRegression()
linear_reg.fit(X=abalone_encoded, y=abalone_labels)

In [None]:
# Predict on the same rows the linear model was trained on.
abalone_predictions = linear_reg.predict(abalone_encoded)

In [None]:
# First five predictions, for comparison with the true labels.
abalone_predictions[:5]

In [None]:
# First five true Age labels as a plain array, to compare with predictions.
abalone_labels.head(5).to_numpy()

In [None]:
from sklearn.metrics import mean_squared_error
# Training-set RMSE of the linear model. The square root is taken explicitly
# because the `squared=False` argument of mean_squared_error was deprecated
# in scikit-learn 1.4 and removed in 1.6; this form works on every version.
lin_rmse = np.sqrt(mean_squared_error(abalone_labels, abalone_predictions))
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a decision-tree regressor on the encoded training matrix, seeded so
# the fitted tree is reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X=abalone_encoded, y=abalone_labels)

In [None]:
# Training-set predictions and RMSE of the decision tree. Note that
# `abalone_predictions` now holds the tree's output, replacing the linear
# model's predictions.
abalone_predictions = tree_reg.predict(abalone_encoded)
# Explicit square root instead of `squared=False`, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
tree_rmse = np.sqrt(mean_squared_error(abalone_labels, abalone_predictions))
tree_rmse

In [None]:
# First twenty tree predictions on the training rows.
abalone_predictions[:20]

In [None]:
# First twenty true Age labels (positional slice) to set beside the
# tree predictions.
abalone_labels.iloc[:20]

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree. The scorer returns negated
# RMSE values, so flip the sign to obtain one positive RMSE per fold.
tree_cv_scores = cross_val_score(
    tree_reg,
    abalone_encoded,
    abalone_labels,
    cv=10,
    scoring="neg_root_mean_squared_error",
)
tree_rmses = -tree_cv_scores

In [None]:
# Per-fold RMSE values of the decision tree.
tree_rmses

In [None]:
# Summary statistics of the decision tree's per-fold RMSE values.
pd.Series(tree_rmses).describe()

In [None]:
# 10-fold cross-validation of the linear model. The scorer returns negated
# RMSE values, so flip the sign to obtain one positive RMSE per fold.
lin_cv_scores = cross_val_score(
    linear_reg,
    abalone_encoded,
    abalone_labels,
    cv=10,
    scoring="neg_root_mean_squared_error",
)
lin_rmses = -lin_cv_scores
lin_rmses

In [None]:
# Summary statistics of the linear model's per-fold RMSE values.
pd.Series(lin_rmses).describe()

In [None]:
from sklearn.ensemble import RandomForestRegressor
# 10-fold cross-validation of a seeded random forest. The estimator is only
# evaluated through cross_val_score here; it is not fitted on the full
# training matrix in this cell.
forest_reg = RandomForestRegressor(random_state=42)
forest_cv_scores = cross_val_score(
    forest_reg,
    abalone_encoded,
    abalone_labels,
    cv=10,
    scoring="neg_root_mean_squared_error",
)
# Scores are negated RMSE; flip the sign to get positive per-fold RMSE.
forest_rmses = -forest_cv_scores

In [None]:
# Summary statistics of the random forest's per-fold RMSE values.
pd.Series(forest_rmses).describe()

In [None]:
# Separate the held-out split into predictors and target.
X_test = strat_test_set.drop("Age", axis=1)
y_test = strat_test_set["Age"].copy()

# The models were trained on square-rooted weight columns (the `weights`
# list defined in the training preprocessing cell). Apply the same transform
# here so the test predictors are on the scale the model was fitted on;
# without it the final RMSE is computed on mismatched inputs.
# Values are read from strat_test_set so re-running this cell is safe.
for weight_column in weights:
    X_test[weight_column] = np.sqrt(strat_test_set[weight_column])

In [None]:
# One-hot encode the test 'Sex' column with the encoder already fitted on
# the training data (transform only, no refit).
encoded_test_sex = sex_encoder.transform(X_test[['Sex']])

In [None]:
# Build the test matrix in the same layout as the training matrix: dense
# one-hot 'Sex' columns first, then the remaining predictors with the raw
# 'Sex' column removed. X_test becomes a NumPy array after this cell.
encoded_test_sex = encoded_test_sex.toarray()
X_test = np.concatenate(
    (encoded_test_sex, X_test.drop(columns=['Sex'])),
    axis=1,
)

In [None]:
# Predict the held-out test rows with the fitted linear regression model.
final_predictions = linear_reg.predict(X_test)

In [None]:
# Test-set RMSE of the linear model. The square root is taken explicitly
# because the `squared=False` argument of mean_squared_error was deprecated
# in scikit-learn 1.4 and removed in 1.6.
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))

In [None]:
# Display the test-set RMSE.
final_rmse