In [35]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
os.listdir('/kaggle/input/fish-species-sampling-weight-and-height-data')
data = pd.read_csv('/kaggle/input/fish-species-sampling-weight-and-height-data/fish_data.csv')
# Understanding Data 
data.shape # Seeing what is the shape of the data in this case (4 columns 4080 rows)
data.info() # Tells the type of data in every column (Int in three columns and object type in Species)
data.describe() # Statistics like mean , standard deviation ,25% quartile  , 50% quartile , 75%quartile min max of the data.

# Checking for Missing Data 
data.isnull().sum() # no missing values 

# Analysis 
# Looking how many Species in total are there
data['species'].unique() # 9 Different Species 
# Plotting bar graph to figure out the count of each species 
#data['species'].value_counts().plot(kind = 'bar')
species_count = data['species'].value_counts()
#print(species_count.head(1))
#print(species_count.head(9))# Highest Members - 480 , Lowest Members - 415
# Plotting a scatter plot to find relationship between height and weight of fishes
#data.plot.scatter( x = 'length' , y = 'weight')
# Extracting equations for every species 
species_groups = data.groupby('species')
for species , group in species_groups :
    X = group['length'].values.reshape(-1 , 1 )
    y = group['weight'].values
    model = LinearRegression() 
    model.fit(X ,y)
    # Extract Slope and intercept for equation 
    slope = model.coef_[0]
    intercept = model.intercept_
    print(f"Species: {species} -> Weight = {slope:.2f} * Length + {intercept:.2f}")
    # Equations for Weight for every Species 
    #Species: Anabas testudineus -> Weight = 0.03 * Length + 2.99
#Species: Coilia dussumieri -> Weight = 0.02 * Length + 2.30
#Species: Otolithoides biauritus -> Weight = 0.04 * Length + 2.55
#Species: Otolithoides pama -> Weight = 0.02 * Length + 3.43
#Species: Pethia conchonius -> Weight = 0.02 * Length + 4.43
#Species: Polynemus paradiseus -> Weight = 0.00 * Length + 3.93
#Species: Puntius lateristriga -> Weight = 0.02 * Length + 2.43
#Species: Setipinna taty -> Weight = 0.04 * Length + 2.46
#Species: Sillaginopsis panijus -> Weight = 0.03 * Length + 5.13
# Figuring out Tallest and Heaviest Species 
species_groups.describe()
# Lengthiest Species - Sillaginopsis panijus - 31.06m
# Heaviest Species - Sillanginopsis panijus - 6.180 Kg 
# Highest Weight - Length Ratio -  Pethia conchonius - 0.4857
# Categorizing species columns
data['species'] = data['species'].astype('category')
# Encoding Species column
le = LabelEncoder()
data['species'] = le.fit_transform(data['species'])
# Building a Decision Trees Model to Predict the species based on weight height and weight height ratio 
X_features = data[['length' , 'weight' , 'w_l_ratio']] # Features for Decision Tress Model
Y_target = data['species'] # Target , what we want to predict

# Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_target, test_size=0.2, random_state=42)

# Initializig Model
clf = DecisionTreeClassifier(random_state = 42)
# Fitting the model 
clf.fit(X_train , Y_train)
# Predicting Values 
Y_pred = clf.predict(X_test)
# Checking Accuracy 
Accuracy = accuracy_score(Y_test , Y_pred)
print(f" Accuracy Score is : {Accuracy:.2f}")
# Performing GridSeacrchCV to find the best hyperparameters 
# Making the param grid to perform GridSearchCV
param_grid = {
    'criterion' : ['gini' , 'entropy'],
    'max_depth' : [None , 10 , 20 , 30],
    'min_samples_split' : [2,10,20],
    'min_samples_leaf' : [1,5,10],
}
# Initialzing GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)
print("Best Parameters found: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)
best_clf = grid_search.best_estimator_
Y_pred_2 = best_clf.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred_2)
print(f"Test Accuracy with Best Parameters: {accuracy:.2f}")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fish-species-sampling-weight-and-height-data/fish_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4080 entries, 0 to 4079
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   species    4080 non-null   object 
 1   length     4080 non-null   float64
 2   weight     4080 non-null   float64
 3   w_l_ratio  4080 non-null   float64
dtypes: float64(3), object(1)
memory usage: 127.6+ KB
Species: Anabas testudineus -> Weight = 0.03 * Length + 2.99
Species: Coilia dussumieri -> Weight = 0.02 * Length + 2.30
Species: Otolithoides biauritus -> Weight = 0.04 * Length + 2.55
Species: Otolithoides pama -> Weight = 0.02 * Length + 3.43
Species: Pethia conchonius -> Weight = 0.02 * Length + 4.43
Species: Polynemus paradiseus -> Weight = 0.00 * Length + 3.93
Species: Puntius lateristriga -> Weight = 0.02 * Length + 2.43
Species: Setipinna taty -> Weight = 0.04 * Length + 2.46
Species: Sillaginopsis panijus -> Weig