In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### A linear regression example

We're going to look at the relationship between height and weight in 18-year-old humans using [data](http://socr.ucla.edu/docs/resources/SOCR_Data/SOCR_Data_Dinov_020108_HeightsWeights.html) from UCLA's Statistics Online Computational Resource.



In [None]:
# Read the height/weight records from the CSV file in the working directory
hwdata = pd.read_csv(filepath_or_buffer='SOCR-HeightWeight.csv')

# report the table's dimensions as a (rows, columns) tuple
print(hwdata.shape)

# preview the first 20 records; as the last expression this is the cell's output
hwdata.head(n=20)

In [None]:
# Conventional ML naming: X is the independent variable (the input) and
# y is the dependent variable (the value we want to predict).
# In a more complicated example each entry of X would be a vector of features;
# here each entry is just one number, the height.
X, y = hwdata['height'], hwdata['weight']

In [None]:
# to get a feel for how linear the weight vs height data is, let's look at
# the first 100 points in the data

fig, ax = plt.subplots()
ax.plot(X[:100], y[:100], 'o')

# label the figure so it can be read on its own when the notebook is skimmed
ax.set_xlabel("Height (inches)")
ax.set_ylabel("Weight")
ax.set_title("Weight vs. height (first 100 records)")

# render explicitly so the cell does not echo a Line2D/Text repr as its output
plt.show()

In [None]:
# The next statement is the entire ML model-building portion of the example:
# create the estimator and fit it in one chained call (fit() returns the estimator).
# np.c_[X] reshapes the 1-D height Series into a single-column 2-D array,
# the (n_samples, n_features) layout that fit() expects.

lin_reg = LinearRegression().fit(np.c_[X], y)

def guess_your_weight(height_in_inches):
    """Predict a weight from a height using the line fitted in ``lin_reg``.

    Evaluates ``slope * height + intercept`` by hand, taking the slope from
    ``lin_reg.coef_[0]`` and the intercept from ``lin_reg.intercept_``.
    ``lin_reg`` is read from the notebook's global namespace, so the model
    must already be fitted when this function is called.

    Parameters
    ----------
    height_in_inches : number
        The height to predict a weight for.

    Returns
    -------
    The predicted weight, in whatever units the model was fitted on.
    """
    slope = lin_reg.coef_[0]
    offset = lin_reg.intercept_
    return slope * height_in_inches + offset

In [None]:
# Endpoints of the fitted line. Note that y_min / y_max are the model's
# *predicted* weights at the shortest and tallest heights, not the smallest
# and largest weights in the data.
X_min = X.min()
X_max = X.max()
y_min = guess_your_weight(X_min)
y_max = guess_your_weight(X_max)

fig, ax = plt.subplots()
ax.plot(X, y, 'o', label="observations")
# a straight line is fully determined by its two endpoints
ax.plot([X_min, X_max], [y_min, y_max], color='r', label="linear fit")

# label the figure so it can be read on its own when the notebook is skimmed
ax.set_xlabel("Height (inches)")
ax.set_ylabel("Weight")
ax.set_title("Weight vs. height with fitted regression line")
ax.legend()

# render explicitly so the cell does not echo a matplotlib object repr as its output
plt.show()

#### Scoring the model
While the line above is the best linear fit (in the least-squares sense) we can build for this data, it may not be a great model. We can use the $R^2$ (the squared correlation, also called the coefficient of determination) to see how well our linear model predicts an individual's weight given their height.

Note that a score near 1 means we have a model that fits our data very well and a score near 0 means we don't.

In [None]:
# R^2 of the height-only model, evaluated on the same data it was fitted to
height_r2 = lin_reg.score(np.c_[X], y)
print("Our model's r^2 is ", height_r2, " so, it's not great...")

### A multiple linear model
We're going to look at weight prediction again, but this time in fish! We're using the Fish Market data set from Kaggle (https://www.kaggle.com/datasets/aungpyaeap/fish-market)

In [None]:
# Read the fish-market measurements from the CSV file in the working directory
fish_data = pd.read_csv(filepath_or_buffer='Fish.csv')

# report the table's dimensions as a (rows, columns) tuple
print(fish_data.shape)

# preview the first 20 records; as the last expression this is the cell's output
fish_data.head(n=20)

To keep the model simple, we're going to ignore the species column. If we wanted to improve the model, we'd encode that information numerically and include it in the analysis.

In case you're wondering about the meaning of the different measurements, here are the slightly more detailed descriptions from the site.

* Length1 = vertical length in cm

* Length2 = diagonal length in cm

* Length3 = cross length in cm

* Height = height in cm

* Width = diagonal width in cm

That didn't really clear it up for me either...

In [None]:
# NOTE: X and y are rebound here; they no longer refer to the height/weight
# data from the first example once this cell has run.

# Independent variables: the columns at positions 2 through 6 of the table
# (presumably the five length/height/width measurements -- verify against the CSV header)
X = fish_data.iloc[:, 2:7]

# Dependent (target) variable
y = fish_data['Weight']

# quick look at the first five rows of the selected features
X.head(n=5)

In [None]:
# Create the estimator and fit it on all selected measurements in one chained
# call (fit() returns the fitted estimator itself)
fish_model = LinearRegression().fit(X, y)

# R^2 computed on the same data the model was fitted to; reported in a later cell
model_score = fish_model.score(X, y)

In [None]:
# Pull the fitted parameters out of the model.
intercept = fish_model.intercept_
# A list rather than a one-shot map() iterator, so `weights` is still
# inspectable after the equation string has been built.
weights = [str(coef) for coef in fish_model.coef_]

# "coef * column" terms joined by ") + (" -- the outermost parentheses are
# added when printing, so every term ends up wrapped as "(coef * column)".
weighted_values = ") + (".join(
    f"{weight} * {column}" for weight, column in zip(weights, X.columns)
)

print("Our spiffy new linear relationship:")
# One f-string keeps the spacing uniform; passing the pieces to print() as
# separate arguments inserted stray blanks ("( a * L1) + ... * Width ) +  c").
print(f"Weight = ({weighted_values}) + {intercept}")

In [None]:
# Verdict on the fish model: an R^2 above 0.5 counts as acceptable for this
# quick demo; anything else (including a score that is not a number) does not.

if not (model_score > 0.5):
    print("Yeah,", model_score, "is a pretty bad R^2, even for a quick example")
else:
    print("At", model_score, "the R^2 isn't totally awful.")

Regardless of the outcome above, you can see a much more in-depth analysis of the data and the creation of a multiple linear model at https://www.kaggle.com/code/akdagmelih/multiplelinear-regression-fish-weight-estimation