In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from google.colab import drive

# Fit a degree-3 polynomial regression that predicts water temperature
# (T_degC) from salinity, depth and dissolved oxygen, then report R^2 and
# MSE on a 20% hold-out split and print predicted vs. actual values.
#
# NOTE(review): the path below lives under /content/drive, so Google Drive
# presumably has to be mounted (e.g. in an earlier cell) before this runs.

# Column names are defined once so the cleaning step and the feature
# selection cannot drift apart.
feature_columns = ['Salnty', 'Depthm', 'O2ml_L']
target_column = 'T_degC'

# Load the dataset.
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk; chunked inference is what triggers a
# DtypeWarning on columns with mixed types. The trade-off is higher peak
# memory while parsing.
file_path = '/content/drive/MyDrive/Colab Notebooks/bottle.csv'
dataset = pd.read_csv(file_path, low_memory=False)

# Keep only rows where the target and every feature are present.
dataset_clean = dataset.dropna(subset=[target_column] + feature_columns)

# Independent variables (X) and dependent variable (y) as NumPy arrays.
X = dataset_clean[feature_columns].values
y = dataset_clean[target_column].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Expand the features into polynomial terms. The transformer is fitted on
# the training rows only and then applied to the test rows.
degree = 3  # Degree of polynomial features
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

# Ordinary least squares on the expanded features.
model = LinearRegression()
model.fit(X_poly_train, y_train)

# Predict the water temperature for the held-out rows.
y_pred = model.predict(X_poly_test)

# R^2 on the test split.
r_squared = model.score(X_poly_test, y_test)
print(f'R-squared: {r_squared}')

# Mean squared error on the test split.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Show predictions next to the actual values, one row per test sample:
# column 0 is the prediction, column 1 the observed temperature.
np.set_printoptions(precision=2)
comparison = np.column_stack((y_pred, y_test))
print("Predictions vs Actual Values:")
print(comparison)

  dataset = pd.read_csv(file_path)


R-squared: 0.8742038663367315
Mean Squared Error: 2.2487147701558614
Predictions vs Actual Values:
[[13.77 16.12]
 [ 6.23  6.26]
 [ 8.26  7.88]
 ...
 [ 8.64  9.34]
 [ 5.63  5.46]
 [10.5  10.27]]
