In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv(r"/Users/avniguota/Desktop/nir sensor/ORANGE_DATA_FINAL.csv")

# Prepare the data
x = df.to_numpy()[:, 0:18]
y = df.to_numpy()[:, 18]
seed = 42

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

print(f'Shape of Train Data: {x_train.shape}')
print(f'Shape of Test Data: {x_test.shape}')

# Train the Linear Regression model
model_lr = LinearRegression()
model_lr.fit(x_train, y_train)

# Evaluate the model
train_score = model_lr.score(x_train, y_train)
test_score = model_lr.score(x_test, y_test)

print("Train Score:", train_score)
print("Test Score:", test_score)

# Make predictions on the test data
y_pred_lr = model_lr.predict(x_test)

# Calculate R^2 score
r2_lr = r2_score(y_test, y_pred_lr)
print("R^2 Coefficient:", r2_lr)

# Calculate Mean Absolute Percentage Error (MAPE)
absolute_percentage_errors_lr = np.abs((y_test - y_pred_lr) / y_test)
mape_lr = np.mean(absolute_percentage_errors_lr) * 100  # Convert to percentage
print(f'MAPE for LR: {mape_lr:.2f}%')

# Get the coefficients and intercept of the linear regression model
intercept = model_lr.intercept_
coefficients = model_lr.coef_

# Define feature names
feature_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'R', 'S', 'T', 'U', 'V', 'W']

# Print the equation of the line
equation = f"Glucose = {intercept:.2f} + "
for coef, feature_name in zip(coefficients, feature_names):
    equation += f"{coef:.2f} * {feature_name} + "

# Remove the trailing ' + '
equation = equation[:-3]
print("Equation of the line:", equation)

# Rank features by importance (absolute value of coefficients)
feature_importance = np.abs(coefficients)
sorted_indices = np.argsort(feature_importance)[::-1]  # Sort in descending order

print("\nFeature Ranks:")
for i in sorted_indices:
    print(f"{feature_names[i]}: {feature_importance[i]:.4f}")


Shape of Train Data: (24, 18)
Shape of Test Data: (6, 18)
Train Score: 0.9126136979377681
Test Score: -8.739938219207966
R^2 Coefficient: -8.739938219207966
MAPE for LR: 21.47%
Equation of the line: Glucose = 0.10 + 0.00 * A + 0.05 * B + 0.00 * C + -0.05 * D + 0.03 * E + -0.02 * F + 0.03 * G + -0.01 * H + 0.02 * I + -0.16 * J + 0.11 * K + -0.20 * L + 0.01 * R + -0.02 * S + 0.09 * T + -0.40 * U + -0.02 * V + 0.08 * W

Feature Ranks:
U: 0.3974
L: 0.1963
J: 0.1600
K: 0.1149
T: 0.0851
W: 0.0753
B: 0.0516
D: 0.0459
E: 0.0304
G: 0.0280
I: 0.0200
V: 0.0181
F: 0.0160
S: 0.0152
H: 0.0133
R: 0.0065
C: 0.0044
A: 0.0013
