In [114]:
# Dependencies
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Locate the cleaned Amazon dataset.
# The path can be overridden with the AMAZON_DATA_CSV environment variable so the notebook
# runs on machines other than the author's; when the variable is unset, the original
# hard-coded location is used unchanged.
amazon_data = Path(
    os.environ.get(
        'AMAZON_DATA_CSV',
        'C:/Users/cedet/OneDrive/Desktop/amazon_data_cleaned1.csv',
    )
)

# Show which file will be read
print(amazon_data)

In [None]:
# Load the CSV located by `amazon_data` into a DataFrame
amazon_df = pd.read_csv(amazon_data)

# Preview the leading three rows to sanity-check the load
display(amazon_df.head(n=3))

In [None]:
# Inspect the dtype of every column in amazon_df (shown as the cell output)
amazon_df.dtypes

In [118]:
# Label-encode the categorical/text columns into integer codes (in place on amazon_df).
# A separate LabelEncoder is fitted per column and kept in `label_encoders`, so each
# column's code -> original-label mapping remains available afterwards through
# label_encoders[col].classes_ or label_encoders[col].inverse_transform(...).
# (Re-fitting one shared encoder would keep only the mapping of the last column.)
# NOTE(review): the integer codes carry an arbitrary order that a linear model will treat
# as a magnitude — confirm this is intended for "Category Name" / "Product Description".
# NOTE: re-running this cell fits the encoders on the already-encoded integers; reload
# amazon_df first if the original labels are needed.
label_encoders = {}
for column_name in ['Category Name', 'Product Description', 'Best Seller']:
    column_encoder = LabelEncoder()
    amazon_df[column_name] = column_encoder.fit_transform(amazon_df[column_name])
    label_encoders[column_name] = column_encoder

# Keep the original name bound to an encoder fitted on 'Best Seller', as before
label_encoder = label_encoders['Best Seller']



In [None]:
# Build the feature matrix X: the six predictor columns of amazon_df as a NumPy array
feature_columns = [
    "Category Name",
    "Product Description",
    "Price",
    "Stars",
    "Product Volume",
    "Total Spend",
]
X = amazon_df[feature_columns].values
# Show the first two feature rows as a sample
X[:2]

In [120]:
# Dependent variable y: the "Best Seller" column of amazon_df (kept as a pandas Series)
y = amazon_df["Best Seller"]

In [121]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [122]:
# Create a Linear Regression model with scikit-learn's default settings
# NOTE(review): the target used in this notebook is the label-encoded "Best Seller" column;
# if that column is a categorical flag, a classifier may be more appropriate — confirm.
model = LinearRegression()

In [123]:
# Fit the model to the training data.
# fit() returns the estimator itself, so `model` stays bound to the same (now fitted) object.
model = model.fit(X_train, y_train)

In [124]:
# Predict the target for the held-out test features
predictions = model.predict(X_test)

In [None]:
# Work on a copy so amazon_df itself is left untouched
amazon_df_with_predictions = amazon_df.copy()

# Add the model's predictions for every row of X as a new column.
# The model is fitted on the "Best Seller" target ("Stars" is one of the input features),
# so the column is labelled accordingly.
# Note: X contains the training rows as well, so most of these predictions are in-sample.
amazon_df_with_predictions["Best Seller Predictions"] = model.predict(X)

# Display the first 3 rows of the dataframe
display(amazon_df_with_predictions.head(3))

In [None]:
# Evaluate the fitted model on the held-out test set.
# `score` and `r2` are both the coefficient of determination (R²) scaled to a percentage:
# LinearRegression.score() reports R², so the two values are expected to agree.
score = round(model.score(X_test, y_test)*100,2)
r2 = round(r2_score(y_test, predictions)*100,2)

# Derive RMSE from the unrounded MSE; taking the square root of the already-rounded value
# compounds the rounding error (a small MSE rounds to 0.0 and would give an RMSE of 0.0).
mse_unrounded = mean_squared_error(y_test, predictions)
mse = round(mse_unrounded, 2)
rmse = round(np.sqrt(mse_unrounded), 2)

# Print the score, r2, mse, and rmse
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")