#### Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import numpy as np

#### Load Data

In [None]:
# Load the car price data set from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='CarPrice_Assignment.csv')

#### Explore Data 

In [None]:
# Report the dimensions of the data frame
row_count, column_count = df.shape
print(f" The data set has {row_count} rows and {column_count} columns")

In [None]:
# Preview the first few rows of the data set to sanity-check the load
df.head()

In [None]:
# Show summary statistics for the data set
df.describe()

In [None]:
# Check the data type and non-null count of every column
df.info()

In [None]:
# Remember every column name present at this point; the plotting cells iterate over it
features = df.columns.tolist()

In [None]:
# Scatter-plot each listed column against price to look for linear relationships
for column_name in features:
    df.plot.scatter(x=column_name, y='price')

#### Check target variable distribution

In [None]:
# Histogram of the target to inspect how prices are distributed
df['price'].plot.hist()

#### Redistribute target data

In [None]:
from scipy.stats import boxcox

# Box-Cox needs strictly positive input, so every value that is not > 0
# (missing values included, since the comparison is False for them) becomes 1
df['price'] = df['price'].where(df['price'] > 0, 1)

# Keep only the transformed values; the fitted lambda is discarded
transformed_price, _ = boxcox(df['price'])
df['boxcox_price'] = transformed_price
df['boxcox_price'].plot.hist()



In [None]:
# Repeat the scatter plots, this time against the Box-Cox transformed price
for column_name in features:
    df.plot.scatter(x=column_name, y='boxcox_price')

#### Clean the Data

In [None]:
# Drop every row that has a missing value in any column, then renumber the
# rows so the index stays contiguous (0..n-1); later cells look rows up by
# label and would raise KeyError on a gap left by a dropped row
df = df.dropna().reset_index(drop=True)

In [None]:
# View the (rows, columns) shape of the data set after dropping incomplete rows
df.shape

In [None]:
# Encode categorical features as integer codes

# A single encoder instance is reused; fit_transform refits it for each column
labelencoder = LabelEncoder()

# Columns to label encode
cat_columns = [
    'symboling',
    'CarName',
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
]

# Compute the codes for every column first, then write them back over the originals
encoded_columns = {name: labelencoder.fit_transform(df[name]) for name in cat_columns}
for name, codes in encoded_columns.items():
    df[name] = codes



In [None]:
# Preview the first rows to confirm the categorical columns now hold integer codes
df.head()

In [None]:
# Re-check column data types after label encoding
df.info()

In [None]:
# Re-draw the scatter plots against the transformed price now that the
# categorical columns have been label encoded
for column_name in features:
    df.plot.scatter(x=column_name, y='boxcox_price')

In [None]:
# Columns used as model inputs from here on (replaces the earlier all-columns list)
features = ['carwidth', 'curbweight', 'enginesize', 'horsepower', 'citympg', 'highwaympg']

In [None]:
# Standardize the model input columns
scaler = StandardScaler()

# Learn the scaling parameters from the feature columns
# NOTE(review): the scaler is fitted on all rows, before the train/test split below
scaler.fit(df[features])

# Overwrite the feature columns with their standardized values
df[features] = scaler.transform(df[features])

# Show the first rows so the scaled values can be checked by eye
df.head()

In [None]:
# Re-check column data types after scaling the feature columns
df.info()

#### Prepare the data

In [None]:
# Target vector: the price column
# NOTE(review): this is the raw price, not the boxcox_price column created earlier
y = df.loc[:, 'price']
y

In [None]:
# Feature matrix: the selected model input columns
X = df.loc[:, features]
X

In [None]:
# Split features and target into train and test parts; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
)

#### Train the model

In [None]:
# Fit a linear regression on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Show the fitted parameters
print("Model Intercept: ", model.intercept_)
print("Model Coef: ", model.coef_)

#### Evaluate the model

In [None]:
# Score the model on the held-out test split
# (the original line was missing the closing parenthesis of print)
print("The Model Score is: ", model.score(X_test, y_test))

# Mean absolute error of the test predictions, in the same units as the target
y_pred = model.predict(X_test)
print("The mean absolute error is: ", mean_absolute_error(y_test, y_pred))

# Range of the target, to put the error into context
print(f"the min price is: {min(y)}")
print(f"the max price is: {max(y)}")

#### Make a prediction

In [None]:
# Compare the model's predictions with the known prices for the first
# `rangeNum` rows and report the mean absolute percentage error.
# NOTE(review): df also contains the rows the model was trained on, so this
# figure is not a held-out estimate.
rangeNum = 200

# Select rows by position so row 0 is included and a short or
# non-contiguous index cannot raise KeyError
sample = df.iloc[:rangeNum]
actual_prices = sample['price'].to_numpy()

# One batched predict call instead of one call per row
predicted_prices = model.predict(sample[features])

# Absolute percentage error of every prediction
error = (np.abs((predicted_prices - actual_prices) / actual_prices) * 100).tolist()

# Calculate the mean error of all errors during prediction
mean_error = np.mean(error)
print(f"The mean error percentage is {round(mean_error, 2)}%")

In [1]:
from IPython.display import Javascript, display

# Ask the notebook front end to clear the output of every cell
clear_outputs_js = Javascript('IPython.notebook.clear_all_output()')
display(clear_outputs_js)


<IPython.core.display.Javascript object>