In [1]:
import pandas as pd

In [2]:
# Labels for the 26 columns of the raw file, in file order; they are handed to
# read_csv through `names=` below.
columns = [
    'symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
    'num_doors', 'body_style', 'drive_wheels', 'engine_location',
    'wheel_base', 'length', 'width', 'height', 'curb_weight',
    'engine_type', 'num_cylinders', 'engine_size', 'fuel_system',
    'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
    'city_mpg', 'highway_mpg', 'price',
]

# Read the raw file; every line is treated as data and labelled with `columns`.
df = pd.read_csv(
    "../../data/as2/imports-85.data",
    header=None,
    names=columns,
)
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [3]:
# Columns that are converted to numbers and mean-imputed in this cell.
numeric_columns = ['normalized_losses', 'bore', 'stroke', 'horsepower', 'peak_rpm', 'price']

# Turn every '?' placeholder in the frame into a proper missing value.
df.replace(to_replace='?', value=pd.NA, inplace=True)

# Parse the selected columns as numbers (unparseable entries become NaN), then
# fill each column's gaps with that column's own mean.
# NOTE(review): 'price' is later used as the regression target, so its missing
# values are being replaced by the mean as well, and the means are computed on
# all rows before the train/test split -- confirm this is intended.
df[numeric_columns] = (
    df[numeric_columns]
    .apply(pd.to_numeric, errors='coerce')
    .pipe(lambda frame: frame.fillna(frame.mean()))
)

# Show the cleaned frame.
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


In [4]:
# Count the missing entries that are still left in each column.
df.isna().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            2
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

In [5]:
# 'num_doors' holds text labels, so its gaps are filled with the most frequent
# label (the first entry returned by mode()).
mode_value = df['num_doors'].mode().iloc[0]
df['num_doors'] = df['num_doors'].fillna(mode_value)

# Re-count missing values per column to confirm none remain.
df.isna().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

In [6]:
# Display the frame again after the numeric and 'num_doors' imputation steps.
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


# Linear Regression Model:

- Implement linear regression on the dataset and evaluate the accuracy of your
regression model using metrics like Mean Absolute Error and R² score.

In [7]:
# Column to predict, and the numeric predictors the regression models use.
target = 'price'
features = [
    'normalized_losses', 'bore', 'stroke', 'horsepower',
    'peak_rpm', 'city_mpg', 'highway_mpg', 'engine_size',
]

# Feature matrix and target vector taken from the cleaned frame.
x = df.loc[:, features]
y = df.loc[:, target]

# Preview the first five rows of each (one print call, newline-separated).
print("Features (X):", x.head(), "\nTarget (y):", y.head(), sep="\n")

Features (X):
   normalized_losses  bore  stroke  horsepower  peak_rpm  city_mpg  \
0              122.0  3.47    2.68       111.0    5000.0        21   
1              122.0  3.47    2.68       111.0    5000.0        21   
2              122.0  2.68    3.47       154.0    5000.0        19   
3              164.0  3.19    3.40       102.0    5500.0        24   
4              164.0  3.19    3.40       115.0    5500.0        18   

   highway_mpg  engine_size  
0           27          130  
1           27          130  
2           26          152  
3           30          109  
4           22          136  

Target (y):
0    13495.0
1    16500.0
2    16500.0
3    13950.0
4    17450.0
Name: price, dtype: float64


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Report how many rows ended up in each part.
print(f"\nTraining set size: {x_train.shape[0]}")
print(f"Testing set size: {x_test.shape[0]}")


Training set size: 164
Testing set size: 41


In [9]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator).
model = LinearRegression().fit(x_train, y_train)

# List each fitted coefficient next to the feature it belongs to, two decimals.
print("\nModel coefficients:")
for feature, coef in pd.Series(model.coef_, index=features).items():
    print(f"{feature}: {coef:.2f}")


Model coefficients:
normalized_losses: 1.10
bore: 319.82
stroke: -2214.01
horsepower: -14.87
peak_rpm: 1.62
city_mpg: -39.84
highway_mpg: -234.50
engine_size: 149.23


In [10]:
# Predict prices for the held-out test rows with the fitted linear model.
y_pred = model.predict(x_test)

# Preview the first five predictions (header and values separated by a newline).
print("\nPredicted prices (first 5):", y_pred[:5], sep="\n")


Predicted prices (first 5):
[26895.9816565  15565.69836027 11015.16278264 13699.15750229
 20655.67063176]


In [11]:
from sklearn.metrics import mean_absolute_error, r2_score

# Recompute the test-set predictions so this cell does not rely on the preview cell.
y_pred = model.predict(x_test)

# Score the linear model on the test split: mean absolute error and R².
lin_mae, lin_r2 = (
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

print(f'Mean Absolute Error: {lin_mae:.2f}')
print(f'R² Score: {lin_r2:.2f}')

Mean Absolute Error: 3126.86
R² Score: 0.76


# Polynomial Regression Model:

- Extend the linear model to a polynomial one and compare the two models using
evaluation metrics (MAE & R² score).

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Polynomial degrees to compare (1 through 5).
degrees = list(range(1, 6))

# Maps degree -> (test MAE, test R²).
results = {}

for degree in degrees:
    # Expand the features to the given degree, then fit least squares on them.
    polynomial_model = make_pipeline(
        PolynomialFeatures(degree=degree),
        LinearRegression(),
    ).fit(x_train, y_train)

    # Score this degree on the held-out test split.
    y_poly_pred = polynomial_model.predict(x_test)
    poly_mae, poly_r2 = (
        mean_absolute_error(y_test, y_poly_pred),
        r2_score(y_test, y_poly_pred),
    )
    results[degree] = (poly_mae, poly_r2)

# One summary line per degree, in the order the degrees were tried.
for degree, (mae, r2) in results.items():
    print(f"Degree {degree} - MAE: {mae:.2f}, R²: {r2:.2f}")


Degree 1 - MAE: 3126.86, R²: 0.76
Degree 2 - MAE: 5895.37, R²: 0.27
Degree 3 - MAE: 69703.01, R²: -288.04
Degree 4 - MAE: 8181.61, R²: -2.00
Degree 5 - MAE: 7374.84, R²: -2.25


#### According to these results, plain polynomial regression is not usable here: the test error grows sharply for degree 2 and above, which points to overfitting. So let's try Ridge polynomial regression, whose regularization should help prevent the model from overfitting.

Ridge Regression is a type of linear regression that includes a regularization term. This term helps prevent overfitting, especially in cases where there are many features or when the features are highly correlated.

`The alpha parameter is the regularization strength:`

- A higher value of $\alpha$ applies more penalty to the coefficients, leading to a simpler model (less overfitting).
- A lower value of $\alpha$ reduces the effect of regularization, allowing the model to fit the training data more closely.

In [13]:
from sklearn.linear_model import Ridge

# Degree-2 feature expansion followed by a Ridge (L2-regularised) linear fit
# with regularisation strength alpha = 1.0.
# NOTE(review): the features are not standardised before the penalty is
# applied; consider whether a scaling step belongs in this pipeline.
degree = 2
ridge_model = make_pipeline(
    PolynomialFeatures(degree=degree),
    Ridge(alpha=1.0),
).fit(x_train, y_train)

# Score the ridge model on the same test split used for the linear model.
y_ridge_pred = ridge_model.predict(x_test)
ridge_mae, ridge_r2 = (
    mean_absolute_error(y_test, y_ridge_pred),
    r2_score(y_test, y_ridge_pred),
)

# Print both models' metrics together for a direct comparison.
print(f"Ridge Polynomial Model MAE: {ridge_mae:.2f}")
print(f"Ridge Polynomial Model R²: {ridge_r2:.2f}")
print(f'Linear Model MAE: {lin_mae:.2f}')
print(f'Linear Model R²: {lin_r2:.2f}')

Ridge Polynomial Model MAE: 2885.26
Ridge Polynomial Model R²: 0.78
Linear Model MAE: 3126.86
Linear Model R²: 0.76


`Statistical Analysis:`
- Perform statistical analysis on the dataset to compute the mean, variance, and
standard deviation of at least two variables.

In [14]:
# Numeric columns to summarise.
numeric_columns = [
    'normalized_losses', 'bore', 'stroke', 'horsepower',
    'peak_rpm', 'city_mpg', 'highway_mpg', 'price',
]

# Per-column mean, variance and standard deviation (pandas defaults), each
# rounded to two decimals.
means = df.loc[:, numeric_columns].mean().round(2)
variances = df.loc[:, numeric_columns].var().round(2)
std_devs = df.loc[:, numeric_columns].std().round(2)

# Put the three summaries side by side: one row per column, one column per statistic.
stats_df = pd.concat(
    [means, variances, std_devs],
    axis=1,
    keys=['Mean', 'Variance', 'Standard Deviation'],
)

stats_df

Unnamed: 0,Mean,Variance,Standard Deviation
normalized_losses,122.0,1003.69,31.68
bore,3.33,0.07,0.27
stroke,3.26,0.1,0.31
horsepower,104.26,1561.77,39.52
peak_rpm,5125.37,227509.06,476.98
city_mpg,25.22,42.8,6.54
highway_mpg,30.75,47.42,6.89
price,13207.13,61917513.18,7868.77


# Open-Ended Questions for Analysis and Interpretation

## 1. Understanding the Relationship in the Data:

- What kind of relationship did you observe between the independent variable(s) and the
dependent variable in your dataset? 

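The relationship between the `features` and the `target` was **mixed**: some features are positively related to price and others negatively. For example, when *engine_size* increases, *price* tends to increase (coefficient 149.23), while higher *highway_mpg* is associated with a lower *price* (coefficient -234.50).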

- Did the linear regression model capture this
relationship well? Why or why not?

The model captured this relationship reasonably **well**: R² = 0.76 means it explains about 76% of the variance in price on the test set. The remaining unexplained variance suggests it may have missed some non-linear patterns.

## 2. Interpreting Model Coefficients:

- Explain the meaning of the coefficients in your linear regression model. How does the
slope of the regression line (coefficient) relate to the change in your dependent variable?

Each coefficient represents the change in the predicted price for a one-unit increase in that feature, holding the other features constant. For instance, a horsepower coefficient of -14.87 means that increasing horsepower by 1 unit decreases the predicted price by about 14.87 when all other features stay the same.

- What is the significance of the intercept in your model? How would the results change if
the intercept were different?

The intercept indicates the expected price when all features are zero. Changes in the intercept could shift the entire regression line, affecting predictions.

The intercept is the predicted price of a car when all features (like engine_size, horsepower, etc.) are zero.
Impact of Changes: If the intercept increases, the entire regression line shifts upward, leading to higher predicted prices across all input values. Conversely, a lower intercept shifts the line downward, resulting in lower predictions.

## 3. Comparing Linear and Polynomial Models:
- Compare the performance of your linear regression model with the polynomial
regression model. Which one performed better based on evaluation metrics (e.g., R²
score, MAE)? Why do you think the polynomial model may or may not have been a
better fit?

The linear model (MAE: 3126.86, R²: 0.76) performed better than the unregularized polynomial model of degree 2 (MAE: 5895.37, R²: 0.27). The polynomial model likely overfitted the data, resulting in poorer predictions. As the results show, raising the polynomial degree further (3 to 5) made test performance even worse, with negative R² scores.

## 4. Model Evaluation and Error Analysis:

- You did the evaluation of the accuracy of your regression models using metrics like MAE
and R². What do these metrics tell you about the quality of your model’s predictions? Are
the errors acceptable for your specific problem?

MAE (Mean Absolute Error): This metric represents the average absolute difference between predicted and actual values. Lower MAE indicates better prediction accuracy. For our models:

+ Ridge Polynomial Model: MAE: 2885.26
+ Linear Model: MAE: 3126.86

The Ridge model has a lower MAE, suggesting it predicts car prices more accurately than the linear model.

R² Score: This metric indicates how well the model explains the variance in the target variable. A higher R² score means a better fit:

+ Ridge Polynomial Model: R²: 0.78
+ Linear Model: R²: 0.76

Both models show similar performance, but the Ridge model explains slightly more variance in car prices.

The polynomial models exhibit high MAE and low R² as the degree increases, indicating overfitting and a poor fit to the data.


We should aim to **increase the R² score** and **decrease the MAE** for our models. Even with our current results, there are techniques like **scaling**, **regularization**, and **feature engineering** that can help achieve better performance.

## 5. Impact of Data Preprocessing:
- How did data preprocessing steps (e.g., handling missing values, scaling features)
impact your regression model? If you skipped preprocessing steps, how would this affect
your results?

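Handling missing values (replacing `?` with the column mean for the numeric columns and with the most frequent value for `num_doors`) improved data quality and gave the models complete, numeric inputs to learn from. Skipping these steps would leave `?` strings in numeric columns, which would either break model fitting or lead to biased predictions and potentially lower model performance. No feature scaling was applied in this notebook.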

## 6. Real-World Implications:
- Based on your model, what real-world insights or predictions can be made from your
analysis? How reliable are these predictions, and what external factors might influence
their accuracy?

The model can predict car prices based on features, guiding pricing strategies. Reliability depends on data quality. External factors like market trends could affect accuracy.

## 7. Reflection on Statistical Concepts:
- How did statistical concepts such as mean, variance, and standard deviation help you
better understand your dataset?

Concepts like mean, variance, and standard deviation helped in understanding the **distribution** and **variability** of `features`, which is crucial for effective modeling.