## Initial Steps

### Import Libraries needed

In [None]:
# data manipulation
import pandas as pd
import numpy as np

# data viz
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Import the Data

In [None]:
# Raw CSV with the car data, hosted on GitHub
url = 'https://raw.githubusercontent.com/vkoul/data/main/misc/car_data.csv'

# Load the CSV directly from the URL into a DataFrame
df = pd.read_csv(url)

## Data Inspection

### Data Inspect

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Dimensions of the frame as (rows, columns)
df.shape

### Missing Values and Duplicated

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Percentage of missing values in each column
df.isna().mean() * 100

In [None]:
# Drop every row that contains at least one missing value.
# NOTE(review): the author was unsure whether dropping NAs is the right call here — confirm.
df = df.dropna()
df

In [None]:
# Share of rows that duplicate an earlier row (the mean of a boolean flag is its proportion)
df.duplicated().mean()

In [None]:
# Remove duplicated rows: this is a list of cars and their prices, so there is no reason to keep repeated entries
df = df.drop_duplicates()
df

In [None]:
# Share of duplicated rows after the clean-up
df.duplicated().mean()

**Rename the Columns (just to be easier)**

In [None]:
# Column names as they come from the CSV
df.columns

In [None]:
# Lower-case every column name
df.columns = [column_name.lower() for column_name in df.columns]

In [None]:
# Use underscores instead of spaces so columns can be accessed as attributes
df.columns = [column_name.replace(' ', '_') for column_name in df.columns]

In [None]:
# Check the cleaned-up column names
df.columns

In [None]:
# Rename 'msrp' to the more readable 'price'
df = df.rename(columns={'msrp': 'price'})
df.columns

## EDA

### Univariate Analysis (and remove duplicates)

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Extra percentiles at both tails to make extreme values easier to spot
tail_percentiles = [0.005, 0.01, 0.05, 0.1, 0.2, 0.8, 0.9, 0.99, 0.995, 0.999]
df.describe(percentiles=tail_percentiles)

In [None]:
# Number of cars per model year
df['year'].value_counts()

In [None]:
# Histogram of the model years, labelled through the returned Axes
ax = df['year'].plot.hist()
ax.set_title('Number of Cars per Year')
ax.set_ylabel('Number of Cars')
ax.set_xlabel('Year');

In [None]:
# Frequency of each engine horsepower value
df['engine_hp'].value_counts()

In [None]:
# engine_hp == 1001 is far away from every other value, so those rows are removed
outlier_rows = df[df['engine_hp'] == 1001].index
df = df.drop(index=outlier_rows)
df['engine_hp'].value_counts()

In [None]:
# Bucket engine_hp into 6 equal-width classes and plot how many cars fall in each
hp_classes = pd.cut(df['engine_hp'], bins=6)
ax = hp_classes.value_counts().sort_index().plot.bar()
ax.set_title('Number of Cars per Class of Engine HP')
ax.set_ylabel('Number of Cars')
ax.set_xlabel('Engine HP');

In [None]:
# Frequency of each cylinder count
df['engine_cylinders'].value_counts()

In [None]:
# Rows with 0 cylinders are treated as an outlier (assumed to be a data-entry mistake) and removed
outlier_rows = df[df['engine_cylinders'] == 0].index
df = df.drop(index=outlier_rows)
df['engine_cylinders'].value_counts()

In [None]:
# Histogram of the cylinder counts
ax = df['engine_cylinders'].plot.hist()
ax.set_title('Number of Cars per Engine Cylinders')
ax.set_ylabel('Number of Cars')
ax.set_xlabel('Engine Cylinders');

In [None]:
# Frequency of each door count
df['number_of_doors'].value_counts()

In [None]:
# Remove the cars that have 3 doors
three_door_rows = df[df['number_of_doors'] == 3].index
df = df.drop(index=three_door_rows)
df['number_of_doors'].value_counts()

In [None]:
# Pie chart of the remaining door counts
ax = df['number_of_doors'].value_counts().plot.pie()
ax.set_title('Proportion of Cars with 4 and 2 Doors')
ax.set_ylabel('Number of Cars');

In [None]:
# Frequency of each highway MPG value
df['highway_mpg'].value_counts()

In [None]:
# Remove the rows whose highway_mpg is 354
outlier_rows = df[df['highway_mpg'] == 354].index
df = df.drop(index=outlier_rows)
df['highway_mpg'].value_counts()

In [None]:
# Bucket highway_mpg into 5 equal-width classes and plot the count per class
highway_classes = pd.cut(df['highway_mpg'], bins=5)
ax = highway_classes.value_counts().sort_index().plot.bar()
ax.set_title('Number of Cars per Highway MPG')
ax.set_ylabel('Number of Cars')
ax.set_xlabel('MPG');

In [None]:
# Frequency of each city MPG value
df['city_mpg'].value_counts()

In [None]:
# Bucket city_mpg into 5 equal-width classes and plot the count per class
city_classes = pd.cut(df['city_mpg'], bins=5)
ax = city_classes.value_counts().sort_index().plot.bar()
ax.set_title('Number of Cars per City MPG')
ax.set_ylabel('Number of Cars')
ax.set_xlabel('MPG');

In [None]:
# Price bands: 10k-wide bins up to 70k, followed by a single top band.
# With right=False every band is [low, high), so a car priced at or above a
# hard-coded top edge would silently fall outside all bands. The last edge is
# therefore derived from the data (max price + 1), and kept above 70000 so the
# edges stay strictly increasing.
bin_edges = [2000, 10000, 20000, 30000, 40000, 50000, 60000, 70000]
bin_edges.append(max(int(df.price.max()) + 1, bin_edges[-1] + 1))

pd.cut(df.price, bin_edges, right = False).value_counts().sort_index().plot(kind='bar');

plt.title('Number of Cars per Price')
plt.ylabel('Number of Cars')
plt.xlabel('Price');

In [None]:
# Number of cars in each price band, using the bin_edges defined for the price chart
pd.cut(df.price, bin_edges, right = False).value_counts()

In [None]:
# Summary statistics of the numeric columns after the outlier removal
df.describe()

In [None]:
# Same tail percentiles as before, now on the cleaned data
tail_percentiles = [0.005, 0.01, 0.05, 0.1, 0.2, 0.8, 0.9, 0.99, 0.995, 0.999]
df.describe(percentiles=tail_percentiles)

In [None]:
# Summary statistics for the object-dtype (text) columns
df.describe(include = 'O')

### Bivariate Analysis

In [None]:
# Columns available for the bivariate analysis
df.columns

**Price by Make**

In [None]:
# The 10 makes with the most cars in the data
df['make'].value_counts().nlargest(10)

In [None]:
# Average price for each make, drawn as a bar chart
mean_price_by_make = df.groupby('make')['price'].mean()
mean_price_by_make.plot.bar(figsize=(15, 10));

**Price by Engine Fuel Type**

In [None]:
# Average price for each engine fuel type, drawn as a bar chart
mean_price_by_fuel = df.groupby('engine_fuel_type')['price'].mean()
mean_price_by_fuel.plot.bar(figsize=(15, 10));

**Correlation between values**

In [None]:
# Correlation of every numeric column with price.
# The frame still holds text columns (make, model, ...); since pandas 2.0
# DataFrame.corr no longer skips them silently and raises instead, so the
# numeric columns are selected explicitly.
df.select_dtypes(include='number').corr()['price'].round(2)

In [None]:
# Same correlations, without the trivial 1.0 of price against itself.
# Only numeric columns are kept: corrwith cannot correlate the text columns
# and recent pandas versions raise on them instead of dropping them.
df.select_dtypes(include='number').drop(columns='price').corrwith(df['price']).round(2)

In [None]:
# Full correlation matrix of the numeric columns, rounded for display.
# Text columns are excluded explicitly because DataFrame.corr raises on them
# in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr().round(2)
corr

In [None]:
# Heatmap of the rounded correlation matrix, with each cell annotated by its value
sns.heatmap(corr, annot = True, cmap = 'Reds');

In [None]:
# One-column heatmap: correlation of each numeric column with price.
# Numeric columns are selected explicitly because DataFrame.corr raises on
# text columns in pandas >= 2.0.
price = df.select_dtypes(include='number').corr()['price'].round(2).to_frame()
sns.heatmap(price, annot=True, cmap = 'Reds');

### Data Prep for Modelling

**Separate the values of Market Category**

In [None]:
# Split the comma-separated 'market_category' strings into one column per position
df_expanded = df['market_category'].str.split(',', expand=True)

# One indicator column per distinct category: stack to long form, one-hot
# encode, then collapse back to one row per car by summing over the original
# row label. (Series/DataFrame.sum(level=...) was removed in pandas 2.0, so the
# equivalent groupby(level=0).sum() is used.)
df_expanded = pd.get_dummies(df_expanded.stack()).groupby(level=0).sum()

# Concatenate the original DataFrame and the indicator columns (aligned on the index).
# pd.concat already returns a new frame, so no separate copy of df is needed first.
df2 = pd.concat([df, df_expanded], axis = 1)

In [None]:
# Columns after adding the market-category indicators
df2.columns

In [None]:
# The raw 'market_category' text is now redundant with its indicator columns, so drop it
df2 = df2.drop(columns='market_category')
df2.columns

**Conver the other categorical value to numerical**

In [None]:
# One-hot encode the remaining categorical columns
categorical_columns = ['make', 'model', 'engine_fuel_type', 'transmission_type',
                       'driven_wheels', 'vehicle_size', 'vehicle_style']
df2 = pd.get_dummies(df2, columns=categorical_columns)

In [None]:
# Columns after one-hot encoding
df2.columns

**Rename again the columns**

In [None]:
# Lower-case the generated column names
df2.columns = [column_name.lower() for column_name in df2.columns]

In [None]:
# Replace spaces with underscores in the generated column names
df2.columns = [column_name.replace(' ', '_') for column_name in df2.columns]

In [None]:
# Replace hyphens with underscores in the generated column names
df2.columns = [column_name.replace('-', '_') for column_name in df2.columns]

In [None]:
# Final column names used for modelling
df2.columns

**Train Test Splitting**

In [None]:
# Data Prep
from sklearn.model_selection import train_test_split

In [None]:
# Splitting the Data
x = df2.drop(columns = 'price')
y = df2['price']

In [None]:
# Doing train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
# Report the shape of each piece of the split
splits = [('x_train', x_train), ('y_train', y_train), ('x_test', x_test), ('y_test', y_test)]
for split_name, split in splits:
    print(split_name + ': ', split.shape)

**Standardization**

In [None]:
# Data Prep
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardize the features
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then scale them.
# Note: x_train is replaced by the array the scaler returns.
x_train = scaler.fit_transform(x_train)

In [None]:
# Scale the test features with the already-fitted scaler (no re-fitting on test data)
x_test = scaler.transform(x_test)

In [None]:
# Inspect the scaled training features
x_train

In [None]:
# Inspect the scaled test features
x_test

## Model Building - Linear

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model with default settings
linear = LinearRegression()

In [None]:
# Fit the model on the scaled training data
linear.fit(x_train, y_train)

In [None]:
# Score on the training data (R^2 for scikit-learn regressors)
linear.score(x_train, y_train)

In [None]:
# Predicted prices for the training rows
predictions = linear.predict(x_train)

**Train Data Predictions**

In [None]:
# Actual vs. predicted training prices side by side, ordered by the original row index
output = pd.DataFrame({'actual': y_train, 'predictions': predictions}).sort_index()

# First rows of the comparison table
output.head()

In [None]:
# Visualize the Predictions

In [None]:
# Overlay actual and predicted training prices on one large figure
fig, ax = plt.subplots(figsize=(20, 15))

for column in ('actual', 'predictions'):
    ax.plot(output[column], label=column)

ax.set_ylabel('Price')
ax.legend()

plt.show();

**Test Data Predictions**

In [None]:
# Data Prep
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Score on the held-out test data (R^2 for scikit-learn regressors)
linear.score(x_test, y_test)

In [None]:
# Predictions for test data, rounded to one decimal
test_predictions = np.round(linear.predict(x_test),1)

# R^2 on the test data
print('R^2 of test: {:.2f}'.format(linear.score(x_test, y_test)))

# Error metrics. RMSE is computed as the square root of the MSE: the
# `squared=False` shortcut of mean_squared_error is deprecated and has been
# removed from recent scikit-learn releases.
print('RMSE of test: {:.2f}'.format(np.sqrt(mean_squared_error(y_test, test_predictions))))
print('MAE of test: {:.2f}'.format(mean_absolute_error(y_test, test_predictions)))

In [None]:
## Model Score of test data is really bad opposed to a really good one for training data. This actually makes sense due
## to the amount of variables that we are using. It is clearly a problem of overfitting.

## Model Building - KNeihborsRegressor

In [None]:
# Prep the data
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Baseline: K = 1, i.e. each prediction comes from the single nearest training point
knn = KNeighborsRegressor(n_neighbors=1)
knn.fit(x_train, y_train)

# Score (R^2) on the training split first, then on the test split
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(knn.score(features, target))

**Try several values for K**

In [None]:
# Scores (R^2) collected for every K that is tried
training_accuracy = []
test_accuracy = []

# try n_neighbors from 1 to 20 (range excludes its upper bound, hence 21;
# the previous range(1,20) stopped at 19)
neighbors_settings = range(1, 21)

for n_neighbors in neighbors_settings:

    # build the model
    knn = KNeighborsRegressor(n_neighbors = n_neighbors)
    knn.fit(x_train, y_train)

    # record the score on the training set
    training_accuracy.append(knn.score(x_train, y_train))

    # record the score on the test set (generalization)
    test_accuracy.append(knn.score(x_test, y_test))

In [None]:
# Test scores first, then the training scores, one entry per K tried
print(test_accuracy)
print('--------')
print(training_accuracy)

In [None]:
## Visualize the training and test scores for every K
plt.figure(figsize = (10,8))

plt.plot(neighbors_settings, training_accuracy, marker = 'o', label = 'Training Accuracy')
plt.plot(neighbors_settings, test_accuracy, marker = 'o', label = 'Test Accuracy')

# Look up the K behind the best and worst test score in neighbors_settings itself.
# (Computing it as "list position + 1" is only right while the K values start
# at 1 and increase by 1.)
best_k = neighbors_settings[test_accuracy.index(max(test_accuracy))]
worst_k = neighbors_settings[test_accuracy.index(min(test_accuracy))]

# highlight the max and min accuracy values
plt.axvline(x = best_k, linewidth = 10, alpha = 0.2, color ='green')
plt.axvline(x = worst_k, linewidth = 10, alpha = 0.2, color ='red')

plt.ylabel('Accuracy')
plt.xlabel('K')

plt.legend();

In [None]:
# Is it ok to use K = 1 or is it wrong? We are going to have big variance with this, should we opt for K = 2?

**Using K = 2**

In [None]:
# Refit with K = 2
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(x_train, y_train)

# Score (R^2) on the training split first, then on the test split
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(knn.score(features, target))

In [None]:
# Predicted prices for the training rows from the fitted KNN model
predictionsknn = knn.predict(x_train)

In [None]:
# Actual vs. KNN-predicted training prices, ordered by the original row index
outputknn = pd.DataFrame({"actualknn": y_train, "predictionsknn": predictionsknn}).sort_index()

# First rows of the comparison table
outputknn.head()

In [None]:
# Overlay actual and KNN-predicted training prices on one large figure
fig, ax = plt.subplots(figsize=(20, 15))

for column, legend_label in (('actualknn', 'actual'), ('predictionsknn', 'predictions')):
    ax.plot(outputknn[column], label=legend_label)

ax.set_ylabel('Price')
ax.legend()

plt.show();

In [None]:
# KNN predictions for the test data, rounded to one decimal
test_predictionsknn = np.round(knn.predict(x_test),1)

# R^2 plus error metrics on the test data. RMSE is computed as the square root
# of the MSE: the `squared=False` shortcut of mean_squared_error is deprecated
# and has been removed from recent scikit-learn releases.
print("R^2 of test: {:.2f}".format(knn.score(x_test, y_test)))
print("RMSE of test: {:.2f}".format(np.sqrt(mean_squared_error(y_test, test_predictionsknn))))
print("MAE of test: {:.2f}".format(mean_absolute_error(y_test, test_predictionsknn)))

In [None]:
# Actual vs. KNN-predicted test prices, ordered by the original row index
test_outputknn = pd.DataFrame(
    {"actualknn": y_test, "test_predictionsknn": test_predictionsknn}
).sort_index()
test_outputknn.head()

In [None]:
# Overlay actual and KNN-predicted test prices, marking every point
fig, ax = plt.subplots(figsize=(20, 15))

for column, legend_label in (("actualknn", "actual"), ("test_predictionsknn", "predictions")):
    ax.plot(test_outputknn[column], marker="o", label=legend_label)

ax.set_ylabel("Price")
ax.legend()

plt.show();