In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Build the full path of every file found under the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder 
# Plot styling: seaborn "darkgrid" theme with the "colorblind" palette.
sns.set_theme(style='darkgrid', palette='colorblind')
# None lifts pandas' display limits, so every column and row of a frame is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Load the car-price dataset, using the car_ID column as the row index.
csv_path = "/kaggle/input/car-price/CarPrice_Assignment.csv"
df = pd.read_csv(csv_path, index_col='car_ID')

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Summary statistics; with default arguments describe() reports on the numeric columns.
df.describe()

In [None]:
# Summary of the object-dtype columns only.
df.describe(include="object")

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# dtype of every column.
df.dtypes

# Null values 

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Bar chart of the number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column.plot.bar()

# Duplicate rows

In [None]:
# Rows that exactly repeat an earlier row; keep='first' leaves the first occurrence unflagged.
is_repeat = df.duplicated(keep='first')
duplicate_rows = df.loc[is_repeat]

# How many such rows there are
num_duplicates = len(duplicate_rows)

# Report the count, then display the rows themselves
print(f"Number of duplicate rows: {num_duplicates}")
duplicate_rows

# Visualization

In [None]:
# Pairwise correlations between the numeric columns only. The text columns are
# excluded explicitly: recent pandas versions raise on non-numeric data in
# DataFrame.corr() instead of silently skipping it.
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(8, 8))
# `linewidths` is the parameter name documented by seaborn.heatmap.
sns.heatmap(correlation_matrix, linewidths=.5, cmap='PuRd', square=True)
plt.title('Correlation Matrix', fontsize=20)
plt.show()

In [None]:
print('Top 5 Most Positively Correlated to the Target Variable')
# price correlates perfectly with itself, so remove it before ranking; otherwise
# it takes the first of the five slots.
correlation_matrix['price'].drop('price').sort_values(ascending=False).head(5)

In [None]:
print('Top 5 Most Negatively Correlated to the Target Variable')
# Ascending sort puts the most negative correlations first; show the first five.
price_correlations = correlation_matrix.loc[:, 'price']
price_correlations.sort_values().iloc[:5]

In [None]:
# Columns of the correlation matrix whose absolute correlation with price is below 0.5.
is_weak = correlation_matrix.loc['price'].abs() < 0.5
columns_to_drop = is_weak[is_weak].index.tolist()
columns_to_drop

In [None]:
# Remove the weakly correlated columns. errors='ignore' keeps the cell re-runnable:
# on a second run the columns are already gone and drop() would otherwise raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.shape

In [None]:
# Univariate analysis for the numeric columns: distribution (left) and boxplot (right).
num_cols = df.select_dtypes(exclude='object').columns.to_list()

for col in num_cols:
    fig, ax = plt.subplots(1, 2, figsize=(20, 5))
    # histplot replaces the deprecated distplot; kde=True with stat='density'
    # gives a density-scaled histogram with a KDE curve, as distplot did.
    sns.histplot(df[col], kde=True, stat='density', ax=ax[0], color='teal')
    sns.boxplot(x=df[col], ax=ax[1], color='teal')

    ax[0].set_title(f'Distribution of {col}')
    ax[1].set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Univariate analysis for categorical variables
num_Categorical_cols = df.select_dtypes(include='object').columns.to_list() # selecting numerical columns
num_Categorical_cols = num_Categorical_cols[1:]
for column in num_Categorical_cols:
    print("Unique values in column", column)
    print(df[column].value_counts())
    plt.figure(figsize=(10, 6))
    sns.catplot(data=df, x= column ,kind="count", palette="ch:.25")
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Names of the non-object columns that remain in df.
non_object_frame = df.select_dtypes(exclude='object')
columns_to_graph = list(non_object_frame.columns)
columns_to_graph

In [None]:
# Sort the data based on the "Price" column from low to high
data_sorted = df.sort_values(by='price')
# colors 
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'teal', 'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink' ,'tab:gray']

# Get a list of all columns except for 'Price'
columns_to_graph = df.select_dtypes(exclude='object').columns.to_list() # selecting numerical columns
# Plot each column against 'Price' for the sorted data

for i , column in enumerate(columns_to_graph):
    plt.figure(figsize=(6, 6))  # Set the figure size
    sns.scatterplot(data=data_sorted, x=column, y='price', palette="ch:.25" , color =colors[i])
    plt.xlabel(column)
    plt.ylabel('price')
    plt.xticks(rotation='vertical')  # Rotate x-axis labels vertically
    plt.show()

In [None]:
# Pie chart of how often each value of the doornumber column occurs.
door_counts = df["doornumber"].value_counts()
door_counts.plot(kind='pie')


In [None]:
# Kernel density of price, split by drivewheel; multiple="fill" stacks the groups and
# normalises them so the plot shows each group's share at every price.
sns.kdeplot(data=df, x="price", hue="drivewheel", multiple="fill")

In [None]:
# Summary statistics and distribution of carlength.
print(df['carlength'].describe())
plt.figure(figsize=(9, 8))
# histplot replaces the deprecated distplot; kde=True with stat='density' gives a
# density-scaled histogram with a KDE curve, as distplot did.
sns.histplot(df['carlength'], color='g', bins=100, kde=True, stat='density', alpha=0.4);

# Encoding categorical columns

In [None]:
# Encoder instance used by the loop below to turn the categorical columns into integer codes.
le = LabelEncoder()

In [None]:
# Columns to label-encode: every object-dtype column except the free-text CarName.
# Selecting by dtype here avoids depending on a list built inside an earlier plotting
# cell, and yields an empty list if the columns have already been encoded.
columns = [
    name
    for name in df.select_dtypes(include='object').columns.to_list()
    if name != 'CarName'
]

In [None]:
# Encode each categorical column with its own LabelEncoder and keep the fitted encoders,
# so the integer codes can be mapped back to the original labels later with
# label_encoders[col].inverse_transform(...). Refitting one shared encoder per column
# would keep only the last column's mapping.
label_encoders = {}
for col in columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Model

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Features: every remaining column except CarName and the target; target: price.
X = df.drop(columns=['CarName', "price"])
y = df['price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split.shape)

In [None]:
# Candidate regressors, all fitted and scored on the same train/test split.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# Start below any attainable score: R2 is negative for a model that does worse than
# predicting the mean, and a starting value of 0 would then leave best_model as None.
best_r2 = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out test set
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Actual vs. predicted prices side by side; reset_index() moves the row label into a column.
    submit = pd.DataFrame({'Actual Price': y_test, 'Predict_price': y_pred}).reset_index()

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    # Print the label first so the table below it is attributed to the right model.
    print(f'{model_name}:')
    print(submit.head(8))
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')
# The reported number is the R2 score, not a classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

In [None]:
# Take the importances from the gradient-boosting model explicitly rather than from
# whatever the loop variable `model` last pointed to: that only works while the last
# entry of `models` happens to expose feature_importances_ (LinearRegression does not).
importance_model = models['Gradient Boosting']
importances = importance_model.feature_importances_

feature_names = X.columns

feature_importance_dict = dict(zip(feature_names, importances))

# Most important feature first
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

plt.figure(figsize=(12, 7))
plt.barh(*zip(*sorted_feature_importance), alpha=0.9, color='teal')
# barh draws the first entry at the bottom; flip the axis so the most important feature is on top.
plt.gca().invert_yaxis()
plt.title('Feature Importance', fontsize=15)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

# Reduce Unnecessary Columns


# SelectKBest

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Rebuild the full feature matrix (everything except CarName and the target) and the target.
target_column = "price"
X = df.drop(columns=['CarName', target_column])
y = df[target_column]


In [None]:
# Score every feature against the target with f_regression and keep the 8 best.
selector = SelectKBest(k=8, score_func=f_regression)
selector.fit(X, y)
X_new = selector.transform(X)

# get_support() returns a boolean mask over X's columns (True = feature kept)
selected_indices = selector.get_support()
print("Selected feature indices:", selected_indices)

In [None]:
# Get the names of the selected features
selected_feature_names = X.columns[selected_indices]

# Convert the transformed X_new array back to a DataFrame with the selected features.
# Reuse X's index (car_ID): with a default 0..n-1 index, the price assignment below
# would align on label, pair every row with the previous car's price and leave the
# first row as NaN.
X_new_df = pd.DataFrame(X_new, columns=selected_feature_names, index=X.index)

# Attach the target and preview the result
X_new_df["price"] = df.price
X_new_df.head(5)


In [None]:
# Assign the target by position rather than by index label. X_new_df's rows are in the
# same order as df's, but the two frames can carry different index labels; label
# alignment then leaves gaps that would need a hard-coded fill value.
X_new_df['price'] = df['price'].to_numpy()

In [None]:
# Use the columns chosen by SelectKBest directly instead of a hand-copied list, so the
# model input cannot drift from the selector's output when the data or k changes.
X = df[selected_feature_names.tolist()]
y = df['price']

# Same 80/20 split and seed as before, so scores are comparable across feature sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Candidate regressors, all fitted and scored on the current train/test split.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# Start below any attainable score: R2 is negative for a model that does worse than
# predicting the mean, and a starting value of 0 would then leave best_model as None.
best_r2 = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out test set
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Actual vs. predicted prices side by side; reset_index() moves the row label into a column.
    submit = pd.DataFrame({'Actual Price': y_test, 'Predict_price': y_pred}).reset_index()

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    # Print the label first so the table below it is attributed to the right model.
    print(f'{model_name}:')
    print(submit.head(8))
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')
# The reported number is the R2 score, not a classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

# Forward selection

In [None]:
# Start again from the full feature set: target first, then every column except CarName and price.
y = df['price']
X = df.drop(columns=['CarName', "price"])

In [None]:
import pandas as pd
import statsmodels.api as sm

def forward_selection(df, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an intercept)
    per remaining candidate, made of the features chosen so far plus that
    candidate. The candidate with the smallest p-value is added if that p-value
    is below ``significance_level``; otherwise the search stops.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate features, one column per feature.
    target : array-like
        Response variable passed to ``sm.OLS``.
    significance_level : float, default 0.05
        A candidate is only added while its p-value is below this threshold.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    candidate_features = df.columns.tolist()
    best_features = []
    # Stop once every candidate has been selected; there is nothing left to test.
    while len(best_features) < len(candidate_features):
        # Keep the original column order (a set difference has no stable order),
        # so ties between equal p-values are broken the same way on every run.
        remaining_features = [
            name for name in candidate_features if name not in best_features
        ]
        # Explicit float dtype: without it recent pandas creates an object-dtype
        # Series, on which idxmin() is not supported.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(df[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

 

# Run forward selection with X as the candidate features and y as the target
# (both must already be defined), using the default significance level of 0.05.
selected_features = forward_selection(X, y)
print("Selected features:", selected_features)


In [None]:
# Train on the features returned by forward_selection(). A hard-coded list used to be
# assigned here as `Selected_features` (capital S) but was never read, because the
# indexing below uses the lowercase `selected_features`. The dead list is removed so
# the cell no longer suggests that a fixed feature set is in use.
X = df[selected_features]
y = df['price']

# Same 80/20 split and seed as before, so scores are comparable across feature sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Candidate regressors, all fitted and scored on the current train/test split.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# Start below any attainable score: R2 is negative for a model that does worse than
# predicting the mean, and a starting value of 0 would then leave best_model as None.
best_r2 = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out test set
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Actual vs. predicted prices side by side; reset_index() moves the row label into a column.
    submit = pd.DataFrame({'Actual Price': y_test, 'Predict_price': y_pred}).reset_index()

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    # Print the label first so the table below it is attributed to the right model.
    print(f'{model_name}:')
    print(submit.head(8))
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')
# The reported number is the R2 score, not a classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")