In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from sklearn.preprocessing import LabelEncoder 
# Module-level LabelEncoder instance; a LabelEncoder maps distinct labels to integer codes.
le = LabelEncoder()

In [None]:
# Load the car-price table, using the car_ID column as the row index.
df = pd.read_csv(
    "/kaggle/input/car-price/CarPrice_Assignment.csv",
    index_col='car_ID',
)

# From here on, suppress every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# ENCODE CATEGORICAL COLUMNS

In [None]:
# Label-encode every string-valued (object dtype) column so the regressors below get numeric input.
# A separate encoder is kept per column: refitting one shared encoder would discard every mapping
# except the last, making it impossible to translate codes back to the original categories.
# Decode later with label_encoders[column].inverse_transform(codes).
# NOTE(review): integer label codes impose an arbitrary order on nominal categories, which a
# linear model will treat as meaningful — consider one-hot encoding if that matters.
categorical_cols = df.select_dtypes(include='object').columns.to_list()
label_encoders = {}
for column in categorical_cols:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

In [None]:
# Preview the first rows to check the result of the label-encoding step above.
df.head()

# SPLIT AND EVALUATE

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
#=============================================================================================

# Baseline: fit three regressors on every feature column except CarName and compare
# their scores on a held-out test set.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

y = df['price']
X = df.drop(columns=['CarName', "price"])

# 80/20 hold-out split; the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for model_name, model in models.items():
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # One print call; sep='\n' puts each report line on its own row.
    print(
        f'{model_name}:',
        f'R-squared: {r2:.2f}',
        f'Mean Absolute Error (MAE): {mae:.2f}',
        f'Root Mean Squared Error (RMSE): {rmse:.2f}',
        '----------------------------------------',
        sep='\n',
    )


# SELECT BEST AND CREATE NEW DATA SET 

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler
#==================================================================
# Univariate feature selection: keep the k features with the highest F-statistic against
# price and export them, together with the target, as a new dataset.
# NOTE(review): the selector is fitted on all rows, including rows that later end up in a
# test split, so downstream test scores are slightly optimistic.
X = df.drop(columns=['CarName', 'price'])
y = df['price']

# Rescale every feature with MinMaxScaler; these scaled values (not the raw ones) are what
# end up in the exported dataset.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

k = 5  # number of features to keep
selector = SelectKBest(score_func=f_regression, k=k)
X_selected = selector.fit_transform(X_scaled, y)

# Translate the selector's column positions back into column names.
selected_indices = selector.get_support(indices=True)
selected_feature_names = [X.columns[i] for i in selected_indices]

# fit_transform returns a bare NumPy array, so the original car_ID index has to be
# re-attached explicitly. Without it X_new gets a 0-based RangeIndex while y is indexed by
# car_ID; pd.concat aligns on index labels, so every feature row would be paired with the
# price of a different car and extra NaN rows would appear.
X_new = pd.DataFrame(X_selected, columns=selected_feature_names, index=X.index)

# With matching indexes the concat is a clean one-to-one join and no NaN patching is needed.
new_data = pd.concat([X_new, y], axis=1)
assert len(new_data) == len(df), "feature rows and prices must align one-to-one"

# Save the new dataset to a CSV file (optional)
new_data.to_csv('new_dataset_with_selected_features.csv', index=False)
new_data.head()

# SPLIT AND EVALUATE FOR NEW DATA 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
#=============================================================================================

# Re-run the three-model comparison on a five-column subset of the features.
# NOTE(review): the hard-coded columns presumably mirror the SelectKBest result from the
# previous section — confirm they still match if the data or k changes.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

top_features = ['carwidth', 'curbweight', 'enginesize', 'horsepower', 'highwaympg']
X = df[top_features]
y = df['price']

# Same seeded 80/20 split as the other evaluation cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for model_name in models:
    model = models[model_name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Hold-out metrics for this model.
    r2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    report = [
        f'{model_name}:',
        f'R-squared: {r2:.2f}',
        f'Mean Absolute Error (MAE): {mae:.2f}',
        f'Root Mean Squared Error (RMSE): {rmse:.2f}',
        '----------------------------------------',
    ]
    print('\n'.join(report))


# FORWARD SELECTION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Greedy forward selection with LinearRegression: at every step, add the single remaining
# feature that gives the highest R-squared when combined with the features chosen so far.
# NOTE(review): candidates are scored on X_test/y_test, so the chosen features are tuned to
# the test split; a separate validation split would give an unbiased final score.
X = df.drop(columns=['CarName', 'price'])
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

selected_features = []
best_r2 = -float('inf')

# Upper bound on how many features are picked (use len(X.columns) to rank every column).
max_features = 5

while len(selected_features) < max_features:
    best_feature = None
    best_feature_r2 = -float('inf')

    # Score each not-yet-selected feature together with the current selection.
    for feature in X_train.columns:
        if feature in selected_features:
            continue

        current_features = selected_features + [feature]
        model = LinearRegression()
        model.fit(X_train[current_features], y_train)
        y_pred = model.predict(X_test[current_features])
        r2 = r2_score(y_test, y_pred)

        if r2 > best_feature_r2:
            best_feature_r2 = r2
            best_feature = feature

    if best_feature is None:
        # Nothing left to add (all columns used, or no candidate produced a comparable
        # score). Without this exit the while-loop would never terminate, because
        # selected_features would stop growing.
        break

    # The step winner is always added, even when it lowers R-squared compared with the
    # previous step; best_r2 therefore records the highest score seen at any step.
    selected_features.append(best_feature)
    best_r2 = max(best_r2, best_feature_r2)

# Print selected features and their R-squared values
print("Selected Features:")
for idx, feature in enumerate(selected_features, start=1):
    print(f"{idx}. {feature}")

print(f"\nBest R-squared value with selected features: {best_r2:.4f}")


# FORWARD SELECTION (FUNCTION VERSION)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Rebuild the feature matrix (everything except CarName and the target) and the target,
# then recreate the seeded 80/20 train/test split.
y = df['price']
X = df.drop(columns=['CarName', 'price'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def forward_selection(X_train, X_test, y_train, y_test, num_features=5,
                      estimator_factory=None, scorer=None):
    """Greedily pick up to ``num_features`` columns that improve the hold-out score.

    At each step every not-yet-selected column is tried together with the columns
    already chosen; the column giving the highest score is added, but only if that
    score is strictly higher than the best score reached so far (which starts at 0).
    Selection stops early as soon as no candidate improves the score.

    Candidates are tried in the column order of ``X_train`` so that ties are always
    resolved the same way (iterating over a ``set`` would make ties depend on hash
    order and therefore vary between runs).

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables with identical columns.
    y_train, y_test : array-like
        Targets matching the rows of ``X_train`` / ``X_test``.
    num_features : int, default 5
        Maximum number of columns to select.
    estimator_factory : callable, optional
        Zero-argument callable returning a fresh model exposing ``fit(X, y)`` and
        ``predict(X)``. Defaults to ``LinearRegression``.
    scorer : callable, optional
        ``scorer(y_true, y_pred) -> float`` where higher is better.
        Defaults to ``r2_score``.

    Returns
    -------
    list
        Selected column names, in the order they were added (possibly empty).

    Notes
    -----
    Candidates are scored on ``X_test`` / ``y_test``, so the selection is tuned to
    that split; pass a validation split here if the test set must stay untouched.
    """
    if estimator_factory is None:
        estimator_factory = LinearRegression
    if scorer is None:
        scorer = r2_score

    selected_features = []
    best_score = 0  # a candidate has to beat this to be selected at all

    while len(selected_features) < num_features:
        best_feature = None

        for feature in X_train.columns:
            if feature in selected_features:
                continue

            trial_features = selected_features + [feature]
            model = estimator_factory()
            model.fit(X_train[trial_features], y_train)
            score = scorer(y_test, model.predict(X_test[trial_features]))

            # Strict comparison: on a tie the earlier column wins.
            if score > best_score:
                best_score = score
                best_feature = feature

        if best_feature is None:
            # No remaining column improves on the best score so far.
            break
        selected_features.append(best_feature)

    return selected_features

# Run the selection for at most five features and show which columns were picked.
selected_features = forward_selection(
    X_train,
    X_test,
    y_train,
    y_test,
    num_features=5,
)
print("Selected features:", selected_features)


# SPLIT AND EVALUATE FOR FORWARD SELECTION

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
#=============================================================================================

# Three-model comparison restricted to five hard-coded columns.
# NOTE(review): these columns presumably come from the forward-selection output above —
# confirm they still match if the selection is re-run with different data or settings.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

forward_features = ['fueltype', 'curbweight', 'enginesize', 'peakrpm', 'drivewheel']
y = df['price']
X = df[forward_features]

# Seeded 80/20 hold-out split, identical to the other evaluation cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out rows.
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    for line in (
        f'{model_name}:',
        f'R-squared: {r2:.2f}',
        f'Mean Absolute Error (MAE): {mae:.2f}',
        f'Root Mean Squared Error (RMSE): {rmse:.2f}',
        '----------------------------------------',
    ):
        print(line)