In [3]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Load the dataset
filename = r'C:\Users\Anjel69\Desktop\machinelearning\laptop_price.csv'
laptop = pd.read_csv(filename, encoding='latin-1')

# Specify the column you want to remove
column_to_remove = 'laptop_ID', 'Inches', 'ScreenResolution', 'Cpu', 'Gpu'

# Drop the specified column
laptop = laptop.drop(column_to_remove, axis=1)

# Split the data into features (X) and target variable (y)
X = laptop[['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Memory', 'Gpu', 'OpSys', 'Weight']]
y = laptop['Price_euros']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# One-hot encode categorical features
ohe = OneHotEncoder()
ohe.fit(X[['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Memory', 'Gpu', 'OpSys', 'Weight']])

# Create a column transformer
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Memory', 'Gpu', 'OpSys', 'Weight']),
    remainder='passthrough'
)

# Create a linear regression model
lr = LinearRegression()

# Create a pipeline with one-hot encoding, column transformation, and linear regression
pipe = make_pipeline(column_trans, lr)

# Fit the model to the training data
pipe.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipe.predict(X_test)

# Evaluate the model performance
r2 = r2_score(y_test, y_pred)
print(f'R-squared Score on Test Set: {r2}')

# Cross-validate the model
scores = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i)
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    scores.append(r2_score(y_test, y_pred))

# Find the best random state
best_random_state = np.argmax(scores)
best_score = scores[best_random_state]
print(f'Best Random State: {best_random_state}, R-squared Score: {best_score}')

# Retrain the model with the best random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=best_random_state)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R-squared Score on Test Set with Best Random State: {r2}')

# Save the model to a file
model_filename = r'C:\Users\Anjel69\Desktop\machinelearning\RegressionModel.pkl'
pickle.dump(pipe, open(model_filename, 'wb'))

# Example prediction using the saved model
example_input = pd.DataFrame(columns=X_test.columns, data=np.array(['Macbook Air', 'Apple', 'Intel HD Graphics 6000']).reshape(1, 9))
prediction = pipe.predict(example_input)
print(f'Example Prediction: {prediction}')


KeyError: "[('laptop_ID', 'Inches', 'ScreenResolution', 'Cpu', 'Gpu')] not found in axis"