In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('laptop_data.csv')
df

In [None]:
df.info()

In [None]:
print('Duplicated sum values:', df.duplicated().sum(), '\n')
print('Missing sum values:')
print(df.isnull().sum())

In [None]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
df.head(1)

In [None]:
df['Ram'] = df['Ram'].str.replace('GB', '')
df['Weight'] = df['Weight'].str.replace('kg', '')

In [None]:
df.head(3)

In [None]:
df['Ram'] = df['Ram'].astype('int32')
df['Weight'] = df['Weight'].astype('float32')

In [None]:
df.dtypes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sns.histplot(df['Price'], kde=True, stat='density')

In [None]:
df['Company'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Company'], y=df['Price'])
plt.xticks(rotation=45)
plt.show()

In [None]:

df['TypeName'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['TypeName'], y=df['Price'])
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.histplot(df['Inches'], kde=True, stat='density')

In [None]:
sns.scatterplot(x=df['Inches'], y=df['Price'])

In [None]:
df['ScreenResolution'].value_counts()

In [None]:
df['Touchscreen'] = df['ScreenResolution'].apply(
    lambda x: 1 if 'Touchscreen' in x else 0)

In [None]:
df.sample(5)

In [None]:
df['Touchscreen'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Touchscreen'], y=df['Price'])

In [None]:
df['Ips'] = df['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

In [None]:
df.sample(3)

In [None]:
sns.barplot(x=df['Ips'], y=df['Price'])

In [None]:
SplitResolution = df['ScreenResolution'].str.split('x', n=1, expand=True)

In [None]:
df['X_res'] = SplitResolution[0]
df['Y_res'] = SplitResolution[1]

In [None]:
df.sample(4)

In [None]:
df['X_res'] = df['X_res'].str.replace(',', '').str.findall(r'(\d+\.?\d+)').apply(lambda x: x[0])

In [None]:
df.head(3)

In [None]:
df.dtypes

In [None]:
df['X_res'] = df['X_res'].astype('int')
df['Y_res'] = df['Y_res'].astype('int')

In [None]:
df['Price'].dtypes

In [None]:
df.corr(numeric_only=True)['Price']

In [None]:
df['ppi'] = (((df['X_res']**2) + (df['Y_res']**2))** 0.5/df['Inches']).astype(float)

In [None]:
df.corr(numeric_only=True)['Price']

In [None]:
df.drop(columns=['ScreenResolution'], inplace=True)

In [None]:
df.drop(columns=['Inches', 'X_res', 'Y_res'], inplace=True)

In [None]:
df.head(3)

In [None]:
df['Cpu'].value_counts()

In [None]:
df['Cpu Name'] = df['Cpu'].apply(lambda x:" ".join(x.split()[0:3]))

In [None]:
df.sample(3)

In [None]:
def fetch_processor(text):
    """Collapse a CPU name into one of five coarse processor categories.

    The three mainstream Intel Core tiers are returned unchanged. Any other
    name whose first word is 'Intel' becomes 'Other Intel Processor', and
    every remaining name is labelled 'AMD Processor'.
    """
    if text in ('Intel Core i3', 'Intel Core i5', 'Intel Core i7'):
        return text
    vendor = text.split()[0]
    return 'Other Intel Processor' if vendor == 'Intel' else 'AMD Processor'

In [None]:
df['Cpu Brand'] = df['Cpu Name'].apply(fetch_processor)

In [None]:
df.sample(3)

In [None]:
df['Cpu Brand'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Cpu Brand'], y=df['Price'])
plt.xticks(rotation=45)
plt.show()

In [None]:
df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

In [None]:
df.head(1)

In [None]:
df['Ram'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Ram'], y=df['Price'])
plt.xticks(rotation=45)
plt.show()

In [None]:
df['Memory'].value_counts()

In [None]:
df.columns = df.columns.str.strip()

if 'Memory' in df.columns:
    # Values are expected to look like "256GB SSD" or "128GB SSD +  1TB HDD":
    # at most two drives joined by '+', each with a capacity, a unit and a type.
    # Keys are the labels searched for in the text; values are the output columns.
    storage_columns = {
        'HDD': 'HDD',
        'SSD': 'SSD',
        'Hybrid': 'Hybrid',
        'Flash Storage': 'Flash_Storage',
    }
    unit_to_gb = {'GB': 1, 'TB': 1000}

    # reindex guarantees a second column exists even when no row contains '+'.
    drives = (
        df['Memory'].astype(str)
        .str.split('+', n=1, expand=True)
        .reindex(columns=[0, 1], fill_value='')
    )
    capacity_gb = {
        column: pd.Series(0.0, index=df.index)
        for column in storage_columns.values()
    }

    for position in (0, 1):
        drive = drives[position].fillna('').str.strip()
        # Capture the number together with its unit so that decimal sizes such
        # as "1.0TB" become 1000 GB instead of being truncated to 1.
        parsed = drive.str.extract(r'(\d+(?:\.\d+)?)\s*(GB|TB)?')
        # Rows without any number contribute 0 rather than NaN, so the final
        # integer cast cannot fail.
        size = pd.to_numeric(parsed[0], errors='coerce').fillna(0)
        # A number without an explicit unit is treated as GB.
        factor = parsed[1].map(unit_to_gb).fillna(1)
        size_gb = size * factor
        for label, column in storage_columns.items():
            is_type = drive.str.contains(label, regex=False)
            capacity_gb[column] = capacity_gb[column] + size_gb.where(is_type, 0)

    # Append the totals in the same order as before (HDD, SSD, Hybrid,
    # Flash_Storage) so positional column indices used later stay valid.
    for column, values in capacity_gb.items():
        df[column] = values.round().astype(int)
else:
    print("The 'Memory' column does not exist in the DataFrame.")


In [None]:
df.sample(5)

In [None]:
df.drop(columns=['Memory'], inplace=True)

In [None]:
df.head()

In [None]:
df.corr(numeric_only=True)['Price']

In [None]:
df.drop(columns=['Hybrid', 'Flash_Storage'], inplace=True)

In [None]:
df.sample(3)

In [None]:
df['Gpu'].value_counts()

In [None]:
df['GpuBrand'] = df['Gpu'].apply(lambda x: x.split()[0])

In [None]:
df.sample(5)

In [None]:
df['GpuBrand'].value_counts()

In [None]:
# Keep only rows whose GPU brand is not 'ARM'. Take an explicit copy so that the
# later in-place drops and column assignments operate on an independent frame
# rather than on a slice of the original (avoids SettingWithCopyWarning).
df = df[df['GpuBrand'] != 'ARM'].copy()

In [None]:
df['GpuBrand'].value_counts()

In [None]:
sns.barplot(x=df['GpuBrand'], y=df['Price'])
plt.xticks(rotation=45)
plt.show()

In [None]:
df.drop(columns=['Gpu'], inplace=True)

In [None]:
df.sample(4)

In [None]:
df['OpSys'].value_counts()

In [None]:
sns.barplot(x=df['OpSys'], y=df['Price'])
plt.xticks(rotation=45)
plt.show()

In [None]:
def categorizeOS(op):
    """Map a raw operating-system label onto one of three coarse families.

    Returns 'Windows' for the listed Windows editions, 'Mac' for the two
    macOS spellings, and 'Linux/ChromeOS/Others' for anything else.
    """
    if op in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'Windows'
    if op in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Linux/ChromeOS/Others'

In [None]:
df['os'] = df['OpSys'].apply(categorizeOS)

In [None]:
df.sample(5)

In [None]:
df.drop(columns=['OpSys'], inplace=True)

In [None]:
df.head()

In [None]:
sns.barplot(x=df['os'], y=df['Price'])
plt.xticks(rotation=45)
plt.show()

In [None]:
sns.histplot(df['Weight'], kde=True, stat='density')

In [None]:
sns.scatterplot(x=df['Weight'], y=df['Price'])

In [None]:
df.corr(numeric_only=True)

In [None]:
sns.heatmap(df.corr(numeric_only=True))

In [None]:
sns.histplot(np.log(df['Price']), kde=True, stat='density')

In [None]:
x = df.drop(columns=['Price'])
y = np.log(df['Price'])

In [None]:
x.sample(5)

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=2)

In [None]:
X_train

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
df.head()

In [None]:
# Define the transformers for specific columns.
# One-hot encode the feature columns at positions 0, 1, 7, 10 and 11, dropping
# the first level of each encoded column; the result is a dense array.
# NOTE(review): these positions presumably correspond to Company, TypeName,
# Cpu Brand, GpuBrand and os -- confirm against x.columns. Positional indices
# shift silently whenever a feature column is added, removed or reordered.
transformers = [('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11])]

In [None]:

step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')
step2 = LinearRegression()
pipe = Pipeline([('step1', step1),('step2', step2),])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')
step2 = Ridge(alpha=10)
pipe = Pipeline([('step1', step1),('step2', step2),])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')
step2 = Lasso(alpha=0.001)
pipe = Pipeline([('step1', step1),('step2', step2),])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')
step2 = KNeighborsRegressor(n_neighbors=3)
pipe = Pipeline([('step1', step1),('step2', step2),])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')
# Seed the tree: scikit-learn permutes features at every split, so equally good
# splits can be chosen differently between runs unless random_state is fixed.
step2 = DecisionTreeRegressor(max_depth=8, random_state=3)
pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are computed on the log-transformed price used as the target.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Support Vector Machine (SVM)
A powerful supervised machine learning algorithm used for classification and regression tasks. It works by finding a hyperplane in a high-dimensional space that best separates the data into different classes while maximizing the margin between the classes. SVM is effective for both linear and non-linear problems and is known for its ability to handle complex data and high-dimensional feature spaces.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create a SVM model
step2 = SVR(kernel='rbf', C=10000, epsilon=0.1)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Random Forest
An ensemble learning method that combines multiple decision trees to create a more robust and accurate predictive model. It works by constructing a forest of decision trees during training and averaging or voting on their predictions to improve the overall model's performance. Random Forest is widely used for classification and regression tasks and is known for its ability to handle high-dimensional data, reduce overfitting, and provide feature importance rankings.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create a Random Forest Regressor model
step2 = RandomForestRegressor(n_estimators=100,
                              random_state=3,
                              max_samples=0.5,
                              max_features=0.75,
                              max_depth=15)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### ExtraTrees

Extra Trees, short for Extremely Randomized Trees, is an ensemble learning method similar to Random Forest that builds multiple decision trees. However, it differs in the way it constructs individual trees by using randomization techniques to split nodes and reduce variance. Extra Trees is known for its computational efficiency and robustness against overfitting, making it suitable for various machine learning tasks, especially when dealing with high-dimensional data or noisy datasets.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create a ExtraTrees Regressor model
step2 = ExtraTreesRegressor(n_estimators=100,
                            random_state=3,
                            max_samples=0.5,
                            max_features=0.75,
                            max_depth=15,
                            bootstrap=True)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Adaptive Boost (AdaBoost)
An ensemble learning technique that combines multiple weak learners (typically simple models) to create a strong predictive model. It iteratively adjusts the weights of training instances, emphasizing the misclassified data points in each iteration to improve the model's performance. AdaBoost is particularly effective for binary classification problems and is known for its ability to boost the accuracy of weak models by focusing on difficult-to-classify examples.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create an AdaBoost Regressor model.
# A fixed random_state makes the reported scores reproducible on re-run.
step2 = AdaBoostRegressor(n_estimators=15, learning_rate=1.0, random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are computed on the log-transformed price used as the target.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Gradient Boosting
An ensemble learning method that builds a predictive model by combining the predictions of multiple weak models, such as decision trees, sequentially. It optimizes the model by minimizing the errors of the previous models, adjusting their weights, and adding new models in a gradient descent fashion, resulting in a strong predictive model with improved accuracy. Gradient Boosting is widely used for regression and classification tasks and is known for its robustness and capability to handle complex relationships in data.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create a Gradient Boosting Regressor model.
# A fixed random_state makes the reported scores reproducible on re-run.
step2 = GradientBoostingRegressor(n_estimators=500, random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are computed on the log-transformed price used as the target.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Extreme Gradient Boost (XGBoost)

A powerful and efficient machine learning algorithm that enhances the Gradient Boosting method. It is known for its speed and performance, utilizing techniques such as regularization, parallel processing, and tree pruning to optimize the boosting process. XGBoost is commonly used for a wide range of machine learning tasks, including regression, classification, and ranking, and has been a popular choice in various data science competitions and real-world applications due to its superior predictive accuracy and efficiency.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create a XGB Regressor model
step2 = XGBRegressor(n_estimators=45, max_depth=5, learning_rate=0.5)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Voting Regressor
An ensemble machine learning method that combines the predictions of multiple regression models to make a final prediction. It aggregates the results by averaging the individual model predictions, resulting in a more robust and accurate regression model that can benefit from the diverse strengths of the combined models.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Define individual regression models.
# gbdt subsamples features (max_features=0.5), so it is seeded like the forest
# models to keep the ensemble's score reproducible on re-run.
rf = RandomForestRegressor(n_estimators=350, random_state=3,
                           max_samples=0.5, max_features=0.75, max_depth=15)
gbdt = GradientBoostingRegressor(n_estimators=100, max_features=0.5, random_state=3)
xgb = XGBRegressor(n_estimators=25, learning_rate=0.3, max_depth=5)
et = ExtraTreesRegressor(n_estimators=100, random_state=3,
                         max_samples=0.5, max_features=0.75, max_depth=10, bootstrap=True)

# Create a Voting Regressor model that combines the individual models.
# The weights control each model's influence on the averaged prediction;
# here the random forest counts five times as much as each other model.
step2 = VotingRegressor(
    [('rf', rf), ('gbdt', gbdt), ('xgb', xgb), ('et', et)], weights=[5, 1, 1, 1])

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are computed on the log-transformed price used as the target.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Stacking
Stacking, also known as Stacked Generalization, is an ensemble learning technique that combines multiple machine learning models by training a meta-model on their predictions. It involves using a variety of base models to make predictions on the same dataset, and then a meta-model is trained on these base models' predictions to create a more powerful and accurate final model. Stacking is used to improve predictive performance and can handle complex relationships in the data by leveraging the strengths of different base models.

In [None]:
step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Define individual regression models.
# gbdt subsamples features (max_features=0.5), so it is seeded like the forest
# to keep the stacked model's score reproducible on re-run.
rf = RandomForestRegressor(n_estimators=350, random_state=3,
                           max_samples=0.5, max_features=0.75, max_depth=15)
gbdt = GradientBoostingRegressor(n_estimators=100, max_features=0.5, random_state=3)
xgb = XGBRegressor(n_estimators=25, learning_rate=0.3, max_depth=5)

# Create a Stacking Regressor model: a Ridge meta-model is fitted on the
# predictions of the three base models.
step2 = StackingRegressor(
    [('rf', rf), ('gbdt', gbdt), ('xgb', xgb)], final_estimator=Ridge(alpha=100))

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Both metrics are computed on the log-transformed price used as the target.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('R2 Score:', r2)
print('Mean Absolute Error:', mae)

### Export Model
- Re-run the cell of the best-scoring model so that `pipe` holds that fitted pipeline (the export below saves whatever was assigned to `pipe` last)
- Finally, export the model 

In [None]:
df

In [None]:
X_train

In [None]:
import pickle

# Export the processed DataFrame and the most recently fitted pipeline.
# Context managers ensure each file is flushed and closed even if pickling fails.
with open('laptop_data.pkl', 'wb') as data_file:
    pickle.dump(df, data_file)

with open('pipe_object.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

## Personal Customization Hypertune Parameters
If you want hyper-tuned parameters for training machine learning models, run the following code to optimize their performance. Efficient hyperparameter tuning can significantly improve the accuracy and generalization of the models.

### Random Forest Regressor Model - Tuned

In [None]:
# handle_unknown='ignore' keeps cross-validation from failing when a rare
# category appears only in a validation fold; such rows are encoded as all
# zeros, i.e. like the dropped first level.
transformers = [
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
     [0, 1, 7, 10, 11])
]

step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Seed the forest so that every grid candidate is scored reproducibly.
step2 = RandomForestRegressor(random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

# Parameter grid for hyperparameter tuning ('step2__' targets the regressor step)
param_grid = {
    'step2__n_estimators': [100, 200, 300],
    'step2__max_depth': [10, 15, 20],
    'step2__max_features': [0.6, 0.7, 0.8],
}

# GridSearchCV object for hyperparameter tuning
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Predictions on the test data using the tuned model
y_pred = best_estimator.predict(X_test)

# Evaluate the tuned model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('Best Parameters:', best_params)
print('R2 Score:', round(r2, 2), '(', round(r2 * 100, 2), '%)')
# The target is log(Price), so the MAE is in log-price units, not a percentage.
print('Mean Absolute Error (log price):', round(mae, 4))

### Voting Regressor Model (Rf+Gradient) - Tuned

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# handle_unknown='ignore' keeps cross-validation from failing when a rare
# category appears only in a validation fold; such rows are encoded as all
# zeros, i.e. like the dropped first level.
transformers = [
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
     [0, 1, 7, 10, 11]),
]

step1 = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Create an ensemble of models (seeded so candidate scores are reproducible)
rf = RandomForestRegressor(random_state=3)
gb = GradientBoostingRegressor(random_state=3)

# Create a VotingRegressor model that averages both models with equal weight
voting = VotingRegressor(
    estimators=[('rf', rf), ('gb', gb)],
    weights=[1, 1]
)

# Create a Pipeline that applies the ColumnTransformer, scales the features and
# then fits the VotingRegressor model
pipe = Pipeline([
    ('step1', step1),
    ('scaler', StandardScaler()),
    ('voting', voting)
])

# Parameter distributions for hyperparameter tuning
# ('voting__rf__' / 'voting__gb__' address the estimators inside the ensemble)
param_dist = {
    'voting__rf__n_estimators': [100, 200, 300],
    'voting__rf__max_depth': [10, 15, 20],
    'voting__rf__max_features': [0.6, 0.7, 0.8],
    'voting__gb__n_estimators': [100, 200, 300],
    'voting__gb__max_depth': [3, 4, 5],
}

# Seed the search so the same 10 candidates are sampled on every run.
random_search = RandomizedSearchCV(
    pipe, param_distributions=param_dist, n_iter=10, cv=5,
    scoring='r2', n_jobs=-1, random_state=3)

random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_estimator = random_search.best_estimator_

y_pred = best_estimator.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('Best Parameters:', best_params)
print('R2 Score:', round(r2, 2), '(', round(r2 * 100, 2), '%)')
# The target is log(Price), so the MAE is in log-price units, not a percentage.
print('Mean Absolute Error (log price):', round(mae, 4))


##### Re-run the highest accuracy tuned model again and Export the model

In [None]:
import pickle

# Export the processed DataFrame; the context manager closes the file even if
# pickling fails.
with open('laptop_data.pkl', 'wb') as data_file:
    pickle.dump(df, data_file)

# Export the tuned model (the best estimator found by the last search that ran)
with open('pipe_object.pkl', 'wb') as model_file:
    pickle.dump(best_estimator, model_file)