In [42]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    LinearRegression,  # noqa: F401
    LogisticRegression,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset
# The dataset location is set through `my_file_path` below
# Import the project class used to load the data
from src.chip_analysis.data_summary import DataProcess

## Load the chip dataset; point `my_file_path` at your own copy if needed
my_file_path = 'https://raw.githubusercontent.com/Zhengnan817/Project-3-Data-Reconstruction-and-Analysis/main/src/chip_analysis/data/chip_dataset.csv'
cpu_table = DataProcess(my_file_path)
df = cpu_table.view_data()
# 'Release Date' is used as loaded: no pd.to_datetime conversion is applied here.

# Remove the 'Product' and 'Vendor' columns before splitting by chip type
columns_to_drop = ['Product', 'Vendor']
df_new = df.drop(columns=columns_to_drop)

# One frame per chip type; 'Type' is constant inside each subset, so it is dropped
gpu_data = df_new.loc[df_new['Type'] == 'GPU'].drop(columns='Type')
cpu_data = df_new.loc[df_new['Type'] == 'CPU'].drop(columns='Type')

# Preview expression; its result is discarded because it is not the last statement
gpu_data.head()

# Model inputs (X) and regression target (y), both taken from the CPU rows only
X = cpu_data.loc[:, ["Release Date", "Process Size", "Die Size", "Transistors"]]
y = cpu_data.loc[:, "Freq"]

# Build the preprocessing + model pipeline.
#
# Numeric columns: fill missing values with the column median, then
# standardize to zero mean / unit variance.
numeric_features = ["Process Size", "Die Size", "Transistors"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode.
# NOTE(review): "Release Date" is one-hot encoded, so every distinct date
# becomes its own indicator column, and with handle_unknown='ignore' a date
# that was not seen during fit is encoded as all zeros. If dates are mostly
# unique, this feature carries little signal for unseen rows -- consider a
# numeric encoding of the date instead (TODO confirm against the data).
categorical_features = ["Release Date"]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer; columns not listed
# here are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model.
# The seed is fixed so that repeated runs give the same forest and therefore
# the same evaluation score; an unseeded forest makes the metric drift
# between runs.
model = RandomForestRegressor(random_state=42)

# Create the pipeline: preprocessing is fitted together with the model, so
# imputation/scaling statistics come only from the data passed to fit().
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', model)])

# Hold out 20% of the rows for evaluation; the split is seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train on the training portion, then predict the held-out rows
# (fit() returns the fitted pipeline, so the two calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the error as RMSE, i.e. the square root of the mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error on Test Set: {rmse}")

Root Mean Squared Error on Test Set: 433.5443969594437
