In [496]:
import pandas as pd  # noqa: F401
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    LinearRegression,  # noqa: F401
    LogisticRegression,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Making a initialization of class
from src.chip_analysis.data_summary import DataProcess

## Load my dataset and put my own path here
my_file_path = 'https://raw.githubusercontent.com/Zhengnan817/Project-3-Data-Reconstruction-and-Analysis/main/src/chip_analysis/data/chip_dataset.csv'
cpu_table = DataProcess(my_file_path)
df = cpu_table.view_data()
df['Release Date'] = pd.to_datetime(df['Release Date'],format='%m/%d/%y')
columns_to_drop = ['Product','Vendor',"Process Size","Release Date","TDP"]
df_new = df.drop(columns=columns_to_drop)
gpu_data = df_new[df_new['Type'] == 'GPU']
gpu_data = gpu_data.drop(columns=['Type'])
gpu_data.head()

Unnamed: 0,Die Size,Transistors,Freq
1675,216.0,1160,100
1676,132.0,3300,1469
1677,73.0,242,600
1678,156.0,120,400
1679,420.0,720,600


In [497]:
# Define features and target variable
X = gpu_data.drop("Freq" ,axis=1)  # Features
Y = gpu_data["Freq"]  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

poly_feature_names = [f"X{feature_idx}" for feature_idx in range(X_poly_train.shape[1])]
X_poly_train_df = pd.DataFrame(X_poly_train, columns=poly_feature_names)
X_poly_test_df = pd.DataFrame(X_poly_test, columns=poly_feature_names)

In [498]:
numeric_features = ["Process Size", "TDP","Die Size"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [499]:

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Create a pipeline with a classifier
model =LinearRegression()

model

In [500]:
Pipeline_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Handle missing values before training the model
X_train_preprocessed = preprocessor.fit_transform(X_poly_train_df)
X_test_preprocessed = preprocessor.fit_transform(X_poly_test_df)

ValueError: A given column is not a column of the dataframe

In [None]:
# Fit the model on the entire training set
Pipeline_model.fit(X_poly_train, y_train)

# Predict on the test set
y_pred = Pipeline_model.predict(X_poly_test)

# # Evaluate the model on the test set
# accuracy = (y_pred == y_test).mean()
# accuracy

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
# Evaluate the model using cross-validation
cv_scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.46110309 0.50999174 0.47011403 0.45449418 0.52606019]
Mean CV score: 0.48435264537295736


In [None]:
# print("Test set accuracy:", accuracy)