# Import Necessary Libraries

In [7]:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import time 
import random
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
sb.set() # set the default Seaborn style for graphics

# Load Nvidia Excel File Into Data Frame

In [8]:
# Load the Nvidia GPU Excel workbook into a DataFrame.
# NOTE: the path is relative to the notebook's working directory; if the
# workbook lives elsewhere on your machine, change the path on the line below.
gpu_data = pd.read_excel('Updated_GPU_Dataset279_Nvidia_gen.xlsx') 

# Display the first few rows of the dataframe to understand its structure
gpu_data.head()

Unnamed: 0,Manufacturer,Generation,Product Name,GPU URL,GPU Chip,Chip URL,Released,Year,Bus,Memory,...,L2 Cache,L3 Cache,Max. TDP,Pixel Rate,Texture Rate,FP16 (half),FP32 (float),FP64 (double),Abritary Gen,Gen Year
0,Nvidia,GeForce 100,GeForce G100 OEM,https://www.techpowerup.com/gpu-specs/geforce-...,G98S,https://www.techpowerup.com/gpu-specs/nvidia-g...,"Mar 10th, 2009",2009.0,PCIe 2.0 x16,"256 MB, DDR2, 64 bit",...,16 KB,,40 W,2.160 GPixel/s,2.160 GTexel/s,,20.80 GFLOPS,,1,2006
1,Nvidia,GeForce 100,GeForce GT 120 OEM,https://www.techpowerup.com/gpu-specs/geforce-...,G96C,https://www.techpowerup.com/gpu-specs/nvidia-g...,"Mar 10th, 2009",2009.0,PCIe 2.0 x16,"512 MB, DDR2, 128 bit",...,32 KB,,50 W,5.904 GPixel/s,11.81 GTexel/s,,117.5 GFLOPS,,1,2006
2,Nvidia,GeForce 100,GeForce GT 120 Mac Edition,https://www.techpowerup.com/gpu-specs/geforce-...,G96C,https://www.techpowerup.com/gpu-specs/nvidia-g...,"Jan 20th, 2009",2009.0,PCIe 2.0 x16,"512 MB, GDDR3, 128 bit",...,32 KB,,50 W,4.400 GPixel/s,8.800 GTexel/s,,89.60 GFLOPS,,1,2006
3,Nvidia,GeForce 100,GeForce GT 130 OEM,https://www.techpowerup.com/gpu-specs/geforce-...,G94B,https://www.techpowerup.com/gpu-specs/nvidia-g...,"Mar 10th, 2009",2009.0,PCIe 2.0 x16,"512 MB, DDR2, 192 bit",...,64 KB,,105 W,6.000 GPixel/s,12.00 GTexel/s,,120.0 GFLOPS,,1,2006
4,Nvidia,GeForce 100,GeForce GT 130 Mac Edition,https://www.techpowerup.com/gpu-specs/geforce-...,G94B,https://www.techpowerup.com/gpu-specs/nvidia-g...,"Dec 12th, 2008",2008.0,PCIe 2.0 x16,"512 MB, GDDR3, 192 bit",...,64 KB,,105 W,7.200 GPixel/s,14.40 GTexel/s,,144.0 GFLOPS,,1,2006


In [9]:
# Explore the dataset. .info() already lists every column name together with
# its non-null count and dtype, so a separate gpu_data.dtypes call is not needed.
gpu_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 42 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Manufacturer   279 non-null    object 
 1   Generation     279 non-null    object 
 2   Product Name   279 non-null    object 
 3   GPU URL        279 non-null    object 
 4   GPU Chip       279 non-null    object 
 5   Chip URL       279 non-null    object 
 6   Released       279 non-null    object 
 7   Year           277 non-null    float64
 8   Bus            279 non-null    object 
 9   Memory         279 non-null    object 
 10  GPU clock      279 non-null    object 
 11  Memory clock   279 non-null    object 
 12  Architecture   279 non-null    object 
 13  Foundry        279 non-null    object 
 14  Process Size   279 non-null    object 
 15  Transistors    279 non-null    object 
 16  Density        279 non-null    object 
 17  Die Size       279 non-null    object 
 18  DirectX   

# Helper Functions For Data Cleaning 

In [10]:
def convert_flops(value):
    """Convert a throughput string such as '20.80 GFLOPS' or '1.5 TFLOPS' to GFLOPS.

    Parameters
    ----------
    value : str or missing value
        Text made of a number followed by a 'GFLOPS' or 'TFLOPS' unit.
        Thousands separators (',') are removed before parsing, and anything
        that follows the unit (for example a trailing ratio like '(1:32)')
        is ignored.

    Returns
    -------
    float or None
        The value expressed in GFLOPS (TFLOPS are multiplied by 1000).
        None is returned when `value` is not a string (e.g. NaN/None for a
        missing cell) or when it contains neither supported unit.

    Raises
    ------
    ValueError
        If a supported unit is present but the text before it is not a
        valid number.
    """
    # Missing cells reach .apply() as NaN/None rather than str; report them as
    # "no value" instead of failing on .replace().
    if not isinstance(value, str):
        return None

    cleaned = value.replace(",", "")  # Remove thousands separators

    # Only the text before the unit is parsed, so suffixes after the unit
    # cannot break float().
    for unit, to_gflops in (("TFLOPS", 1000), ("GFLOPS", 1)):
        number, found, _ = cleaned.partition(unit)
        if found:
            return float(number.strip()) * to_gflops

    return None  # Just in case there are other formats we haven't considered

def convert_to_mb(size):
    """Convert a memory-size string such as '2 GB' or '512 MB' to megabytes.

    Parameters
    ----------
    size : str or missing value
        Text made of a number followed by a 'GB' or 'MB' unit. Whitespace
        between the number and the unit is optional.

    Returns
    -------
    float or None
        The size in MB (GB values are multiplied by 1024). None is returned
        when `size` is not a string (e.g. the NaN produced by a failed
        str.extract match) or when it contains neither unit.

    Raises
    ------
    ValueError
        If a unit is present but the text before it is not a valid number.
    """
    # Unmatched rows arrive as NaN (a float), on which the `in` test would
    # raise TypeError; treat every non-string as "no value".
    if not isinstance(size, str):
        return None

    # 'GB' is checked first, then 'MB'; only the text before the unit is parsed.
    for unit, to_mb in (("GB", 1024), ("MB", 1)):
        number, found, _ = size.partition(unit)
        if found:
            return float(number.strip()) * to_mb

    return None

# Data Cleaning 

In [11]:
# Derive clean numeric columns from the raw text columns of gpu_data.
# Every result is written to a NEW column (the raw text columns are kept), so
# this cell can be re-run safely. Unit suffixes are removed with
# regex=False so the replacement is a literal match on every pandas version.

# FP32 throughput normalised to GFLOPS via convert_flops()
gpu_data['FP32 (float) in GFLOPS'] = gpu_data['FP32 (float)'].apply(convert_flops)

# Split 'Memory' ("<size> GB|MB, <type>, <width> bit") into size, type and bus width.
# The size pattern allows an optional decimal part so that a value such as
# "1.5 GB" is captured whole instead of being picked up as "5 GB".
gpu_data[['Memory Size', 'Memory Type', 'Bus Width']] = gpu_data['Memory'].str.extract(
    r'(\d+(?:\.\d+)? (?:GB|MB)), (\w+), (\d+ bit)'
)

# Convert 'GPU clock' and 'Memory clock' to numeric values
gpu_data['GPU clock (MHz)'] = gpu_data['GPU clock'].str.replace(' MHz', '', regex=False).astype(float)
gpu_data['Memory clock (MHz)'] = gpu_data['Memory clock'].str.replace(' MHz', '', regex=False).astype(float)

# Convert 'Process Size' to numeric
gpu_data['Process Size (nm)'] = gpu_data['Process Size'].str.replace(' nm', '', regex=False).astype(float)

# Convert 'Transistors' to numeric in millions (thousands separators removed)
gpu_data['Transistors (millions)'] = (
    gpu_data['Transistors']
    .str.replace(' million', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Convert 'Density' to numeric: keep the first run of digits/dots.
# expand=False returns a Series (one capture group), which is what a single
# column assignment expects.
gpu_data['Density (M/mm²)'] = gpu_data['Density'].str.extract(r'([\d\.]+)', expand=False).astype(float)

# Convert 'Die Size' to numeric
gpu_data['Die Size (mm²)'] = gpu_data['Die Size'].str.replace(' mm²', '', regex=False).astype(float)

# Remove 'W' from 'Max. TDP' and convert to float
gpu_data['Max. TDP (W)'] = gpu_data['Max. TDP'].str.replace(' W', '', regex=False).astype(float)

# Remove 'bit' from 'Bus Width' and convert to int.
# NOTE: astype(int) raises if any row failed to match the 'Memory' pattern
# above, because the resulting NaN cannot be cast to an integer.
gpu_data['Bus Width (bits)'] = gpu_data['Bus Width'].str.replace(' bit', '', regex=False).astype(int)

# Cleaning 'Pixel Rate' and 'Texture Rate'.
# Both drop thousands separators so values of 1,000 and above parse correctly.
gpu_data['Pixel Rate (GPixel/s)'] = (
    gpu_data['Pixel Rate']
    .str.replace(' GPixel/s', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
gpu_data['Texture Rate (GTexel/s)'] = (
    gpu_data['Texture Rate']
    .str.replace(' GTexel/s', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Cleaning L2 Cache
gpu_data['L2 Cache (KB)'] = gpu_data['L2 Cache'].str.replace(' KB', '', regex=False).astype(float)

# Ensure the 'OpenGL' version column is float ('OpenCL' is not converted here)
gpu_data['OpenGL'] = gpu_data['OpenGL'].astype(float)

# Apply the convert_to_mb to the 'Memory Size' column
gpu_data['Memory Size (MB)'] = gpu_data['Memory Size'].apply(convert_to_mb)

gpu_data.dtypes

Manufacturer                object
Generation                  object
Product Name                object
GPU URL                     object
GPU Chip                    object
Chip URL                    object
Released                    object
Year                       float64
Bus                         object
Memory                      object
GPU clock                   object
Memory clock                object
Architecture                object
Foundry                     object
Process Size                object
Transistors                 object
Density                     object
Die Size                    object
DirectX                     object
OpenGL                     float64
OpenCL                     float64
Vulkan                      object
Shader Model                object
WDDM                       float64
Tensor Cores                object
RT Cores                    object
Shading Units                int64
TMUs                         int64
ROPs                

In [12]:
# Keep only the identifying columns and the cleaned numeric features.
# .copy() makes Nvidia_cleaned an independent DataFrame, so later edits to it
# do not touch gpu_data and do not trigger pandas' SettingWithCopyWarning.
Nvidia_cleaned = gpu_data[[
    'Manufacturer',
    'Foundry',
    'Architecture',
    'Abritary Gen',
    'Gen Year',
    'FP32 (float) in GFLOPS',
    'Memory Size (MB)',
    'Memory Type',
    'Bus Width (bits)',
    'GPU clock (MHz)',
    'Memory clock (MHz)',
    'Process Size (nm)',
    'Transistors (millions)',
    'Density (M/mm²)',
    'Die Size (mm²)',
    'Max. TDP (W)',
    'Pixel Rate (GPixel/s)',
    'Texture Rate (GTexel/s)',
    'L2 Cache (KB)',
    'Shading Units',
]].copy()
Nvidia_cleaned.info()
Nvidia_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Manufacturer             279 non-null    object 
 1   Foundry                  279 non-null    object 
 2   Architecture             279 non-null    object 
 3   Abritary Gen             279 non-null    int64  
 4   Gen Year                 279 non-null    int64  
 5   FP32 (float) in GFLOPS   279 non-null    float64
 6   Memory Size (MB)         279 non-null    float64
 7   Memory Type              279 non-null    object 
 8   Bus Width (bits)         279 non-null    int64  
 9   GPU clock (MHz)          279 non-null    float64
 10  Memory clock (MHz)       279 non-null    float64
 11  Process Size (nm)        279 non-null    float64
 12  Transistors (millions)   279 non-null    float64
 13  Density (M/mm²)          279 non-null    float64
 14  Die Size (mm²)           2

Unnamed: 0,Manufacturer,Foundry,Architecture,Abritary Gen,Gen Year,FP32 (float) in GFLOPS,Memory Size (MB),Memory Type,Bus Width (bits),GPU clock (MHz),Memory clock (MHz),Process Size (nm),Transistors (millions),Density (M/mm²),Die Size (mm²),Max. TDP (W),Pixel Rate (GPixel/s),Texture Rate (GTexel/s),L2 Cache (KB),Shading Units
0,Nvidia,UMC,Tesla,1,2006,20.8,256.0,DDR2,64,540.0,400.0,65.0,210.0,2.4,86.0,40.0,2.16,2.16,16.0,8
1,Nvidia,TSMC,Tesla,1,2006,117.5,512.0,DDR2,128,738.0,504.0,55.0,314.0,2.6,121.0,50.0,5.904,11.81,32.0,32
2,Nvidia,TSMC,Tesla,1,2006,89.6,512.0,GDDR3,128,550.0,800.0,55.0,314.0,2.6,121.0,50.0,4.4,8.8,32.0,32
3,Nvidia,TSMC,Tesla,1,2006,120.0,512.0,DDR2,192,500.0,500.0,55.0,505.0,2.6,196.0,105.0,6.0,12.0,64.0,64
4,Nvidia,TSMC,Tesla,1,2006,144.0,512.0,GDDR3,192,600.0,792.0,55.0,505.0,2.6,196.0,105.0,7.2,14.4,64.0,64


In [13]:
# Save the cleaned table as 'Nvidia_cleaned.csv' in the working directory.
# NOTE: to_csv writes the row index by default, so the file's first column is
# the unnamed index; pass index_col=0 to pd.read_csv when loading it back,
# otherwise it shows up as an 'Unnamed: 0' column.
Nvidia_cleaned.to_csv('Nvidia_cleaned.csv')