In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
# Apply Seaborn's default style to the graphics produced in this notebook.
sb.set()

## Data Cleaning

In [2]:
# Load the AMD GPU spreadsheet; the path has no directory part, so the file
# must sit in the notebook's working directory.
AMD_data = pd.read_excel('Updated_GPU_Dataset218_AMD_Gen.xlsx')
# Preview the first rows to sanity-check the load.
AMD_data.head()

Unnamed: 0,Manufacturer,Generation,Product Name,GPU URL,GPU Chip,Chip URL,Released,Year,Bus,Memory,...,L2 Cache,L3 Cache,Max. TDP,Pixel Rate,Texture Rate,FP16 (half),FP32 (float),FP64 (double),Abritary Gen,Gen Year
0,AMD,Northern Islands (HD 6200),Radeon HD 6230,https://www.techpowerup.com/gpu-specs/radeon-h...,Park,https://www.techpowerup.com/gpu-specs/amd-park...,"Jul 2nd, 2011",2011,PCIe 2.0 x16,"512 MB, DDR2, 64 bit",...,128 KB,,19 W,2.600 GPixel/s,5.200 GTexel/s,,104.0 GFLOPS,,1,2009
1,AMD,Northern Islands (HD 6200),Radeon HD 6250,https://www.techpowerup.com/gpu-specs/radeon-h...,Cedar,https://www.techpowerup.com/gpu-specs/amd-ceda...,"Jan 31st, 2011",2011,PCIe 2.0 x16,"512 MB, GDDR3, 64 bit",...,128 KB,,25 W,2.600 GPixel/s,5.200 GTexel/s,,104.0 GFLOPS,,1,2009
2,AMD,Northern Islands (HD 6200),Radeon HD 6290,https://www.techpowerup.com/gpu-specs/radeon-h...,Cedar,https://www.techpowerup.com/gpu-specs/amd-ceda...,"Dec 4th, 2011",2011,PCIe 2.0 x16,"1024 MB, GDDR3, 64 bit",...,128 KB,,25 W,2.600 GPixel/s,5.200 GTexel/s,,104.0 GFLOPS,,1,2009
3,AMD,Northern Islands (HD 6300),Radeon HD 6350,https://www.techpowerup.com/gpu-specs/radeon-h...,Cedar,https://www.techpowerup.com/gpu-specs/amd-ceda...,"Feb 7th, 2011",2011,PCIe 2.0 x16,"512 MB, GDDR3, 64 bit",...,128 KB,,25 W,2.600 GPixel/s,5.200 GTexel/s,,104.0 GFLOPS,,1,2009
4,AMD,Northern Islands (HD 6300),Radeon HD 6390,https://www.techpowerup.com/gpu-specs/radeon-h...,Redwood,https://www.techpowerup.com/gpu-specs/amd-redw...,"Jul 4th, 2011",2011,PCIe 2.0 x16,"1024 MB, DDR2, 128 bit",...,256 KB,,39 W,4.400 GPixel/s,8.800 GTexel/s,,352.0 GFLOPS,,1,2009


In [3]:
# Confirm that the loaded object is a pandas DataFrame.
print("Data type:", type(AMD_data))

Data type: <class 'pandas.core.frame.DataFrame'>


In [4]:
# Report the raw table's dimensions as (rows, columns).
print("Data dims : ", AMD_data.shape)

Data dims :  (218, 43)


In [5]:
# List the dtype of every column; columns reported as `object` hold text
# (values with unit suffixes) and are parsed into numbers in the cleaning cell.
print(AMD_data.dtypes)

Manufacturer              object
Generation                object
Product Name              object
GPU URL                   object
GPU Chip                  object
Chip URL                  object
Released                  object
Year                       int64
Bus                       object
Memory                    object
GPU clock                 object
Memory clock              object
Shaders / TMUs / ROPs     object
Architecture              object
Foundry                   object
Process Size              object
Transistors               object
Density                   object
Die Size                  object
DirectX                   object
OpenGL                   float64
OpenCL                    object
Vulkan                    object
Shader Model              object
WDDM                     float64
Tensor Cores             float64
RT Cores                 float64
Shading Units              int64
TMUs                       int64
ROPs                       int64
SM Count  

In [6]:
def convert_flops(value):
    """Convert a FLOPS string such as ``"104.0 GFLOPS"`` to a float in GFLOPS.

    Parameters
    ----------
    value : str or missing
        Text holding a number followed by ``GFLOPS`` or ``TFLOPS``; thousands
        separators (``,``) are allowed. Any non-string input (``None``, a
        float ``NaN`` standing in for an empty cell, ...) is treated as missing.

    Returns
    -------
    float or None
        The value expressed in GFLOPS (TFLOPS values are multiplied by 1000),
        or ``None`` when the input is missing or its unit is not recognised.
    """
    # Missing cells are not strings and have no .replace(); report them as
    # missing instead of raising AttributeError in the middle of an .apply().
    if not isinstance(value, str):
        return None

    value = value.replace(",", "")  # Remove thousands separators
    if 'TFLOPS' in value:
        return float(value.replace('TFLOPS', '').strip()) * 1000
    elif 'GFLOPS' in value:
        return float(value.replace('GFLOPS', '').strip())
    else:
        return None  # Just in case there are other formats we haven't considered

# Convert every 'FP32 (float)' string to a float expressed in GFLOPS.
AMD_data['FP32 (float) in GFLOPS'] = AMD_data['FP32 (float)'].apply(convert_flops)

# Split 'Memory' (e.g. "512 MB, DDR2, 64 bit") into size, type, and bus width.
# The size group accepts an optional decimal part so that a value such as
# "1.5 GB" is captured whole rather than being silently cut down to "5 GB".
AMD_data[['Memory Size', 'Memory Type', 'Bus Width']] = AMD_data['Memory'].str.extract(
    r'(\d+(?:\.\d+)? (?:GB|MB)), (\w+), (\d+ bit)'
)

# All unit suffixes below are stripped with regex=False: they are plain text,
# and being explicit keeps the result independent of the pandas version's
# default for `regex`.

# Convert 'GPU clock' and 'Memory clock' to numeric values
AMD_data['GPU clock (MHz)'] = AMD_data['GPU clock'].str.replace(' MHz', '', regex=False).astype(float)
AMD_data['Memory clock (MHz)'] = AMD_data['Memory clock'].str.replace(' MHz', '', regex=False).astype(float)

# Convert 'Process Size' to numeric
AMD_data['Process Size (nm)'] = AMD_data['Process Size'].str.replace(' nm', '', regex=False).astype(float)

# Convert 'Transistors' to numeric in millions (drop the unit, then the thousands separators)
AMD_data['Transistors (millions)'] = (
    AMD_data['Transistors']
    .str.replace(' million', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Convert 'Density' to numeric by keeping the first run of digits/dots
AMD_data['Density (M/mm²)'] = AMD_data['Density'].str.extract(r'([\d\.]+)').astype(float)

# Convert 'Die Size' to numeric
AMD_data['Die Size (mm²)'] = AMD_data['Die Size'].str.replace(' mm²', '', regex=False).astype(float)

# Remove 'W' from 'Max. TDP' and convert to float
AMD_data['Max. TDP (W)'] = AMD_data['Max. TDP'].str.replace(' W', '', regex=False).astype(float)

# Remove 'bit' from 'Bus Width' and convert to int
# (astype(int) requires every row of 'Memory' to have matched the pattern above)
AMD_data['Bus Width (bits)'] = AMD_data['Bus Width'].str.replace(' bit', '', regex=False).astype(int)

# Cleaning 'Pixel Rate' and 'Texture Rate'
AMD_data['Pixel Rate (GPixel/s)'] = AMD_data['Pixel Rate'].str.replace(' GPixel/s', '', regex=False).astype(float)
AMD_data['Texture Rate (GTexel/s)'] = (
    AMD_data['Texture Rate']
    .str.replace(' GTexel/s', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Cleaning L2 Cache
AMD_data['L2 Cache (KB)'] = AMD_data['L2 Cache'].str.replace(' KB', '', regex=False).astype(float)

def convert_to_mb(size):
    """Convert a memory-size string such as ``"2 GB"`` or ``"512 MB"`` to megabytes.

    Parameters
    ----------
    size : str or missing
        Text holding a number followed by ``GB`` or ``MB``. Any non-string
        input (``None``, a float ``NaN`` from an unmatched extract, ...) is
        treated as missing.

    Returns
    -------
    float or None
        The size in MB (GB values are multiplied by 1024), or ``None`` when
        the input is missing or carries neither unit.
    """
    # `'GB' in size` raises TypeError on NaN/None, so screen out non-strings first.
    if not isinstance(size, str):
        return None

    if 'GB' in size:
        # Extract the number and multiply by 1024 to convert GB to MB
        return float(size.replace('GB', '').strip()) * 1024
    elif 'MB' in size:
        # Just convert MB to float
        return float(size.replace('MB', '').strip())
    return None

# Convert each extracted 'Memory Size' string (e.g. "512 MB") to a number of megabytes.
AMD_data['Memory Size (MB)'] = AMD_data['Memory Size'].apply(convert_to_mb)

# Show the dtypes again so the newly added columns can be checked.
AMD_data.dtypes

Manufacturer                object
Generation                  object
Product Name                object
GPU URL                     object
GPU Chip                    object
Chip URL                    object
Released                    object
Year                         int64
Bus                         object
Memory                      object
GPU clock                   object
Memory clock                object
Shaders / TMUs / ROPs       object
Architecture                object
Foundry                     object
Process Size                object
Transistors                 object
Density                     object
Die Size                    object
DirectX                     object
OpenGL                     float64
OpenCL                      object
Vulkan                      object
Shader Model                object
WDDM                       float64
Tensor Cores               float64
RT Cores                   float64
Shading Units                int64
TMUs                

In [7]:
# Keep only the descriptive columns and the parsed numeric features,
# in the order they should appear in the cleaned table.
AMD_cleaned = AMD_data[[
    # identification / lineage
    'Manufacturer',
    'Foundry',
    'Architecture',
    'Abritary Gen',
    'Gen Year',
    # compute and memory
    'FP32 (float) in GFLOPS',
    'Memory Size (MB)',
    'Memory Type',
    'Bus Width (bits)',
    'GPU clock (MHz)',
    'Memory clock (MHz)',
    # silicon
    'Process Size (nm)',
    'Transistors (millions)',
    'Density (M/mm²)',
    'Die Size (mm²)',
    # power and throughput
    'Max. TDP (W)',
    'Pixel Rate (GPixel/s)',
    'Texture Rate (GTexel/s)',
    'L2 Cache (KB)',
    'Shading Units',
]]
# Summarise non-null counts and dtypes of the selected columns.
AMD_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Manufacturer             218 non-null    object 
 1   Foundry                  218 non-null    object 
 2   Architecture             218 non-null    object 
 3   Abritary Gen             218 non-null    int64  
 4   Gen Year                 218 non-null    int64  
 5   FP32 (float) in GFLOPS   218 non-null    float64
 6   Memory Size (MB)         218 non-null    float64
 7   Memory Type              218 non-null    object 
 8   Bus Width (bits)         218 non-null    int64  
 9   GPU clock (MHz)          218 non-null    float64
 10  Memory clock (MHz)       218 non-null    float64
 11  Process Size (nm)        218 non-null    float64
 12  Transistors (millions)   218 non-null    float64
 13  Density (M/mm²)          218 non-null    float64
 14  Die Size (mm²)           2

In [8]:
# Write the cleaned table to a CSV file in the working directory.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default, so readers need index_col=0 (or this call needs index=False) to
# avoid an extra 'Unnamed: 0' column — confirm what downstream code expects.
AMD_cleaned.to_csv('AMD_cleaned.csv')