# Data Processing
### By: Adam Aharony, adam.aharony@gmail.com, 214435448.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

Merging the data of all manufacturers into one DataFrame

In [None]:
folder = "manufacturers"
manufacturers = ("3dfx", "AMD", "ATI", "Intel", "Matrox", "NVIDIA", "Sony", "XGI")

# Load every manufacturer's CSV, tag its rows with the manufacturer name and
# stack the frames. Concatenating whole frames keeps each column's dtype and
# avoids rebuilding the table one row-dict at a time via iterrows().
frames = []
for manufacturer in manufacturers:
    frame = pd.read_csv(f"{folder}/{manufacturer}.csv")
    frame["Manufacturer"] = manufacturer
    frames.append(frame)

# Columns missing from some files are filled with NaN; a fresh 0..n-1 index
# is generated, as in a frame built from a list of row dicts.
df = pd.concat(frames, ignore_index=True, sort=False)

Dropping duplicate columns

In [None]:
# Display the column labels of the merged frame.
df.columns

In [None]:
# Upper-case columns (plus 'Current Price') removed from the merged frame.
cols = [
    'GRAPHICS PROCESSOR',
    'PIXEL SHADERS',
    'VERTEX SHADERS',
    'TMUS',
    'ROPS',
    'MEMORY SIZE',
    'MEMORY TYPE',
    'BUS WIDTH',
    'Current Price',
]
# Drop them from `df` in place.
df.drop(labels=cols, axis="columns", inplace=True)

In [None]:
# Checkpoint: write the frame to raw_data.csv without the index column.
df.to_csv("raw_data.csv", index=False)

In [None]:
# Reload the frame from the raw_data.csv checkpoint.
df = pd.read_csv("raw_data.csv")

### Data Exploration:

In [None]:
# Display the column labels together with the shape of the column index
# (i.e. how many columns there are).
df.columns, df.columns.shape

In [None]:
# Display pandas' summary statistics for the frame.
df.describe()

In [None]:
# Display the frame itself.
df

### Data Manipulation:

Keeping only the columns in which more than 60% of the values are non-null (i.e. removing columns with 40% or more null values)

In [None]:
# Minimum fraction of non-null entries a column must exceed to be kept.
ratio = 0.6

N = df.shape[0]
# Per-column count of non-null entries.
notnull = df.notnull().sum()

# Columns whose non-null count is strictly above ratio * N.
cols = notnull.loc[notnull > ratio * N]
df.loc[:, cols.index].to_csv("processing/clean_cols.csv", index=False)

Removing unnecessary columns for regression

In [None]:
# Reload the column-filtered frame from its checkpoint file.
df = pd.read_csv("processing/clean_cols.csv")

In [None]:
# Columns dropped before regression.
unnecessary = ["Name", "Link", "GPU Name", "Release Date", "Architecture",
               "Generation", "Predecessor", "Bus Interface", "GPU Variant"]
df = df.drop(columns=unnecessary)

# Keep only rows in which fewer than half of the remaining columns are null.
# The filtered frame must be assigned back: the bare indexing expression on
# its own built the filtered frame and discarded it, so no row was removed.
df = df[df.isnull().sum(axis=1) < len(df.columns) / 2]

# Overwrite the checkpoint with the reduced frame.
df.to_csv("processing/clean_cols.csv", index=False)

In [None]:
# Display the column labels that remain.
df.columns

Converting data with units to simple numbers

In [None]:
# Columns whose raw values are inspected below.
cols = (
    "Process Size", "Transistors", "Die Size", "GPU Clock",
    "Memory Clock", "Memory Size", "Memory Bus", "Bandwidth",
    "Pixel Rate", "Texture Rate", "TDP", "FP32 (float) performance",
)
special_cols = ("Length", "Width", "Height", "Weight", "Reviews")

# Print every distinct raw value per column. The four trailing newlines
# reproduce a normal line end followed by a separate print("\n\n").
for col in cols:
    print(col, df[col].unique(), end="\n" * 4)

In [None]:
# Start the unit conversion from the clean_cols.csv checkpoint.
df = pd.read_csv("processing/clean_cols.csv")

In [None]:
# Columns that carry a single unit, paired with that unit. zip() transposes
# the pairs into the parallel tuples `cols` and `units`.
cols, units = zip(
    ("Process Size", "nm"),
    ("Transistors", "million"),
    ("Die Size", "mm²"),
    ("GPU Clock", "MHz"),
    ("TDP", "W"),
)
# Columns whose entries mix several units and get a dedicated converter.
mixed_cols = (
    "Bandwidth",
    "Pixel Rate",
    "Texture Rate",
    "FP32 (float) performance",
    "Memory Size",
    "Memory Bus",
)
# "Memory Clock" is a special case with its own converter.


def memory_clock_unit_remap(x):
    """Turn a raw "Memory Clock" entry into a bare number.

    Missing values and "unknown" become NaN, "System Shared" becomes 0, and
    any other value is reduced to the text before its first space, parsed as
    float32. If anything goes wrong the error and the value are printed and
    None is returned.
    """
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x == "System Shared":
            return 0
        number_text = x[:x.index(" ")]
        return np.float32(number_text)
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Memory Clock" entries with their parsed numbers, then
# record the unit in the column name.
df["Memory Clock"] = df["Memory Clock"].map(memory_clock_unit_remap)
df.rename(columns={"Memory Clock": "Memory Clock [MHz]"}, inplace=True)


def memory_size_unit_remap(x):
    """Turn a raw "Memory Size" entry into a number of GB.

    Missing values and "unknown" become NaN and "System Shared" becomes 0.
    Otherwise the entry is split at its first space into a number and a unit:
    "MB" is scaled by 0.001, "KB" by 1e-6 and "GB" is returned as parsed.
    Any other unit yields None. Errors are printed together with the value
    and also yield None.
    """
    # (unit, multiplier) pairs; None means the number is returned unscaled.
    scales = (("MB", 0.001), ("GB", None), ("KB", 1e-6))
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x == "System Shared":
            return 0
        cut = x.index(" ")
        suffix = x[cut + 1:]
        for name, factor in scales:
            if suffix == name:
                amount = np.float32(x[:cut])
                return amount if factor is None else amount * factor
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Memory Size" entries with their converted numbers, then
# record the unit in the column name.
df["Memory Size"] = df["Memory Size"].map(memory_size_unit_remap)
df.rename(columns={"Memory Size": "Memory Size [GB]"}, inplace=True)


def bandwidth_unit_remap(x):
    """Turn a raw "Bandwidth" entry into a number of GB/s.

    Missing values and "unknown" become NaN; "System Shared" and
    "System Dependent" become 0. Otherwise commas are removed and the entry
    is split at its first space into a number and a unit: "MB/s" is scaled
    by 0.001, "KB/s" by 1e-6 and "GB/s" is returned as parsed. Any other unit
    yields None. Errors are printed together with the value and also yield
    None.
    """
    # (unit, multiplier) pairs; None means the number is returned unscaled.
    scales = (("MB/s", 0.001), ("GB/s", None), ("KB/s", 1e-6))
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x in ("System Shared", "System Dependent"):
            return 0
        x = x.replace(",", "")
        cut = x.index(" ")
        suffix = x[cut + 1:]
        for name, factor in scales:
            if suffix == name:
                amount = np.float32(x[:cut])
                return amount if factor is None else amount * factor
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Bandwidth" entries with their converted numbers, then
# record the unit in the column name.
df["Bandwidth"] = df["Bandwidth"].map(bandwidth_unit_remap)
df.rename(columns={"Bandwidth": "Bandwidth [GB/s]"}, inplace=True)


def pixel_rate_unit_remap(x):
    """Turn a raw "Pixel Rate" entry into a number of GPixel/s.

    Missing values and "unknown" become NaN; "System Shared" and
    "System Dependent" become 0. Otherwise commas are removed and the entry
    is split at its first space into a number and a unit: "MPixel/s" is
    scaled by 0.001, "KPixel/s" by 1e-6 and "GPixel/s" is returned as parsed.
    Any other unit yields None. Errors are printed together with the value
    and also yield None.
    """
    # (unit, multiplier) pairs; None means the number is returned unscaled.
    scales = (("MPixel/s", 0.001), ("GPixel/s", None), ("KPixel/s", 1e-6))
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x in ("System Shared", "System Dependent"):
            return 0
        x = x.replace(",", "")
        cut = x.index(" ")
        suffix = x[cut + 1:]
        for name, factor in scales:
            if suffix == name:
                amount = np.float32(x[:cut])
                return amount if factor is None else amount * factor
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Pixel Rate" entries with their converted numbers, then
# record the unit in the column name.
df["Pixel Rate"] = df["Pixel Rate"].map(pixel_rate_unit_remap)
df.rename(columns={"Pixel Rate": "Pixel Rate [GPixel/s]"}, inplace=True)


def texture_rate_unit_remap(x):
    """Turn a raw "Texture Rate" entry into a number of GTexel/s.

    Missing values and "unknown" become NaN; "System Shared" and
    "System Dependent" become 0. Otherwise commas are removed and the entry
    is split at its first space into a number and a unit: "MTexel/s" is
    scaled by 0.001, "KTexel/s" by 1e-6 and "GTexel/s" is returned as parsed.
    Any other unit yields None. Errors are printed together with the value
    and also yield None.
    """
    # (unit, multiplier) pairs; None means the number is returned unscaled.
    scales = (("MTexel/s", 0.001), ("GTexel/s", None), ("KTexel/s", 1e-6))
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x in ("System Shared", "System Dependent"):
            return 0
        x = x.replace(",", "")
        cut = x.index(" ")
        suffix = x[cut + 1:]
        for name, factor in scales:
            if suffix == name:
                amount = np.float32(x[:cut])
                return amount if factor is None else amount * factor
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Texture Rate" entries with their converted numbers, then
# record the unit in the column name.
df["Texture Rate"] = df["Texture Rate"].map(texture_rate_unit_remap)
df.rename(columns={"Texture Rate": "Texture Rate [GTexel/s]"}, inplace=True)


def opengl_remap(x):
    """Turn a raw "OpenGL" entry into a numeric version.

    Rules, applied in order:
      * missing values and "unknown" -> NaN
      * "System Shared", "System Dependent" and "None" -> 0
      * "ES <version>" -> the text after the "ES " prefix, as float32
      * a three-character value containing "." (e.g. "4.6") -> float32
      * anything else -> the text before the first space, as float32

    A value that fits none of these (an already-numeric value, or a bare
    number such as "4" that has no space) raises inside the rules; the error
    is printed and the whole value is converted directly instead. If that
    direct conversion fails as well, NaN is returned rather than letting the
    error escape from the handler and abort the whole column mapping.
    """
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        elif x in ("System Shared", "System Dependent", "None"):
            return 0
        elif x[:2] == "ES":
            return np.float32(x[3:])
        elif len(x) == 3 and "." in x:
            return np.float32(x)
        return np.float32(x[:x.index(" ")])
    except Exception as e:
        print(e, x)
        # Fallback for numeric or space-less values; it must not raise.
        try:
            return np.float32(x)
        except (TypeError, ValueError):
            return np.nan


# Replace the raw "OpenGL" entries with their numeric versions.
df["OpenGL"] = df["OpenGL"].map(opengl_remap)


def directx_remap(x):
    """Turn a raw "DirectX" entry into a numeric version.

    Rules, applied in order:
      * a float (including NaN) is returned unchanged
      * missing values and "unknown" -> NaN
      * "System Shared" and "System Dependent" -> 0
      * a value containing "." -> everything up to and including the first
        character after the first "." (e.g. "9.0c" -> 9.0), as float32
      * a value containing a space -> the text before the first space
      * any other value (e.g. a bare "12") -> converted directly to float32

    A value that cannot be parsed has its error printed and becomes NaN, so
    every unparseable entry is reported and marked as missing.
    """
    try:
        if isinstance(x, float):
            return x
        if pd.isna(x) or x == "unknown":
            return np.nan
        elif x == "System Shared" or x == "System Dependent":
            return 0
        if "." in x:
            thresh = x.index(".")
            return np.float32(x[:thresh+2])
        elif " " in x:
            thresh = x.index(" ")
            return np.float32(x[:thresh])
        # No "." and no space: the whole value is the version number.
        return np.float32(x)

    except Exception as e:
        print(e, x)
        return np.nan


# Replace the raw "DirectX" entries with their numeric versions.
df["DirectX"] = df["DirectX"].map(directx_remap)


def cores_remap(x):
    """Turn a raw "CORES" entry into a single number.

    A float is returned unchanged. Missing values and "unknown" become NaN;
    "System Shared" and "System Dependent" become 0. A value containing "x"
    is read as two factors: the text before the first space, and the text
    starting two characters after that space; their float32 product is
    returned. Any other value is converted to float32 as a whole. Errors are
    printed together with the value and yield None.
    """
    try:
        if isinstance(x, float):
            return x
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x in ("System Shared", "System Dependent"):
            return 0
        if "x" not in x:
            return np.float32(x)
        gap = x.index(" ")
        left, right = x[:gap], x[gap + 2:]
        return np.float32(left) * np.float32(right)
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "CORES" entries with their numeric values, then rename the
# column to "Cores".
df["CORES"] = df["CORES"].map(cores_remap)
df.rename(columns={"CORES": "Cores"}, inplace=True)


def outputs_remap(x):
    """Turn a raw "Outputs" entry into a total output count.

    Missing values and "unknown" become NaN and "No outputs" becomes 0.
    Otherwise newlines are treated as spaces, every space-separated token
    containing "x" has its "x" characters removed and is parsed as float32,
    and the parsed numbers are summed. Errors are printed together with the
    (newline-normalised) value and yield None.
    """
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x == "No outputs":
            return 0
        x = x.replace("\n", " ")
        counts = [
            np.float32(token.replace("x", ""))
            for token in x.split(" ")
            if "x" in token
        ]
        return np.sum(counts)
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Outputs" entries with their summed output counts.
df["Outputs"] = df["Outputs"].map(outputs_remap)


def power_connectors_remap(x):
    """Turn a raw "Power Connectors" entry into a connector count.

    Missing values and "unknown" become NaN, "None" becomes 0 and "Floppy"
    becomes 1. Otherwise " + " separators are treated as spaces, every
    two-character space-separated token containing "x" has its "x" removed
    and is parsed as float32, and the parsed numbers are summed. Errors are
    printed together with the (separator-normalised) value and yield None.
    """
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x == "None":
            return 0
        if x == "Floppy":
            return 1
        x = x.replace(" + ", " ")
        counts = [
            np.float32(token.replace("x", ""))
            for token in x.split(" ")
            if "x" in token and len(token) == 2
        ]
        return np.sum(counts)
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Power Connectors" entries with their connector counts.
df["Power Connectors"] = df["Power Connectors"].map(power_connectors_remap)


def fp32_unit_remap(x):
    """Turn a raw "FP32 (float) performance" entry into a number of TFLOPS.

    Missing values and "unknown" become NaN; "System Shared" and
    "System Dependent" become 0. Otherwise commas are removed and the entry
    is split at its first space into a number and a unit: "GFLOPS" is scaled
    by 0.001 and "TFLOPS" is returned as parsed. Any other unit yields None.
    Errors are printed together with the value and also yield None.
    """
    # (unit, multiplier) pairs; None means the number is returned unscaled.
    scales = (("GFLOPS", 0.001), ("TFLOPS", None))
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x in ("System Shared", "System Dependent"):
            return 0
        x = x.replace(",", "")
        cut = x.index(" ")
        suffix = x[cut + 1:]
        for name, factor in scales:
            if suffix == name:
                amount = np.float32(x[:cut])
                return amount if factor is None else amount * factor
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "FP32 (float) performance" entries with their converted
# numbers, then record the unit in the column name.
df["FP32 (float) performance"] = df["FP32 (float) performance"].map(
    fp32_unit_remap)
df.rename(columns={
          "FP32 (float) performance": "FP32 (float) performance [TFLOPS]"
          }, inplace=True)


def memory_bus_unit_remap(x):
    """Turn a raw "Memory Bus" entry into a bare number.

    Missing values and "unknown" become NaN, "System Shared" becomes 0, and
    any other value is reduced to the text before its first space, parsed as
    float32 (the unit after the space is ignored). Errors are printed
    together with the value and yield None.
    """
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        if x == "System Shared":
            return 0
        width_text = x[:x.index(" ")]
        return np.float32(width_text)
    except Exception as err:
        print(err, x)
    return None


# Replace the raw "Memory Bus" entries with their parsed numbers, then
# record the unit in the column name.
df["Memory Bus"] = df["Memory Bus"].map(memory_bus_unit_remap)
df.rename(columns={"Memory Bus": "Memory Bus [bit]"}, inplace=True)


def unit_remap(x):
    """Strip the unit from a "<number> <unit>" entry.

    Missing values and "unknown" become NaN. Otherwise commas are removed
    and the text before the first space is parsed as float32; whatever
    follows the space is ignored. Errors are printed together with the
    (comma-stripped) value and yield None.
    """
    try:
        if pd.isna(x) or x == "unknown":
            return np.nan
        x = x.replace(",", "")
        return np.float32(x[:x.index(" ")])
    except Exception as err:
        print(err, x)
    return None


# Strip the unit from every single-unit column ...
for col, unit in zip(cols, units):
    df[col] = df[col].map(unit_remap)

# ... then move each unit into its column's name, e.g. "TDP" -> "TDP [W]".
df.rename(
    columns={col: f"{col} [{unit}]" for col, unit in zip(cols, units)},
    inplace=True,
)


Converting categorical columns to numbers

In [None]:
# Names of the categorical columns.
# NOTE(review): not referenced again in the visible notebook; the encoding
# cell further down defines its own `categorical` tuple with the same names.
categorical_cols = ("Foundry", "Production", "Memory Type", "Slot Width", "Manufacturer")

In [None]:
# Print pandas' per-column summary of the frame.
df.info()

In [None]:
# Checkpoint: write the unit-converted frame without the index column.
df.to_csv("processing/units.csv", index=False)

In [None]:
# Reload the frame from the units.csv checkpoint.
df = pd.read_csv("processing/units.csv")

In [None]:
# Display the reloaded frame.
df

### Regarding Columns as Numeric:

Converting categorical columns to numeric values.

In [None]:
categorical = ("Foundry", "Production", "Memory Type", "Slot Width", "Manufacturer")

# val_dict[column] maps each distinct non-null value of that column to an
# integer code, numbered in order of first appearance.
val_dict = dict()
for col in categorical:
    unique = df[col].unique()
    unique = unique[~pd.isna(unique)]
    val_dict[col] = {u: i for i, u in enumerate(unique)}
    # Assign the encoded column back to the frame. Calling an inplace method
    # on `df[col]` acts on a temporary Series and leaves `df` unchanged when
    # pandas Copy-on-Write is active. Nulls are not in the mapping and stay
    # NaN.
    df[col] = df[col].map(val_dict[col])

In [None]:
# Display the value-to-code mapping built for each categorical column.
val_dict

### Correlation matrix:

In [None]:
# Plot the pairwise column correlations computed by df.corr() as a heatmap.
sns.heatmap(df.corr())

In [None]:
# Checkpoint: write the numerically encoded frame without the index column.
df.to_csv("processing/numeric.csv", index=False)

### Filling Null Values:

For categorical columns, we use the most common value (mode) as the filler; for integer-valued columns, we use the median; and for the remaining columns, we use the average value (mean).

In [None]:
# Reload the frame from the numeric.csv checkpoint.
df = pd.read_csv("processing/numeric.csv")

In [None]:
# Columns grouped by the statistic used to fill their null values.
mode = ("Foundry", "Production", "Memory Type", "Slot Width", "Manufacturer")
median = ("Process Size [nm]", "Year", "Die Size [mm²]", "Memory Bus [bit]",
          "TMUs", "ROPs", "TDP [W]", "Outputs", "Power Connectors", "DirectX",
          "OpenGL", "Cores", "Shading Units", "Shader Model")
mean = ("Transistors [million]", "GPU Clock [MHz]", "Memory Clock [MHz]",
        "Memory Size [GB]", "Bandwidth [GB/s]", "Pixel Rate [GPixel/s]",
        "Texture Rate [GTexel/s]", "FP32 (float) performance [TFLOPS]")

# Each filled column is assigned back to the frame. Calling
# fillna(..., inplace=True) on `df[col]` acts on a temporary Series and
# leaves `df` unchanged when pandas Copy-on-Write is active.
for col in mode:
    # mode() returns the most frequent value(s); the first one is used.
    df[col] = df[col].fillna(df[col].mode()[0])
for col in median:
    df[col] = df[col].fillna(df[col].median())
for col in mean:
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Write the fully processed frame to ready.csv without the index column.
df.to_csv("processing/ready.csv", index=False)