# Machine Learning
### By: Adam Aharony, adam.aharony@gmail.com, 214435448.

In [1]:
import pandas as pd
import numpy as np
import eli5
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, r2_score
from IPython.core.display import HTML


In [2]:
# Read the prepared dataset from disk; the bare name on the last line makes
# the notebook render the frame as this cell's output.
DATA_PATH = "processing/ready.csv"
df = pd.read_csv(DATA_PATH)
df


Unnamed: 0,Year,Foundry,Process Size [nm],Transistors [million],Die Size [mm²],Production,GPU Clock [MHz],Memory Clock [MHz],Memory Size [GB],Memory Type,...,TDP [W],Outputs,Power Connectors,DirectX,OpenGL,Manufacturer,Cores,Shading Units,FP32 (float) performance [TFLOPS],Shader Model
0,2001,0.0,180.0,30.0,131.0,0.0,200.0,200.0,0.064,0.0,...,30.0,1.0,1.0,8.0,1.1,0,336.0,320.0,2.727746,5.1
1,2001,0.0,180.0,25.0,144.0,0.0,200.0,200.0,0.064,0.0,...,30.0,3.0,0.0,8.0,1.1,0,336.0,320.0,2.727746,5.1
2,2001,0.0,180.0,25.0,144.0,0.0,200.0,200.0,0.064,0.0,...,60.0,3.0,0.0,8.0,1.1,0,336.0,320.0,2.727746,5.1
3,2000,0.0,250.0,14.0,112.0,0.0,166.0,166.0,0.016,1.0,...,15.0,1.0,0.0,6.0,1.1,0,336.0,320.0,2.727746,5.1
4,2000,0.0,250.0,14.0,112.0,0.0,166.0,166.0,0.032,1.0,...,15.0,1.0,0.0,6.0,1.1,0,336.0,320.0,2.727746,5.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2159,2003,1.0,130.0,90.0,144.0,0.0,350.0,500.0,0.256,6.0,...,48.0,3.0,0.0,9.0,1.4,7,336.0,320.0,2.727746,5.1
2160,2003,1.0,130.0,110.0,144.0,0.0,300.0,325.0,0.256,0.0,...,48.0,3.0,0.0,9.0,1.5,7,336.0,320.0,2.727746,5.1
2161,2003,1.0,130.0,110.0,144.0,0.0,300.0,450.0,0.256,6.0,...,48.0,2.0,0.0,9.0,1.5,7,336.0,320.0,2.727746,5.1
2162,2003,1.0,130.0,110.0,144.0,0.0,350.0,375.0,0.256,0.0,...,48.0,3.0,0.0,9.0,1.5,7,336.0,320.0,2.727746,5.1


In [3]:
# Print a structural summary of the frame (index range, column names,
# non-null counts and dtypes) to check what was loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2164 entries, 0 to 2163
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Year                               2164 non-null   int64  
 1   Foundry                            2164 non-null   float64
 2   Process Size [nm]                  2164 non-null   float64
 3   Transistors [million]              2164 non-null   float64
 4   Die Size [mm²]                     2164 non-null   float64
 5   Production                         2164 non-null   float64
 6   GPU Clock [MHz]                    2164 non-null   float64
 7   Memory Clock [MHz]                 2164 non-null   float64
 8   Memory Size [GB]                   2164 non-null   float64
 9   Memory Type                        2164 non-null   float64
 10  Memory Bus [bit]                   2164 non-null   float64
 11  Bandwidth [GB/s]                   2164 non-null   float64

Defining the main procedure.

In [4]:
def procedure(df, y_col, test_size=0.25, random_state=111):
    """Fit a linear regression that predicts ``y_col`` from all other columns.

    The rows of ``df`` are split into a training and a test partition, a
    ``LinearRegression`` is fitted on the training partition, and the R²
    score of its predictions on the test partition is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding both the target column and the feature columns.
        The frame itself is not modified.
        NOTE(review): every non-target column is used as a feature, so they
        are presumably all numeric already -- confirm against the
        preprocessing step.
    y_col : str
        Name of the column to use as the regression target.
    test_size : float or int, default 0.25
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 111
        Forwarded unchanged to ``train_test_split``; a fixed value keeps the
        split (and therefore the score) reproducible between runs.

    Returns
    -------
    model : sklearn.linear_model.LinearRegression
        The estimator fitted on the training partition.
    score : float
        R² of the model's predictions on the test partition.

    Raises
    ------
    KeyError
        If ``y_col`` is not a column of ``df``.
    """
    # Separate the target from the features; the feature columns keep the
    # order they have in `df`, minus the target.
    target = df[y_col]
    features = df.drop(columns=[y_col])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score on held-out rows only, so the value reflects generalisation
    # rather than fit to the training data.
    score = r2_score(y_test, model.predict(X_test))

    return model, score


We will run the regression algorithm for each relevant column, in order to figure out the best regression variable:

In [5]:
# Candidate target columns: each one is predicted in turn from all the
# remaining columns of `df`.
cols = (
    'Transistors [million]',
    'GPU Clock [MHz]',
    'Memory Clock [MHz]',
    'Bandwidth [GB/s]',
    'TMUs',
    'ROPs',
    'Pixel Rate [GPixel/s]',
    'Texture Rate [GTexel/s]',
    'TDP [W]',
    'Outputs',
    'Power Connectors',
    'FP32 (float) performance [TFLOPS]',
)

# Maps target column name -> (fitted model, test-set score).
# Despite the name, the stored scores are R² values, not accuracies.
accs = {}
for col in cols:
    clf, score = procedure(df, col)
    accs[col] = (clf, score)

Let's see the scores for each column:

In [6]:
# Show target column -> score, ordered from the best-scoring target to the
# worst. The fitted models are dropped here; only the scores are displayed.
dict(
    sorted(
        ((name, r2) for name, (_model, r2) in accs.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

{'Texture Rate [GTexel/s]': 0.9910924181476412,
 'TMUs': 0.9844662389630248,
 'Pixel Rate [GPixel/s]': 0.9840668376637742,
 'FP32 (float) performance [TFLOPS]': 0.9796626895190147,
 'ROPs': 0.9766529904858253,
 'Bandwidth [GB/s]': 0.9337106659040911,
 'Transistors [million]': 0.799022513443884,
 'Memory Clock [MHz]': 0.7989523161102023,
 'GPU Clock [MHz]': 0.6791252801834733,
 'TDP [W]': 0.6257575335628116,
 'Power Connectors': 0.5448457340799433,
 'Outputs': 0.3012837066938109}

Let's look at the weights of a specific model:

In [7]:
# Pick the model trained for one target column and recover the names of the
# features it was fitted on: every column of `df` except the target, in the
# frame's original column order.
y_col = "Texture Rate [GTexel/s]"
selected = accs[y_col]
clf = selected[0]
score = selected[1]
feature_names = df.columns.drop(y_col)


In [8]:
# Render the selected model's coefficients, labelled with the feature names,
# limited to the 50 largest entries.
eli5.show_weights(
    clf,
    feature_names=feature_names.tolist(),
    top=50,
)

Weight?,Feature
201.487,<BIAS>
9.849,FP32 (float) performance [TFLOPS]
7.531,OpenGL
1.841,Power Connectors
1.308,Pixel Rate [GPixel/s]
1.295,Shader Model
1.037,Memory Size [GB]
0.924,TMUs
0.33,Memory Type
0.034,Bandwidth [GB/s]
