In [1]:
#import statements
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve, KFold
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
import random
from sklearn.svm import SVC
import sklearn.metrics as sk
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [3]:
# Load the raw measurements from sgemm_product.csv.
df1 = pd.read_csv('sgemm_product.csv', sep=',')
# Work on a 40% random sample to keep later computation fast. The fixed seed
# makes the sample (and therefore every downstream result) reproducible after
# a kernel restart; without it each run analyses a different subset.
df = df1.sample(frac=0.4, random_state=0)
df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM,STRN,SA,SB,Run1 (ms),Run2 (ms),Run3 (ms),Run4 (ms)
28798,32,32,32,8,32,8,32,2,2,1,1,1,1,0,41.63,41.47,41.62,41.64
108444,64,64,32,32,8,8,16,8,1,2,1,1,0,0,40.9,40.87,41.39,40.93
219762,128,128,32,8,8,32,8,2,4,2,0,0,1,0,1748.81,1757.98,1751.86,1731.7
187540,128,64,32,8,16,32,32,8,1,2,0,1,0,0,311.3,310.2,309.74,310.33
64955,32,128,32,32,8,8,32,8,1,4,1,0,1,1,33.42,35.02,33.81,33.4


In [None]:
# Target variable: Runtime is the row-wise mean of the four measured runs.
run_columns = ['Run1 (ms)', 'Run2 (ms)', 'Run3 (ms)', 'Run4 (ms)']
df['Runtime'] = df.loc[:, run_columns].mean(axis='columns')

In [None]:
# Preview the first five rows of df.
df.head(5)

In [None]:
# The individual run measurements are dropped; every other column is kept.
# Note that this rebinds df1, which previously held the frame read from the CSV.
df1 = df.drop(columns=['Run1 (ms)', 'Run2 (ms)', 'Run3 (ms)', 'Run4 (ms)'])
df1.info()

In [None]:
# Summary statistics, transposed so each variable is shown as one row.
df1.describe().transpose()

In [None]:
# Count missing values per column (author's observation: no NULL values).
df1.isna().sum()

In [None]:
#checking for outliers
plt.figure(figsize=(10,6))
# Pass the series by keyword: seaborn only accepts `data` positionally from
# v0.12 on (v0.11 warns), so a positional Series is interpreted differently
# depending on the installed version. `x=` always gives a horizontal box plot.
sns.boxplot(x=df1['Runtime']);

In [None]:
#removing outliers: keep rows whose Runtime lies strictly inside the 1.5*IQR fences
Q1 = df1['Runtime'].quantile(0.25)
# NOTE: despite its name, Q2 holds the 75th percentile (third quartile);
# the name is kept so any later cell that refers to it keeps working.
Q2 = df1['Runtime'].quantile(0.75)
IQR = Q2 - Q1
LL = Q1 - 1.5*IQR   # lower fence
UL = Q2 + 1.5*IQR   # upper fence
# .copy() gives df2 its own data. Later cells overwrite and add columns on df2;
# without the copy those writes hit a slice of df1 and raise SettingWithCopyWarning.
df2 = df1[(df1.Runtime > LL) & (df1.Runtime < UL)].copy()
df2.describe().T

In [None]:
# Box plot of Runtime after the outlier filter.
plt.figure(figsize=(10,6))
# Keyword `x=` keeps the plot horizontal on every seaborn version; a positional
# Series is deprecated in v0.11 and treated as `data` from v0.12 on.
sns.boxplot(x=df2['Runtime']);

In [None]:
#checking variable distribution
# Z-score the first 10 columns (sample std, ddof=1), then plot a histogram of
# every column in df2.
# NOTE: this overwrites those columns in df2, so all later cells see the
# standardized values rather than the raw ones.
# The columns are replaced as a whole instead of being written one at a time
# through .iloc: writing float z-scores into integer columns via .iloc is an
# incompatible-dtype assignment that newer pandas versions warn about or reject.
std_cols = df2.columns[:10]
df2[std_cols] = (df2[std_cols] - df2[std_cols].mean()) / df2[std_cols].std()
df2.hist(figsize=(14, 16));

In [None]:
#plotting the distribution of Runtime
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is its documented replacement and draws the equivalent figure.
sns.histplot(x=df2['Runtime'], kde=True, stat='density');

In [None]:
# Store the natural log of Runtime in a new 'target' column and plot its
# distribution (presumably to reduce the skew seen in the raw Runtime plot).
df2['target'] = np.log(df2['Runtime'])
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(x=df2['target'], kde=True, stat='density');

In [None]:
# Annotated heatmap of the pairwise correlations between all df2 columns.
plt.figure(figsize=(14, 14))
sns.set(font_scale=1)
corr_matrix = df2.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='GnBu_r',
    linewidths=.5,
    square=True,
)
plt.title('Variable Correlation')

In [None]:
#Creating binary classification target variable
# Rows whose 'target' (log-runtime) is above the mean become class 1.0, all
# others class 0.0. A single comparison is used so that each row is labelled
# from its original value; the previous two-step .loc assignment re-read the
# freshly written zeros and would relabel them as 1 whenever the mean is negative.
mean = df2['target'].mean()
df2['target'] = (df2['target'] > mean).astype(float)
df_target = df2[['target']].values                              # shape (n, 1)
df_features = df2.drop(columns=['target', 'Runtime']).values    # every remaining column
# 70/30 split with a fixed seed. The labels are flattened to 1-D because
# scikit-learn estimators expect y of shape (n,) and emit a DataConversionWarning
# for column vectors.
# (This statement was previously indented, which raised IndentationError.)
x1_train, x1_test, y1_train, y1_test = train_test_split(
    df_features, df_target.ravel(), test_size=0.3, random_state=0
)

In [None]:
# Display the binary target column.
df2.loc[:, 'target']

In [None]:
# Display the training feature matrix as returned by train_test_split
# (StandardScaler has not been applied yet at this point).
x1_train

In [None]:
# Standardize the features: the scaler's statistics are learned from the
# training split only and then applied unchanged to the test split.
sc = StandardScaler().fit(x1_train)
x1_train = sc.transform(x1_train)
x1_test = sc.transform(x1_test)

In [None]:
#Linear SVM
print('Linear Model')

# scikit-learn expects 1-D label vectors. Flattening here avoids a
# DataConversionWarning on the fit and on every cross-validation fold when the
# labels arrive as an (n, 1) column; it is a no-op for labels that are already 1-D.
y1_train_flat = np.ravel(y1_train)
y1_test_flat = np.ravel(y1_test)

# Fit a support vector classifier with a linear kernel on the training split.
lsvclassifier = SVC(kernel='linear')
lsvclassifier.fit(x1_train, y1_train_flat)

#Applying k-Fold Cross Validation
# cross_val_score fits clones of the estimator, so the model fitted above is
# left untouched and is the one used for the test predictions below.
accuracies = cross_val_score(estimator=lsvclassifier, X=x1_train, y=y1_train_flat, cv=5)
mean_svm_linear = accuracies.mean()
std_svm_linear = accuracies.std()

#After using 5 fold cross validation
print('After 5 fold cross validation:')
print('Mean of Accuracies: ', mean_svm_linear*100)
print('Standard deviation of Accuracies', std_svm_linear*100)

#Predict SVM
y_predl = lsvclassifier.predict(x1_test)

#Confusion Matrix
print('Test Output:')
print('Confusion Matrix:')
print(sk.confusion_matrix(y1_test_flat, y_predl))
print('Classification Report:')
print(sk.classification_report(y1_test_flat, y_predl))
# accuracy_score returns the fraction of correct predictions by default.
print('Accuracy: ', sk.accuracy_score(y1_test_flat, y_predl))