In [1]:
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import matplotlib as mpl
from matplotlib.backends import backend_agg
from matplotlib.colors import LinearSegmentedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# Load the raw table; the file has no header row, so columns are numbered 0..4.
data = pd.read_csv('iris.csv', header=None)

# First four columns are the features, the fifth column is the species label.
iris_features = data.iloc[:, 0:4]
iris_classes = data.iloc[:, 4]

# Every unordered pair of the four feature columns, in lexicographic order,
# plus a matching two-letter title (column 0 -> "A", 1 -> "B", 2 -> "C", 3 -> "D").
features = [[first, second] for first in range(4) for second in range(first + 1, 4)]
feature_titles = ["".join("ABCD"[column] for column in pair) for pair in features]

In [3]:
# Stochastic Gradient Descent Functions
def SGD(selected_features): # SGD (data not scaled)
    """Fit an SGD classifier on the unscaled features and score it on the same data.

    Parameters
    ----------
    selected_features : feature columns to train on; rows must line up with
        the module-level ``iris_classes`` labels.

    Returns
    -------
    The value of ``SGDClassifier.score`` on the training data itself
    (training accuracy, not a held-out estimate).
    """
    classifier = SGDClassifier(loss="modified_huber").fit(selected_features, iris_classes)
    return classifier.score(selected_features, iris_classes)

def SGD_scaled(selected_features): # SGD (scaled data)
    """Fit an SGD classifier on StandardScaler-transformed features and score it.

    Parameters
    ----------
    selected_features : feature columns to train on; rows must line up with
        the module-level ``iris_classes`` labels.

    Returns
    -------
    The value of ``SGDClassifier.score`` on the scaled training data itself
    (training accuracy, not a held-out estimate).
    """
    # The scaler is fitted on, and applied to, the same rows.
    standardized = StandardScaler().fit_transform(selected_features)

    classifier = SGDClassifier(loss="modified_huber")
    classifier.fit(standardized, iris_classes)
    return classifier.score(standardized, iris_classes)

In [4]:
# Logistic Regression Functions
def LogisticReg(selected_features): # Logistic regression (data not scaled)
    """Fit logistic regression on the unscaled features and score it on the same data.

    Parameters
    ----------
    selected_features : feature columns to train on; rows must line up with
        the module-level ``iris_classes`` labels.

    Returns
    -------
    The value of ``LogisticRegression.score`` on the training data itself.
    """
    model = LogisticRegression(random_state=0).fit(selected_features, iris_classes)
    return model.score(selected_features, iris_classes)

def LogisticReg_scaled(selected_features): # Logistic regression (scaled data)
    """Fit logistic regression on StandardScaler-transformed features and score it.

    Parameters
    ----------
    selected_features : feature columns to train on; rows must line up with
        the module-level ``iris_classes`` labels.

    Returns
    -------
    The value of ``LogisticRegression.score`` on the scaled training data itself.
    """
    # The scaler is fitted on, and applied to, the same rows.
    standardized = StandardScaler().fit_transform(selected_features)

    model = LogisticRegression(random_state=0)
    model.fit(standardized, iris_classes)
    return model.score(standardized, iris_classes)

In [5]:
# Support Vector Machine Functions

def SVM(selected_features, kernel): # SVM (data not scaled)
    """Fit a support vector classifier on the unscaled features and score it.

    Parameters
    ----------
    selected_features : feature columns to train on; rows must line up with
        the module-level ``iris_classes`` labels.
    kernel : kernel name forwarded unchanged to ``SVC(kernel=...)``
        (the notebook uses "linear", "poly" and "rbf").

    Returns
    -------
    The value of ``SVC.score`` on the training data itself.
    """
    classifier = SVC(kernel=kernel).fit(selected_features, iris_classes)
    return classifier.score(selected_features, iris_classes)

def SVM_scaled(selected_features, kernel): # SVM (scaled data)
    """Fit a support vector classifier on StandardScaler-transformed features and score it.

    Parameters
    ----------
    selected_features : feature columns to train on; rows must line up with
        the module-level ``iris_classes`` labels.
    kernel : kernel name forwarded unchanged to ``SVC(kernel=...)``
        (the notebook uses "linear", "poly" and "rbf").

    Returns
    -------
    The value of ``SVC.score`` on the scaled training data itself.
    """
    # The scaler is fitted on, and applied to, the same rows.
    standardized = StandardScaler().fit_transform(selected_features)

    classifier = SVC(kernel=kernel)
    classifier.fit(standardized, iris_classes)
    return classifier.score(standardized, iris_classes)

## Stochastic Gradient Descent using the Modified-Huber Loss Function

Stochastic gradient descent is a method for finding the optimal parameter configuration for a machine learning model. The process makes small adjustments to the model's parameters to reduce the error. The modified Huber loss function is relatively insensitive to outliers.

Since the score changes from run to run, I ran each function 1000 times and computed the average.

In [6]:
# Accuracy results for every 2 feature combination for SGD
N_RUNS = 1000  # number of independent fits averaged per feature pair

# SGD() and SGD_scaled() build their classifiers without a random_state, so the
# estimator draws from NumPy's global RNG. Seeding it here makes the averages
# below reproducible on a fresh "Restart & Run All".
np.random.seed(0)

SGD_Scores = []
Scaled_SGD_Scores = []

for pair in features:
    selected_features = iris_features[pair]

    # SGD scores (no scaling), averaged over N_RUNS fits
    SGD_Scores.append(
        sum(SGD(selected_features) for _ in range(N_RUNS)) / N_RUNS
    )

    # SGD scores (features scaled), averaged over N_RUNS fits
    Scaled_SGD_Scores.append(
        sum(SGD_scaled(selected_features) for _ in range(N_RUNS)) / N_RUNS
    )

In [7]:
# SGD results: one heading per score list, then one line per feature pair
for heading, scores in (
    ("Stochastic gradient descent results (No scaling):", SGD_Scores),
    ("\nStochastic gradient descent results (Scaled data):", Scaled_SGD_Scores),
):
    print(heading)
    for title, score in zip(feature_titles, scores):
        print("Features", title, "results:", score)

Stochastic gradient descent results (No scaling):
Features AB results: 0.6967999999999975
Features AC results: 0.7873399999999973
Features AD results: 0.7649866666666664
Features BC results: 0.8362800000000017
Features BD results: 0.8749800000000032
Features CD results: 0.8514400000000009

Stochastic gradient descent results (Scaled data):
Features AB results: 0.7492933333333298
Features AC results: 0.9259733333333372
Features AD results: 0.9048133333333376
Features BC results: 0.918766666666666
Features BD results: 0.9337200000000087
Features CD results: 0.9343800000000088


## Logistic Regression

Logistic regression is an example of a supervised machine learning model used for classification. As with SGD, I will obtain the scores with and without scaling.

Since the accuracies don't vary from run to run as they do for SGD, computing the average accuracy is redundant. Furthermore, logistic regression isn't too sensitive to outliers.

In [8]:
# Training accuracy of logistic regression for each pair of features
LogRegScores, LogRegScaledScores = [], []

for pair in features:
    selected_features = iris_features[pair]

    # unscaled first, then scaled, for the same pair of columns
    LogRegScores.append(LogisticReg(selected_features))
    LogRegScaledScores.append(LogisticReg_scaled(selected_features))

In [9]:
# Logistic regression results: one heading per score list, then one line per feature pair
for heading, scores in (
    ("Logistic regression results (no scaling):", LogRegScores),
    ("\nLogistic regression results (scaled data):", LogRegScaledScores),
):
    print(heading)
    for title, score in zip(feature_titles, scores):
        print("Features", title, "results:", score)

Logistic regression results (no scaling):
Features AB results: 0.82
Features AC results: 0.96
Features AD results: 0.96
Features BC results: 0.9533333333333334
Features BD results: 0.96
Features CD results: 0.9666666666666667

Logistic regression results (scaled data):
Features AB results: 0.8133333333333334
Features AC results: 0.9533333333333334
Features AD results: 0.96
Features BC results: 0.9533333333333334
Features BD results: 0.9533333333333334
Features CD results: 0.9533333333333334


## Support Vector Machines (SVM)

Support Vector Machines are another family of supervised machine learning models; they can perform regression, classification, and outlier detection. They are effective in high-dimensional spaces.

Linear Kernel:

In [10]:
# Accuracy results for every 2 feature combination for SVM where kernel is linear
Linear_SVM_Scores = []
Scaled_Linear_SVM_Scores = []

# Iterate over the feature pairs directly rather than a hard-coded count,
# so this cell stays correct if the list of pairs changes.
for pair in features:
    selected_features = iris_features[pair]

    # linear-kernel SVM score (no scaling)
    Linear_SVM_Scores.append(SVM(selected_features, "linear"))

    # linear-kernel SVM score (features scaled)
    Scaled_Linear_SVM_Scores.append(SVM_scaled(selected_features, "linear"))

In [11]:
# Linear-kernel SVM results: one heading per score list, then one line per feature pair
for heading, scores in (
    ("SVM with a Linear Kernal (Nonscaled) results:", Linear_SVM_Scores),
    ("\nSVM with a Linear Kernal (Scaled) results:", Scaled_Linear_SVM_Scores),
):
    print(heading)
    for title, score in zip(feature_titles, scores):
        print("Features", title, "results:", score)

SVM with a Linear Kernal (Nonscaled) results:
Features AB results: 0.82
Features AC results: 0.9533333333333334
Features AD results: 0.96
Features BC results: 0.96
Features BD results: 0.96
Features CD results: 0.9666666666666667

SVM with a Linear Kernal (Scaled) results:
Features AB results: 0.8133333333333334
Features AC results: 0.9733333333333334
Features AD results: 0.96
Features BC results: 0.9533333333333334
Features BD results: 0.9533333333333334
Features CD results: 0.9533333333333334


## Kernel = Polynomial

In [12]:
# Accuracy results for every 2 feature combination for SVM where kernel is Polynomial
Polynomial_SVM_Scores = []
Scaled_Polynomial_SVM_Scores = []

# Iterate over the feature pairs directly rather than a hard-coded count,
# so this cell stays correct if the list of pairs changes.
for pair in features:
    selected_features = iris_features[pair]

    # polynomial-kernel SVM score (no scaling)
    Polynomial_SVM_Scores.append(SVM(selected_features, "poly"))

    # polynomial-kernel SVM score (features scaled)
    Scaled_Polynomial_SVM_Scores.append(SVM_scaled(selected_features, "poly"))

In [13]:
# Polynomial-kernel SVM results: one heading per score list, then one line per feature pair
for heading, scores in (
    ("SVM with a Polynomial Kernal (Nonscaled) results:", Polynomial_SVM_Scores),
    ("\nSVM with a Polynomial Kernal (Scaled) results:", Scaled_Polynomial_SVM_Scores),
):
    print(heading)
    for title, score in zip(feature_titles, scores):
        print("Features", title, "results:", score)

Support vector machine (Kernel = polynomial) results (no scaling):
Features AB results: 0.8133333333333334
Features AC results: 0.96
Features AD results: 0.9533333333333334
Features BC results: 0.9533333333333334
Features BD results: 0.96
Features CD results: 0.9666666666666667

Support vector machine (Kernel = polynomial) results (scaled data):
Features AB results: 0.7466666666666667
Features AC results: 0.9
Features AD results: 0.9066666666666666
Features BC results: 0.9066666666666666
Features BD results: 0.92
Features CD results: 0.96


## Kernel = RBF

In [14]:
# Accuracy results for every 2 feature combination for SVM where kernel is RBF
RBF_SVM_Scores = []
Scaled_RBF_SVM_Scores = []

# Iterate over the feature pairs directly rather than a hard-coded count,
# so this cell stays correct if the list of pairs changes.
for pair in features:
    selected_features = iris_features[pair]

    # RBF-kernel SVM score (no scaling)
    RBF_SVM_Scores.append(SVM(selected_features, "rbf"))

    # RBF-kernel SVM score (features scaled)
    Scaled_RBF_SVM_Scores.append(SVM_scaled(selected_features, "rbf"))

In [15]:
# RBF-kernel SVM results: one heading per score list, then one line per feature pair
for heading, scores in (
    ("SVM with a RBF Kernal (Nonscaled) results:", RBF_SVM_Scores),
    ("\nSVM with a RBF Kernal (Scaled) results:", Scaled_RBF_SVM_Scores),
):
    print(heading)
    for title, score in zip(feature_titles, scores):
        print("Features", title, "results:", score)

Support vector machine (Kernel = rbf) results (no scaling):
Features AB results: 0.82
Features AC results: 0.96
Features AD results: 0.96
Features BC results: 0.9533333333333334
Features BD results: 0.96
Features CD results: 0.9533333333333334

Support vector machine (Kernel = rbf) results (scaled data):
Features AB results: 0.82
Features AC results: 0.96
Features AD results: 0.96
Features BC results: 0.9466666666666667
Features BD results: 0.9533333333333334
Features CD results: 0.96


***
## Results

For starters, I scaled the data for each model in order to improve the accuracies ever so slightly.

A) sepal length

B) sepal width

C) petal length

D) petal width

## SGD

### <center>SGD Results:</center>
|     | AB | AC | AD | BC | BD | CD | 
| :-: | :-: | :-: | :-: | :-: | :-: | :-: |
| Unscaled Data | 0.6967999999999975 | 0.7873399999999973 | 0.7649866666666664 | 0.8362800000000017 | 0.8749800000000032 | 0.8514400000000009 |
| Scaled Data   | 0.7492933333333298  | 0.9259733333333372   | 0.9048133333333376 | 0.918766666666666  | 0.9337200000000087 | 0.9343800000000088 |

As the table shows, the accuracy improves noticeably after scaling the data. The results are averages taken from running the model 1000 times. To get a more stable estimate of the average, I could run the model more than 1000 times. Generally, the accuracies on the scaled data are pretty good, with the exception of the feature combination AB (sepal length, sepal width).

## Logistic Regression

### <center> Logistic Regression Results:</center>
|     | AB | AC | AD | BC | BD | CD | 
| :-: | :-: | :-: | :-: | :-: | :-: | :-: |
| Unscaled Data | 0.82               | 0.96               | 0.96 | 0.9533333333333334 | 0.96               | 0.9666666666666667 |
| Scaled Data   | 0.8133333333333334 | 0.9533333333333334 | 0.96 | 0.9533333333333334 | 0.9533333333333334 | 0.9533333333333334 |

An interesting observation I made is that in some feature combinations, scaling the data worsened the accuracies ever so slightly. This suggests that logistic regression can classify the data with ease whether the data is scaled or not.

## Support Vector Machines

### <center> Linear Results </center>
|     | AB | AC | AD | BC | BD | CD | 
| :-: | :-: | :-: | :-: | :-: | :-: | :-: |
| Unscaled Data | 0.82               | 0.9533333333333334 | 0.96 | 0.96               | 0.96               | 0.9666666666666667 |
| Scaled Data   | 0.8133333333333334 | 0.9733333333333334 | 0.96 | 0.9533333333333334 | 0.9533333333333334 | 0.9533333333333334 |

Most feature combinations performed slightly worse after scaling the data. AD stayed the same, and AC was the only combination that improved. Overall, the linear kernel had a slightly harder time classifying the scaled data correctly.

### <center> Polynomial Results </center>
|     | AB | AC | AD | BC | BD | CD | 
| :-: | :-: | :-: | :-: | :-: | :-: | :-: |
| Unscaled Data | 0.8133333333333334 | 0.96 | 0.9533333333333334 | 0.9533333333333334 | 0.96 | 0.9666666666666667 |
| Scaled Data   | 0.7466666666666667 | 0.9  | 0.9066666666666666 | 0.9066666666666666 | 0.92 | 0.96               |

The table shows that the polynomial kernel performed worse on the scaled data than on the unscaled data for every feature combination.

### <center> RBF Results </center>
|     | AB | AC | AD | BC | BD | CD | 
| :-: | :-: | :-: | :-: | :-: | :-: | :-: |
| Unscaled Data | 0.82 | 0.96 | 0.96 | 0.9533333333333334 | 0.96 | 0.9533333333333334 |
| Scaled Data   | 0.82 | 0.96 | 0.96 | 0.9466666666666667 | 0.9533333333333334 | 0.96 |

Similar to the other Support Vector Machine results, scaling the data caused a slight reduction in accuracy. Except for CD, the accuracies either stayed the same or were slightly worse. 

In conclusion, for the Support Vector Machines and logistic regression, scaling the data left the accuracies unchanged or reduced them slightly in most cases. However, it was important to scale the data for Stochastic Gradient Descent, where the accuracies improved significantly.