1) Take one of the supervised learning models you have built recently and apply at least three dimensionality reduction techniques to it (separately). Be sure to create a short summary of each technique you use. Indicate how each changed the model performance. Reference: https://machinelearningmastery.com/dimensionality-reduction-algorithms-with-python/

In [7]:
#imports
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [8]:
#import dataset as a pandas dataframe
diabetes_df = pd.read_csv("diabetes.csv")
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
#establish X and y variables
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

#split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [10]:
#Standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [23]:
#set up regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)

#run prediction
y_pred = lr.predict(X_test)

#find accuracy score
accuracy_score = lr.score(X_test, y_test)
print(accuracy_score)

0.7916666666666666


In [15]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(lr, X_test, y_pred, scoring='accuracy', cv=cv, n_jobs=1)
print((mean(n_scores), std(n_scores)))

(0.9844129554655869, 0.022796867209748224)


In [20]:
#Principal Component Analysis
from sklearn.decomposition import PCA

#set up pipeline
from sklearn.pipeline import Pipeline
steps_pca = [('pca', PCA(n_components=5)), ('m', LogisticRegression())]
lr_pca = Pipeline(steps=steps_pca)

#fit model
lr_pca.fit(X_train, y_train)

#run prediction
y_pred_pca = lr_pca.predict(X_test)

#find accuracy score
accuracy_score = lr_pca.score(X_test, y_test)
print(accuracy_score)

0.7578125


In [21]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(lr_pca, X_test, y_pred_pca, scoring='accuracy', cv=cv, n_jobs=1)
print((mean(n_scores), std(n_scores)))

#looks like PCA made our model a little less accurate

(0.966149347728295, 0.023460108698357678)


In [25]:
#Single Value Decomposition
from sklearn.decomposition import TruncatedSVD

#set up pipeline
from sklearn.pipeline import Pipeline
steps_svd = [('svd', TruncatedSVD(n_components=5)), ('m', LogisticRegression())]
lr_svd = Pipeline(steps=steps_svd)

#fit model
lr_svd.fit(X_train, y_train)

#run prediction
y_pred_svd = lr_svd.predict(X_test)

#find accuracy score
accuracy_score = lr_svd.score(X_test, y_test)
print(accuracy_score)

0.7578125


In [26]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(lr_svd, X_test, y_pred_svd, scoring='accuracy', cv=cv, n_jobs=1)
print((mean(n_scores), std(n_scores)))

#SVD accuracy comparable to PCA accuracy

(0.966149347728295, 0.023460108698357678)


In [27]:
#Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#set up pipeline
from sklearn.pipeline import Pipeline
steps_lda = [('lda', LinearDiscriminantAnalysis(n_components=1)), ('m', LogisticRegression())]
lr_lda = Pipeline(steps=steps_lda)

#fit model
lr_lda.fit(X_train, y_train)

#run prediction
y_pred_lda = lr_lda.predict(X_test)

#find accuracy score
accuracy_score = lr_lda.score(X_test, y_test)
print(accuracy_score)

0.78125


In [28]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(lr_lda, X_test, y_pred_lda, scoring='accuracy', cv=cv, n_jobs=1)
print((mean(n_scores), std(n_scores)))

#Logistic Regression accuracy score slightly higher with LDA compared to baseline fit
#but LDA mean score across cross validation outcomes is slightly lower than that of PCA, SVD

(0.9601439496176337, 0.0289273247883316)


In [31]:
#Isomap Embedding
from sklearn.manifold import Isomap

#set up pipeline
steps_iso = [('iso', Isomap(n_components=5)), ('m', LogisticRegression())]
lr_iso = Pipeline(steps=steps_iso)

#fit model
lr_iso.fit(X_train, y_train)

#run prediction
y_pred_iso = lr_iso.predict(X_test)

#find accuracy score
accuracy_score = lr_iso.score(X_test, y_test)
print(accuracy_score)

0.7213541666666666


In [32]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(lr_iso, X_test, y_pred_iso, scoring='accuracy', cv=cv, n_jobs=1)
print((mean(n_scores), std(n_scores)))

#Isomap accuracy scores are lower than baseline and other dimensionality reduction methods explored so far
#with this dataset

(0.8967161493477284, 0.039500659138960025)


In [33]:
#Locally Linear Embedding
from sklearn.manifold import LocallyLinearEmbedding

#set up pipeline
steps_lle = [('lle', LocallyLinearEmbedding(n_components=5)), ('m', LogisticRegression())]
lr_lle = Pipeline(steps=steps_lle)

#fit model
lr_lle.fit(X_train, y_train)

#run prediction
y_pred_lle = lr_lle.predict(X_test)

#find accuracy score
accuracy_score = lr_lle.score(X_test, y_test)
print(accuracy_score)

0.6614583333333334


In [34]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(lr_lle, X_test, y_pred_lle, scoring='accuracy', cv=cv, n_jobs=1)
print((mean(n_scores), std(n_scores)))

#Well jupyter is being grumpy about doing the cross-validation for this one
#but based on the standard Logistc Regression accuracy score, LLE is not a good fit for the diabetes model!

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1374, in fit
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._fina

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1374, in fit
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._fina

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1374, in fit
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._fina

(nan, nan)


Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1374, in fit
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._fina

In [36]:
#Modified Locally Linear Embedding

#set up pipeline
steps_mlle = [('lle', LocallyLinearEmbedding(n_components=5, method='modified', n_neighbors=10)), ('m', LogisticRegression())]
lr_mlle = Pipeline(steps=steps_mlle)

#fit model
lr_mlle.fit(X_train, y_train)

#run prediction
y_pred_mlle = lr_mlle.predict(X_test)

#find accuracy score
accuracy_score = lr_mlle.score(X_test, y_test)
print(accuracy_score)

0.6666666666666666


In [37]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
n_scores = cross_val_score(lr_mlle, X_test, y_pred_mlle, scoring='accuracy', cv=cv, n_jobs=1)
print((mean(n_scores), std(n_scores)))



(0.9948717948717949, 0.010256410256410265)


In [None]:
#standard regression score for MLLE was just okay, but average cross-validated accuracy looks to be the best
#of the bunch!

2) Write a function that will indicate if an inputted IPv4 address is accurate or not.
IP addresses are valid if they have 4 values between 0 and 255 (inclusive), punctuated by periods.

Input 1: 2.33.245.5

Output 1: True

Input 2: 12.345.67.89

Output 2: False

In [38]:
def valid_ip(address):
    #split ip address up by periods
    ip = address.split('.')
    #make sure the # of ip address splits = 4
    if len(ip) != 4:
        return False
    for val in ip:
        #make sure we don't have any non-digit characters in our ip address
        if not val.isdigit():
            return False
        #make sure each ip address section meets the value parameters
        i = int(val)
        if i < 0 or i > 255:
            return False
    return True

In [39]:
valid_ip('2.33.235.5')

True

In [40]:
valid_ip('12.345.67.89')

False