## Take one of the supervised learning models you've done recently and apply three dimensionality reduction techniques (separately). Create a short summary of each technique and how they changed the model performance.


In [2]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix


diabetes_df = pd.read_csv("../diabetes.csv")
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']


# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42, stratify=y)

#Standardize
sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

In [15]:
# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 10,random_state=42)

model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

print('Baseline output')
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

Baseline output
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       350
           1       0.99      0.98      0.99       187

    accuracy                           0.99       537
   macro avg       0.99      0.99      0.99       537
weighted avg       0.99      0.99      0.99       537

              precision    recall  f1-score   support

           0       0.78      0.90      0.84       150
           1       0.75      0.54      0.63        81

    accuracy                           0.77       231
   macro avg       0.77      0.72      0.73       231
weighted avg       0.77      0.77      0.76       231



#### Trying Truncated SVD
This is a good technique for sparse data (data with a lot of null values), although this dataset isn't sparse. 
The dataset didn't perform as well as the standard scaler. Probably because the data isn't that complex.

In [13]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=7)


X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                    random_state=42, stratify=y)

#fit transform the data
X_train_svd=svd.fit_transform(X_train)
X_test_svd=svd.fit_transform(X_test)

# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 10,random_state=42).fit(X_train_svd, y_train)


y_pred = model.predict(X_test_svd)
y_pred_train = model.predict(X_train_svd)

print('TruncatedSVD output')
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

TruncatedSVD output
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       350
           1       0.99      0.96      0.98       187

    accuracy                           0.98       537
   macro avg       0.99      0.98      0.98       537
weighted avg       0.98      0.98      0.98       537

              precision    recall  f1-score   support

           0       0.74      0.75      0.74       150
           1       0.52      0.51      0.51        81

    accuracy                           0.66       231
   macro avg       0.63      0.63      0.63       231
weighted avg       0.66      0.66      0.66       231



#### Trying PCA
Should be good for dense data (few null values).
Also noticing worse performance, particularly in predicting in precision for positive cases. Possibly due to the simple nature (low dimenionality of the dataset).


In [16]:
from sklearn.decomposition import PCA

pca= PCA(n_components=7)


X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                    random_state=42, stratify=y)

#fit transform the data
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.fit_transform(X_test)

# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 10,random_state=42).fit(X_train_pca, y_train)


y_pred = model.predict(X_test_pca)
y_pred_train = model.predict(X_train_pca)

print('PCA output')
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

PCA output
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       350
           1       1.00      0.97      0.99       187

    accuracy                           0.99       537
   macro avg       0.99      0.99      0.99       537
weighted avg       0.99      0.99      0.99       537

              precision    recall  f1-score   support

           0       0.72      0.68      0.70       150
           1       0.47      0.52      0.49        81

    accuracy                           0.62       231
   macro avg       0.60      0.60      0.60       231
weighted avg       0.63      0.62      0.63       231



### Trying LinearDiscrimnant Analysis
Useful for multi-class classification
Number of components is the mininum of either C-1 where C is classes or the number of features: min(n_classes-1, n_features). So for a binary classification like this one, it will only use one component.
This didn't perform as well as the decision tree with standard scaler but probably could with the right parameter tuning. I think the linear structure of the datset lends itself well.



In [67]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda= LinearDiscriminantAnalysis()


X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                    random_state=42, stratify=y)

#fit transform the data
X_train_lda=lda.fit_transform(X_train, y_train)
X_test_lda=lda.fit_transform(X_test, y_test)

# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 10,random_state=42).fit(X_train_lda, y_train)


y_pred = model.predict(X_test_lda)
y_pred_train = model.predict(X_train_lda)

print('LDA output')
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

LDA output
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       350
           1       0.93      0.87      0.90       187

    accuracy                           0.93       537
   macro avg       0.93      0.92      0.92       537
weighted avg       0.93      0.93      0.93       537

              precision    recall  f1-score   support

           0       0.76      0.87      0.81       150
           1       0.67      0.51      0.58        81

    accuracy                           0.74       231
   macro avg       0.72      0.69      0.69       231
weighted avg       0.73      0.74      0.73       231



### Isomap embedding to try a manifold technique
Isomap creates an "embedding" of the dataset and tries to preserve relationships. This is intended for data that has a non-linear structure.
This model did not perform well at all, but did better with the default number of components over my specified 7. I think the dataset is already too simple for this technique.

In [27]:
from sklearn.manifold import Isomap

iso= Isomap()


X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                    random_state=42, stratify=y)

#fit transform the data
X_train_iso=iso.fit_transform(X_train)
X_test_iso=iso.fit_transform(X_test)

# decision tree classifier
model = tree.DecisionTreeClassifier(max_depth = 10,random_state=42).fit(X_train_iso, y_train)


y_pred = model.predict(X_test_iso)
y_pred_train = model.predict(X_train_iso)

print('LDA output')
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

LDA output
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       350
           1       0.82      0.87      0.85       187

    accuracy                           0.89       537
   macro avg       0.88      0.89      0.88       537
weighted avg       0.89      0.89      0.89       537

              precision    recall  f1-score   support

           0       0.64      0.77      0.70       150
           1       0.33      0.21      0.26        81

    accuracy                           0.57       231
   macro avg       0.48      0.49      0.48       231
weighted avg       0.53      0.57      0.54       231



## Write a function that will indicate if an inputted IPv4 address is accurate or not. IP addresses are valid if they have 4 values between 0 and 255 (inclusive) punctuated by periods.

In [40]:
x= '2.45.123.12'
y= x.split('.')
new = [int(num) for num in y]
len(new)

4

In [61]:
def validIP(address):
    """determines if an input IP address is valid"""
    test = []
    output =[]
    ip = address.split('.')
    test = [int(num) for num in ip]
    for num in test:
        if num in range(0,256):
            output.append(num)
    if len(output) == 4:
        return 'True'
    else:
        return 'False'
    

In [62]:
validIP('2.45.123.12')

'True'

In [63]:
validIP('12.344.67.89')

'False'

In [64]:
validIP('-4.13.230.68')

'False'

In [65]:
validIP('2.33.245.5')

'True'