In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import spearmanr
from scipy import stats
from IPython.display import display

In [3]:
boston = pd.read_csv('p2_mod.csv')
X = boston.drop(columns='Y', axis=1)
y = boston['Y']
boston.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
# This is the entropy method we defined in the Entropy workshop
def entropy(y):
    probs = [] # Probabilities of each class label
    for c in set(y): # Set gets a unique set of values. We're iterating over each value
        num_same_class = sum(y == c)  # Remember that true == 1, so we can sum.
        p = num_same_class / len(y) # Probability of this class label
        probs.append(p)
    return np.sum(-p * np.log2(p) for p in probs)

# What is the entropy of the entire set?
print("Entire set entropy = %.2f" % entropy(y))

Entire set entropy = 0.76


  


In [5]:
# Let's write some functions that calculates the entropy after splitting on a particular value

def class_probability(feature, y):
    """Calculates the proportional length of each value in the set of instances"""
    # This is doc string, used for documentation
    probs = []
    for value in set(feature):
        select = feature == value # Split by feature value into two classes
        y_new = y[select]         # Those that exist in this class are now in y_new
        probs.append(float(len(y_new))/len(X))  # Convert to float, because ints don't divide well
    return probs

def class_entropy(feature, y):
    """Calculates the entropy for each value in the set of instances"""
    ents = []
    for value in set(feature):
        select = feature == value # Split by feature value into two classes
        y_new = y[select]         # Those that exist in this class are now in y_new
        ents.append(entropy(y_new))
    return ents

def proportionate_class_entropy(feature, y):
    """Calculatates the weighted proportional entropy for a feature when splitting on all values"""
    probs = class_probability(feature, y)
    ents = class_entropy(feature, y)
    return np.sum(np.multiply(probs, ents)) # Information gain equation

# Let's try calculating the entropy after splitting by all the values in "cap-shape"
new_entropy = proportionate_class_entropy(X["X1"], y)
print("Information gain of %.2f" % (entropy(y) - new_entropy))
# Should be an information gain of 0.02

Information gain of 0.02


  


In [6]:
# Now let's try doing the same when splitting based upon all values of "odor"
new_entropy = proportionate_class_entropy(X["X6"], y)
print("Information gain of %.2f" % (entropy(y) - new_entropy))
# Should be an information gain of 0.11

Information gain of 0.11


  


In [7]:
for c in X.columns:
    new_entropy = proportionate_class_entropy(X[c], y)
    print("%s %.2f" % (c, entropy(y) - new_entropy))

  


X1 0.02
X2 0.00
X3 0.00
X4 0.00
X5 0.00
X6 0.11
X7 0.07
X8 0.05
X9 0.05
X10 0.04
X11 0.04
X12 0.58
X13 0.57
X14 0.57
X15 0.56
X16 0.54
X17 0.53
X18 0.20
X19 0.19
X20 0.19
X21 0.17
X22 0.17
X23 0.17


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

SVD

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, TruncatedSVD
svd = TruncatedSVD(n_components = 6)
svd.fit(X_train)
svd_X = svd.transform(X_train)

In [11]:
svd_X.shape

(24000, 6)

In [12]:
svd_X_train, svd_X_test, svd_y_train, svd_y_test = train_test_split(svd_X, y_train, test_size = 0.2, random_state = 42)

In [13]:
svd_lr = LinearRegression()

# Train the model
svd_model = svd_lr.fit(svd_X_train, svd_y_train)

# Prediction
pca_y_pred =  svd_lr.predict(svd_X_test)

# Accuracy Score
svd_lr.score(svd_X_test, pca_y_pred)

1.0

SVM

In [14]:
from sklearn import svm
from sklearn.svm import SVC

In [15]:
svm = svm.SVC()
svm.fit(svd_X_train, svd_y_train)
svm.score(svd_X_train, svd_y_train)*100

77.88020833333333

In [16]:
svm_score = svm.score(svd_X_test, svd_y_test)*100
print("SVM Accuracy:", svm_score)

SVM Accuracy: 77.60416666666666


KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
knn = KNeighborsClassifier()
classifier = KNeighborsClassifier(n_neighbors = 81)
classifier.fit(svd_X_train, svd_y_train)
knn.fit(svd_X_train, svd_y_train)

KNeighborsClassifier()

In [18]:
score = knn.score(svd_X_test, svd_y_test)
print("KNN Algorithm Score:", score*100)

KNN Algorithm Score: 75.25


DT

In [19]:
from sklearn.tree import DecisionTreeClassifier
dclf = DecisionTreeClassifier()
dclf.fit(svd_X_train, svd_y_train)

DecisionTreeClassifier()

In [20]:
j=dclf.score(svd_X_test, svd_y_test)
print("Decision Tree Algorithm Score:",j*100)

Decision Tree Algorithm Score: 68.75


RF

In [21]:
def run_randomForest(svd_X_train, svd_y_train,svd_X_test, svd_y_test):
    clf = RandomForestClassifier(n_estimators= 100, random_state = 42, n_jobs = -1)
    clf.fit(svd_X_train, svd_y_train)
    y_pred = clf.predict(svd_X_test)
    score = accuracy_score(svd_y_test,y_pred)*100
    print("Random Forest Accuracy:", score)

In [22]:
run_randomForest(svd_X_train, svd_y_train,svd_X_test, svd_y_test)

Random Forest Accuracy: 77.5625


LR

In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(svd_X_train, svd_y_train)
lr = model.score(svd_X_test, svd_y_test)*100
print("Logistic Regression Accuracy:", lr)

Logistic Regression Accuracy: 77.60416666666666
