In [1]:
import math
import pickle
import gzip
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from numpy import linalg as LA
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegr



In [3]:
class PCA:
    def __init__(self, target_explained_variance=None):
        """
        explained_variance: float, the target level of explained variance
        """
        self.target_explained_variance = target_explained_variance
        self.feature_size = -1
        self.feature_tuple=[] #has the index of feature and amount of pca

    def standardize(self, X):
        """
        standardize features using standard scaler
        :param m X n: features data
        :return: standardized features
        """
        # YOUR CODE HERE
        scaler = StandardScaler()
        scaler.fit(X) #fit the data to find std and mean
        return scaler.transform(X)

    def compute_mean_vector(self, X_std):
        """
        compute mean vector
        :param X_std: data
        :return n X 1 matrix: mean vector
        """
        np_x=np.array(X_std)
        means=np.mean(np_x,axis=0)
        return means

    def compute_cov(self, X_std, mean_vec):
        """
        Covariance using mean, (don't use any numpy.cov)
        :param X_std:
        :param mean_vec:
        :return n X n matrix:: covariance matrix
        """
        len_x=len(X_std[0])
        print(len_x)
        scale=1/(len(X_std)-1)
        print("scale is ", scale)
        val=[]
        for i in range(len_x):
            row=X_std[:,i]-mean_vec[i] #compute ith row
            for j in range(len_x):
                col=X_std[:,j]-mean_vec[j]
                val.append(np.sum(np.multiply(row,col)))
        val=np.array(val)
        val=val.reshape(len_x,len_x)
        val=scale*val
        return val

    def compute_eigen_vector(self, cov_mat):
        """
        Eigenvector and eigen values using numpy
        :param cov_mat:
        :return: (eigen_vector,eigen_values)
        """
        v,w= LA.eig(cov_mat) #v is the eigen value and w is eigen vector
        return w,v

    def compute_explained_variance(self, eigen_vals):
        """
        sort eigen values and compute explained variance.
        explained variance informs the amount of information (variance)
        can be attributed to each of  the principal components.
        :param eigen_vals:
        :return: explained variance.
        """
        eigen_vals_copy=eigen_vals
        eigen_vals_sorted=np.sort(eigen_vals)[::-1]
        print("largest eigen_vals is ", eigen_vals_sorted[0])
        eigen_sum=np.sum(eigen_vals)
        var_exp=eigen_vals/eigen_sum
        for val in eigen_vals_sorted:
            for i in range(len(eigen_vals_copy)):
                if val == eigen_vals_copy[i]:
                    self.feature_tuple.append((i,val/eigen_sum))
                    
        return var_exp

    def cumulative_sum(self, var_exp):
        """
        return cumulative sum of explained variance.
        :param var_exp: explained variance
        :return: cumulative explained variance
        """
        return np.cumsum(var_exp)

    def compute_weight_matrix(self, eig_pairs, cum_var_exp):
        """
        compute weight matrix of top principal components conditioned on target
        explained variance.
        (Hint : use cumilative explained variance and target_explained_variance to find
        top components)
        
        :param eig_pairs: list of tuples containing eigenvector and eigen
        values
        :param cum_var_exp: cumulative expalined variance by features
        :return: weight matrix
        """
        for i in range(len(cum_var_exp)):
            if(cum_var_exp[i]>=self.target_explained_variance): #find the amount of features
                index=i
                break;
        print("i is ", index)
        w,v=zip(*eig_pairs) #unzip the values, note eig_pairs sorted in descending order
        w=np.array(w)
        weight_matrix=w[:,0:index]
        return weight_matrix

    def transform_data(self, X_std, matrix_w):
        """
        transform data to subspace using weight matrix
        :param X_std: standardized data
        :param matrix_w: weight matrix
        :return: data in the subspace
        """
        return X_std.dot(matrix_w)

    def fit(self, X):
        """
        entry point to the transform data to k dimensions
        standardize and compute weight matrix to transform data.
        :param   m X n dimension: train samples
        :return  m X k dimension: subspace data.
        """
    
        X_std=self.standardize(X)
        mean_vec=self.compute_mean_vector(X_std)
        co_var=self.compute_cov(X_std,mean_vec)
        w,v=self.compute_eigen_vector(co_var)
        var_exp=self.compute_explained_variance(v)
        cum_var_sum=self.cumulative_sum(var_exp)
        eigen_pairs=tuple(zip(w, v))
        sorted(eigen_pairs, key=lambda x: x[1])
        matrix_w=self.compute_weight_matrix(eigen_pairs,cum_var_sum)
        
        
        return self.transform_data(X_std=X_std, matrix_w=matrix_w), self.feature_tuple


In [4]:
file=pd.read_csv("pd_speech_features.csv", header=1)
file= file.drop(["id"], axis=1) #Don't want to predict on id
file

Unnamed: 0,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,locAbsJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,1,0.85247,0.71826,0.57227,240,239,0.008064,0.000087,0.00218,0.000018,...,1.5620,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,1,0.76686,0.69481,0.53966,234,233,0.008258,0.000073,0.00195,0.000016,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.1780,1
2,1,0.85083,0.67604,0.58982,232,231,0.008340,0.000060,0.00176,0.000015,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,0.000046,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.8460,6.2650,4.0603,1
4,0,0.32790,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,0.000044,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,0,0.80903,0.56355,0.28385,417,416,0.004627,0.000052,0.00064,0.000003,...,3.0706,3.0190,3.1212,2.4921,3.5844,3.5400,3.3805,3.2003,6.8671,0
752,0,0.16084,0.56499,0.59194,415,413,0.004550,0.000220,0.00143,0.000006,...,1.9704,1.7451,1.8277,2.4976,5.2981,4.2616,6.3042,10.9058,28.4170,0
753,0,0.88389,0.72335,0.46815,381,380,0.005069,0.000103,0.00076,0.000004,...,51.5607,44.4641,26.1586,6.3076,2.8601,2.5361,3.5377,3.3545,5.0424,0
754,0,0.83782,0.74890,0.49823,340,339,0.005679,0.000055,0.00092,0.000005,...,19.1607,12.8312,8.9434,2.2044,1.9496,1.9664,2.6801,2.8332,3.7131,0


In [5]:
y_vals= file["class"]
x_vals= file.drop(["class"], axis=1)
scaler = StandardScaler()
scaler.fit(x_vals) #fit the data to find std and mean
x_vals=scaler.transform(x_vals)
x, x_test, y, y_test = train_test_split(x_vals, y_vals, test_size = .25)
clf = RandomForestClassifier(random_state=42)
clf.fit(x, y)
y_pred=clf.predict(x_test)
x=clf.feature_importances_
#print(x)
x_sorted=np.sort(x)[::-1]
x_sorted

important_feats=[]

for val in x_sorted:
    for i in range(len(x)):
        if val==x[i]:
            important_feats.append(i)
print(important_feats[:10])


print('accuracy score: ',accuracy_score(y_pred,y_test))
print(classification_report(y_pred, y_test))



[440, 368, 331, 584, 477, 346, 370, 347, 580, 513]
accuracy score:  0.8412698412698413
              precision    recall  f1-score   support

           0       0.64      0.67      0.65        42
           1       0.90      0.89      0.90       147

    accuracy                           0.84       189
   macro avg       0.77      0.78      0.77       189
weighted avg       0.84      0.84      0.84       189



There are currently 754 attributes. With many of thoses being insignificant, we have 

In [6]:
y_vals= file["class"]
x_vals= file.drop(["class"], axis=1)
x_array=x_vals.values
pca_instance=PCA(.99)
t,feature_tuple=pca_instance.fit(x_array)

753
scale is  0.0013245033112582781
largest eigen_vals is  97.7331202728519
i is  270


### To get a 99% of the data we only need 270 of the attributes ###
Seems as though the data has already done PCA on the data and ordered the column features from most to least important with only a slight variation from what we found to be most important.
Now only keep the columns from the pca.

In [7]:
#A_and_C = a_dataframe.iloc[:, [0,2]] get columns with index
index_list,var_list= zip(*feature_tuple) #unzip the feature tuple
index_list=list(index_list[:270])
#print(index_list)
t_df=file.iloc[:,index_list]
t_df

Unnamed: 0,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,locAbsJitter,...,det_LT_TKEO_mean_1_coef,det_LT_entropy_log_8_coef,det_LT_entropy_log_9_coef,det_LT_entropy_log_10_coef,det_LT_TKEO_mean_5_coef,det_LT_TKEO_mean_8_coef,det_LT_TKEO_mean_4_coef,det_LT_TKEO_mean_9_coef,det_LT_entropy_log_7_coef,det_LT_entropy_log_5_coef
0,1,0.85247,0.71826,0.57227,240,239,0.008064,0.000087,0.00218,0.000018,...,1.620000e-07,-192.1166,-175.6679,-163.4186,0.000500,0.000229,0.000104,0.000132,-190.3697,-182.3583
1,1,0.76686,0.69481,0.53966,234,233,0.008258,0.000073,0.00195,0.000016,...,2.490000e-07,-173.3484,-177.7185,-166.2056,0.000417,0.001210,0.000253,0.000094,-209.3696,-197.9376
2,1,0.85083,0.67604,0.58982,232,231,0.008340,0.000060,0.00176,0.000015,...,1.270000e-07,-193.0940,-203.4729,-183.1261,0.000170,0.000007,0.000228,0.000012,-204.3601,-211.9958
3,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,0.000046,...,7.283200e-03,-45.5159,-53.0037,-37.7727,0.183850,2.315000,0.164130,0.418610,-56.7015,-70.0220
4,0,0.32790,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,0.000044,...,6.293400e-03,-45.4301,-34.3376,-8.3446,0.196080,0.557800,0.241060,0.777480,-54.6934,-66.5583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,0,0.80903,0.56355,0.28385,417,416,0.004627,0.000052,0.00064,0.000003,...,2.330000e-07,-158.8007,-161.6918,-134.1501,0.000637,0.002324,0.000052,0.000995,-166.0632,-212.9824
752,0,0.16084,0.56499,0.59194,415,413,0.004550,0.000220,0.00143,0.000006,...,8.850400e-03,-110.1041,-101.9914,-91.7575,0.181090,0.061777,0.161580,0.010785,-120.5312,-121.5467
753,0,0.88389,0.72335,0.46815,381,380,0.005069,0.000103,0.00076,0.000004,...,2.110000e-07,-128.4035,-117.1867,-85.7921,0.000012,0.002499,0.000077,0.007064,-207.2320,-229.7496
754,0,0.83782,0.74890,0.49823,340,339,0.005679,0.000055,0.00092,0.000005,...,1.810000e-07,-153.4097,-171.0194,-139.3763,0.000077,0.004613,0.000242,0.000350,-180.0185,-221.9074


In [12]:
x_vals=t_df
x, x_test, y, y_test = train_test_split(x_vals, y_vals, test_size = .25)
clf = RandomForestClassifier(random_state=42)
clf.fit(x, y)
y_pred=clf.predict(x_test)



In [13]:
print('accuracy score: ',accuracy_score(y_pred,y_test))
print(classification_report(y_pred, y_test))

accuracy score:  0.8042328042328042
              precision    recall  f1-score   support

           0       0.59      0.65      0.62        46
           1       0.88      0.85      0.87       143

    accuracy                           0.80       189
   macro avg       0.74      0.75      0.74       189
weighted avg       0.81      0.80      0.81       189



From the data, We can see that the model struggles with correctly classifying when a person does not have parkison. For the 0 class indenti

In [15]:
x_feats=clf.feature_importances_
#print(x)
x_sorted=np.sort(x_feats)[::-1]
x_sorted

important_feats=[]

for val in x_sorted:
    for i in range(len(x_feats)):
        if val==x_feats[i]:
            important_feats.append(i)
print(important_feats[:10])


[107, 132, 17, 171, 181, 58, 24, 122, 22, 126]


Now lets find the optimal amount of estimators using a param grid search

In [16]:
param_grid={'n_estimators' : [100,150,200,250,300], 'min_samples_split': [2,4,6]}
clf = RandomForestClassifier(random_state=42)
check = GridSearchCV(clf, param_grid, cv=3);
check.fit(x, y)


print(check.best_estimator_)  #Gives parameters that minimize the lost function the best
print("Best parameter is ", check.best_params_)
#print(check.cv_results_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Best parameter is  {'min_samples_split': 6, 'n_estimators': 200}


The best parameters are min_samples_split=6 and n_estimators=200

In [17]:
x_vals=t_df
x, x_test, y, y_test = train_test_split(x_vals, y_vals, test_size = .25)
clf = RandomForestClassifier(random_state=42, min_samples_split=6, n_estimators=200)
clf.fit(x, y)
y_pred=clf.predict(x_test)

In [18]:
print('accuracy score: ',accuracy_score(y_pred,y_test))
print(classification_report(y_pred, y_test))

accuracy score:  0.8306878306878307
              precision    recall  f1-score   support

           0       0.50      0.84      0.63        32
           1       0.96      0.83      0.89       157

    accuracy                           0.83       189
   macro avg       0.73      0.84      0.76       189
weighted avg       0.88      0.83      0.85       189



Now that we have found the best parameters for a tree method let's create a new tree model (Adaboost!)