# Importing necessary libraries

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns

import warnings

import statsmodels.api as sm

import matplotlib.pyplot as plt

from sklearn import datasets, linear_model, metrics, ensemble, naive_bayes, svm, tree, discriminant_analysis, neighbors, feature_selection
from sklearn.linear_model import lasso_path
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_curve, auc, classification_report

from scipy.stats import zscore, randint 

# 1.Load the dataset

In [None]:
# Load the dataset. The file is comma-separated with the column names in its
# first row, which is exactly what read_csv assumes by default.
df = pd.read_csv('parkinsons.data')

# 2. It is always good practice to eyeball the raw data to get a feel for it: the structure of the file, the number and types of attributes, and a general idea of the likely challenges in the dataset. (2.5 points)

In [None]:
# Peek at the first ten rows of the raw data.
df.head(n=10)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

No missing values

# Attribute Information:

Matrix column entries (attributes):

* name - ASCII subject name and recording number
* MDVP:Fo(Hz) - Average vocal fundamental frequency
* MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
* MDVP:Flo(Hz) - Minimum vocal fundamental frequency
* MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency
* MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
* NHR,HNR - Two measures of ratio of noise to tonal components in the voice
* status - Health status of the subject (one) - Parkinson's, (zero) - healthy
* RPDE,D2 - Two nonlinear dynamical complexity measures
* DFA - Signal fractal scaling exponent
* spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation

# 3.Using univariate & bivariate analysis to check the individual attributes for their basic statistic such as central values, spread, tails etc. What are your observations? (15 points) 

# Univariate analysis

In [None]:
# Summary statistics, transposed so that each attribute is one row.
df.describe().transpose()

# From the above table:
* For the variables MDVP:Fo(Hz), MDVP:Fhi(Hz), MDVP:Flo(Hz), MDVP:Jitter(%), MDVP:Jitter(Abs), MDVP:RAP, MDVP:PPQ, Jitter:DDP:
 * The mean is greater than the median and the maximum lies well above the 75th percentile, which indicates a right-tailed (positively skewed) distribution
 * We have also plotted a histogram to visualize the same
* For the Several measures of variation in amplitude: MDVP:Shimmer, MDVP:Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, MDVP:APQ, Shimmer:DDA
 * The mean value is greater than the median and they have a right tailed distribution or a positively skewed distribution
* For NHR variable,the mean value is greater than the median and also the max value is very much greater than the 75% value which indicates a right tailed distribution or a positively skewed distribution
* For HNR variable mean< median indicating a left tailed distribution.
* The rest of the variables seem to have a positively skewed distribution

# Distribution analysis

In [None]:
# Attributes whose distributions are inspected, in the original plotting order.
dist_columns = [
    'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)',
    'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
    'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:DDA',
    'NHR', 'HNR', 'DFA', 'spread1', 'PPE',
]

# One figure per attribute. `sns.distplot` is deprecated in seaborn; a
# density-scaled `histplot` with a KDE overlay is its documented replacement.
# The loop replaces seventeen copy-pasted cells that differed only in the
# column name.
for col in dist_columns:
    fig, ax = plt.subplots()
    sns.histplot(df[col], kde=True, stat='density', ax=ax)
    ax.set_title('Distribution of ' + col)
    plt.show()

# Distribution of the target variable : STATUS

In [None]:
# Class balance of the target: how many recordings fall under each status value.
df['status'].value_counts()

In [None]:
# Bar chart of the target classes. The column is passed as `x=` explicitly:
# newer seaborn releases treat the first positional argument as `data`, so a
# bare positional Series no longer means "count the values of this variable".
sns.countplot(x='status', data=df)

Out of 195 records, 147 have Parkinson's and only 48 are healthy.

From the univariate analysis we found that most of the attributes have right-tailed (positively skewed) distributions.

# Bivariate analysis

In [None]:
# Pairwise scatter plots coloured by status, with KDE curves on the diagonal.
sns.pairplot(data=df, hue='status', diag_kind='kde')

In [None]:
# Annotated correlation heat-map showing the lower triangle only.
plt.figure(figsize=(20,20))
# Restrict to numeric columns before correlating: the frame also holds the
# text column 'name', and recent pandas versions raise on non-numeric data
# instead of silently skipping it.
corr = df.select_dtypes(include=[np.number]).corr()
# Mark every cell strictly above the diagonal; seaborn hides masked cells.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
sns.heatmap(corr, mask=mask, annot=True)

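Since the heatmap is too large to analyse visually, let's simplify it by listing, for each attribute, its strongest absolute correlation with any other attribute (keeping only values above 0.7).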

In [None]:
# Absolute correlations between the numeric columns only (the text column
# 'name' would make corr() raise on recent pandas versions).
corr = df.select_dtypes(include=[np.number]).corr().abs()
# Blank out the self-correlations on the diagonal so that each column's maximum
# is its strongest relationship with a *different* column. Masking by position
# (rather than by `corr == 1`) keeps genuinely perfect off-diagonal
# correlations in the result and does not rely on exact float equality.
corr = corr.mask(np.eye(len(corr), dtype=bool), 0)
corr_cols = corr.max().sort_values(ascending=False)
display(corr_cols[corr_cols > 0.7])

In [None]:
print("Attibutes co-relation between the target variable Status")
# Correlate every feature column with the target and rank from most positive
# to most negative.
feature_frame = df.drop(columns=["status", "name"])
feature_frame.apply(lambda column: column.corr(df.status)).sort_values(ascending=False)

In [None]:
# Scatter plots of three attribute pairs, one figure per pair. Axis labels and
# a title are set so each figure can be read on its own, and the loop replaces
# three copy-pasted cells.
scatter_pairs = [
    ('Shimmer:DDA', 'Shimmer:APQ3'),
    ('MDVP:RAP', 'Jitter:DDP'),
    ('NHR', 'HNR'),
]
for x_col, y_col in scatter_pairs:
    fig, ax = plt.subplots()
    ax.scatter(df[x_col], df[y_col])
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(x_col + ' vs ' + y_col)
    plt.show()

From the above plots we can see a positive correlation between Shimmer:DDA and Shimmer:APQ3, and between MDVP:RAP and Jitter:DDP, and a negative correlation between NHR and HNR.

From the bivariate analysis we found that most of the attributes are strongly correlated with one another.

# 4.Split the dataset into training and test set in the ratio of 70:30 (Training:Test).

In [None]:
# Feature matrix: every column except the target and the subject identifier,
# standardised column by column with scipy's zscore.
X = df.drop(columns=["status", "name"]).apply(zscore)

# Target vector.
y = df['status']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train_set, test_set, train_labels, test_labels = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=7,
)

# 5.Create the model using “entropy” method of reducing the entropy and fit it to training data. (5 points)

In [None]:
# Baseline tree with no depth or leaf-size limits, splitting on entropy.
dt_model = DecisionTreeClassifier(
    criterion='entropy',
    random_state=7,
)
dt_model.fit(X=train_set, y=train_labels)

# 6.Test the model on test data and what is the accuracy achieved. Capture the predicted values and do a crosstab. (7.5 points)

In [None]:
# Predictions on the held-out rows, then accuracy on both splits.
y_predict = dt_model.predict(X=test_set)

dt_train_accuracy = dt_model.score(X=train_set, y=train_labels)
dt_test_accuracy = dt_model.score(X=test_set, y=test_labels)

print("The model training accuracy is :", dt_train_accuracy)
print("The models test accuracy is : ", dt_test_accuracy)

In [None]:
# Predicted (rows) versus actual (columns) class counts, with totals.
cm = pd.crosstab(
    index=y_predict,
    columns=test_labels,
    rownames=['Predicted'],
    colnames=['Actual'],
    margins=True,
)
print("Cross Tab:")
print(cm)

* True positives >> the values which we predicted as parkinsons and the actual is also parkinsons->43
* False positives (Type 1 error)>> the values which we predicted as parkinsons but the actual value is healthy->3
* True Negatives -> the values which we predicted as healthy and the actual is also healthy->9
* False Negatives (Type 2 error)>> the values which we predicted as healthy but the actual value is parkinsons->4

In [None]:
# classification_report expects (y_true, y_pred). The ground-truth labels go
# first; with the arguments swapped, precision and recall are reported under
# each other's names.
report_dt = classification_report(test_labels, y_predict)
print("Classification report for Decision Tree Classifier")
print(report_dt)

In [None]:
# ROC points and area under the curve, computed from the predicted class labels.
fpr, tpr, thresholds = roc_curve(y_true=test_labels, y_score=y_predict)
dt_model_auc = auc(x=fpr, y=tpr)
print(dt_model_auc)

# 7. Use regularization parameters of max_depth, min_sample_leaf to recreate the model. What is the impact on the model accuracy? How does regularization help? (20 points)

# Regularizing the max_depth parameter

In [None]:
# Randomised search over tree depth, scored with 5-fold cross-validation.
# scipy's randint(1, 10) draws integers from 1 to 9 (upper bound exclusive).
param_dist = {"max_depth": randint(1, 10),
              "criterion": ["entropy"],
              "random_state": [7]}
dt = DecisionTreeClassifier()
# random_state pins the sampled candidates so the reported best parameters are
# repeatable from run to run.
dt_model_cv = RandomizedSearchCV(dt, param_dist, cv=5, random_state=7)
# Tune on the training split only: searching on the full X, y would let the
# held-out test rows influence the chosen hyper-parameters.
dt_model_cv.fit(train_set, train_labels)
print("Tuned Decision Tree Parameters:", dt_model_cv.best_params_)
print("Best score is :", dt_model_cv.best_score_)

Creating decision tree model with regularized max_depth

In [None]:
# Entropy tree limited to a depth of two levels.
r_dt_model_depth = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    random_state=7,
)
r_dt_model_depth.fit(X=train_set, y=train_labels)

In [None]:
# Test-set predictions and accuracy of the depth-limited tree.
y_predict_depth = r_dt_model_depth.predict(X=test_set)
score_depth = r_dt_model_depth.score(X=test_set, y=test_labels)

print("accuracy score after regularizing max_depth parameter", score_depth)

In [None]:
# Predicted (rows) versus actual (columns) counts for the depth-limited tree.
cm_depth = pd.crosstab(
    index=y_predict_depth,
    columns=test_labels,
    rownames=['Predicted'],
    colnames=['Actual'],
    margins=True,
)
print("Cross Tab:")
print(cm_depth)

* True positives >> the values which we predicted as parkinsons and the actual is also parkinsons->42
* False positives (Type 1 error)>> the values which we predicted as parkinsons but the actual value is healthy->4
* True Negatives -> the values which we predicted as healthy and the actual is also healthy->8
* False Negatives (Type 2 error)>> the values which we predicted as healthy but the actual value is parkinsons->5

In [None]:
# classification_report expects (y_true, y_pred). The ground-truth labels go
# first; with the arguments swapped, precision and recall are reported under
# each other's names.
report_dt_depth = classification_report(test_labels, y_predict_depth)
print("Classification report for Decision Tree Classifier")
print(report_dt_depth)

In [None]:
# ROC points and AUC for the depth-limited tree, from predicted class labels.
fpr, tpr, thresholds = roc_curve(y_true=test_labels, y_score=y_predict_depth)
dt_model_depth_auc = auc(x=fpr, y=tpr)
print(dt_model_depth_auc)

Conclusion: The max_depth regularization didn't improve the model score or the AUC score.

Normally regularization would help in reducing the variance which would end up in higher model scores. 

# Regularizing the min_samples_leaf parameter

In [None]:
# Randomised search over the minimum leaf size, scored with 5-fold
# cross-validation. scipy's randint(1, 10) draws integers from 1 to 9
# (upper bound exclusive).
param_dist = {"min_samples_leaf": randint(1, 10),
              "criterion": ["entropy"],
              "random_state": [7]}
dt = DecisionTreeClassifier()
# random_state pins the sampled candidates so the reported best parameters are
# repeatable from run to run.
dt_model_cv = RandomizedSearchCV(dt, param_dist, cv=5, random_state=7)
# Tune on the training split only: searching on the full X, y would let the
# held-out test rows influence the chosen hyper-parameters.
dt_model_cv.fit(train_set, train_labels)
print("Tuned Decision Tree Parameters:", dt_model_cv.best_params_)
print("Best score is :", dt_model_cv.best_score_)

creating decision tree model with regularized min_samples_leaf

In [None]:
# Entropy tree in which every leaf must hold at least two training samples.
r_dt_model_leaf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_leaf=2,
    random_state=7,
)
r_dt_model_leaf.fit(X=train_set, y=train_labels)

In [None]:
# Test-set predictions and accuracy of the leaf-size-limited tree.
y_predict_leaf = r_dt_model_leaf.predict(X=test_set)
score_leaf = r_dt_model_leaf.score(X=test_set, y=test_labels)

print("accuracy score after regularizing min_samples_leaf", score_leaf)

In [None]:
# Predicted (rows) versus actual (columns) counts for the leaf-size-limited tree.
cm_leaf = pd.crosstab(
    index=y_predict_leaf,
    columns=test_labels,
    rownames=['Predicted'],
    colnames=['Actual'],
    margins=True,
)
print("Cross Tab:")
print(cm_leaf)

* True positives >> the values which we predicted as parkinsons and the actual is also parkinsons->42
* False positives (Type 1 error)>> the values which we predicted as parkinsons but the actual value is healthy->2
* True Negatives -> the values which we predicted as healthy and the actual is also healthy->10
* False Negatives (Type 2 error)>> the values which we predicted as healthy but the actual value is parkinsons->5

In [None]:
# classification_report expects (y_true, y_pred). The ground-truth labels go
# first; with the arguments swapped, precision and recall are reported under
# each other's names.
report_dt_leaf = classification_report(test_labels, y_predict_leaf)
print("Classification report for Decision Tree Classifier")
print(report_dt_leaf)

In [None]:
# ROC points and AUC for the leaf-size-limited tree, from predicted class labels.
fpr, tpr, thresholds = roc_curve(y_true=test_labels, y_score=y_predict_leaf)
dt_model_leaf_auc = auc(x=fpr, y=tpr)
print(dt_model_leaf_auc)

Conclusion: Regularizing the min_samples_leaf parameter didn't improve the model score, but it improved the AUC score a bit.

# Regularizing the min_samples_leaf and max_depth parameters

In [None]:
# Joint randomised search over tree depth and minimum leaf size, scored with
# 5-fold cross-validation. scipy's randint(1, 10) draws integers from 1 to 9
# (upper bound exclusive).
param_dist = {"max_depth": randint(1, 10),
              "min_samples_leaf": randint(1, 10),
              "criterion": ["entropy"],
              "random_state": [7]}
dt = DecisionTreeClassifier()
# random_state pins the sampled candidates so the reported best parameters are
# repeatable from run to run.
dt_model_cv = RandomizedSearchCV(dt, param_dist, cv=5, random_state=7)
# Tune on the training split only: searching on the full X, y would let the
# held-out test rows influence the chosen hyper-parameters.
dt_model_cv.fit(train_set, train_labels)
print("Tuned Decision Tree Parameters:", dt_model_cv.best_params_)
print("Best score is :", dt_model_cv.best_score_)

In [None]:
# Entropy tree constrained on both depth and minimum leaf size.
r_dt_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=1,
    random_state=7,
)
r_dt_model.fit(X=train_set, y=train_labels)

In [None]:
# Test-set predictions and accuracy of the tree with both constraints applied.
y_predict_depth_leaf = r_dt_model.predict(X=test_set)
score_depth_leaf = r_dt_model.score(X=test_set, y=test_labels)

print("accuracy score after regularizing both the parametes", score_depth_leaf)

In [None]:
# Predicted (rows) versus actual (columns) counts for the doubly-constrained tree.
cm_leaf_depth = pd.crosstab(
    index=y_predict_depth_leaf,
    columns=test_labels,
    rownames=['Predicted'],
    colnames=['Actual'],
    margins=True,
)
print("Cross Tab:")
print(cm_leaf_depth)

* True positives >> the values which we predicted as parkinsons and the actual is also parkinsons->45
* False positives (Type 1 error)>> the values which we predicted as parkinsons but the actual value is healthy->3
* True Negatives >> the values which we predicted as healthy and the actual is also healthy->9
* False Negatives (Type 2 error)>> the values which we predicted as healthy but the actual value is parkinsons->2

In [None]:
# classification_report expects (y_true, y_pred). The ground-truth labels go
# first; with the arguments swapped, precision and recall are reported under
# each other's names.
report_dt_depth_leaf = classification_report(test_labels, y_predict_depth_leaf)
print("Classification report for Decision Tree Classifier")
print(report_dt_depth_leaf)

In [None]:
# ROC points and AUC for the doubly-constrained tree, from predicted class labels.
fpr, tpr, thresholds = roc_curve(y_true=test_labels, y_score=y_predict_depth_leaf)
dt_model_depth_leaf_auc = auc(x=fpr, y=tpr)
print(dt_model_depth_leaf_auc)

Conclusion: The regularization of min_samples_leaf and max_depth parameters improved the model

Overall conclusion: Regularizing min_samples_leaf and regularizing both parameters together improved the model, while regularizing max_depth alone did not improve model performance.

# 8.Next implement the decision tree using Random Forest. What is the optimal number of trees that gives the best result? (10 points)

# Random Forest:

In [None]:
# Baseline random forest with the library's default number of trees.
# fit() returns the estimator itself, so construction and fitting are chained.
rfcl = RandomForestClassifier(
    criterion='entropy',
    random_state=7,
).fit(X=train_set, y=train_labels)
y_predict_rfcl = rfcl.predict(X=test_set)
print(rfcl)

In [None]:
# Predicted (rows) versus actual (columns) counts for the baseline random forest.
cm_rfcl = pd.crosstab(
    index=y_predict_rfcl,
    columns=test_labels,
    rownames=['Predicted'],
    colnames=['Actual'],
    margins=True,
)
print("Cross Tab:")
print(cm_rfcl)

* True positives >> the values which we predicted as parkinsons and the actual is also parkinsons->45
* False positives (Type 1 error)>> the values which we predicted as parkinsons but the actual value is healthy->3
* True Negatives >> the values which we predicted as healthy and the actual is also healthy->9
* False Negatives (Type 2 error)>> the values which we predicted as healthy but the actual value is parkinsons->2

In [None]:
# classification_report expects (y_true, y_pred). The ground-truth labels go
# first; with the arguments swapped, precision and recall are reported under
# each other's names.
report_rfcl = classification_report(test_labels, y_predict_rfcl)
# The printed heading names the model actually being evaluated (a random forest).
print("Classification report for Random Forest Classifier")
print(report_rfcl)

In [None]:
# ROC points and AUC for the baseline random forest, from predicted class labels.
fpr, tpr, thresholds = roc_curve(y_true=test_labels, y_score=y_predict_rfcl)
rfcl_model_auc = auc(x=fpr, y=tpr)
print(rfcl_model_auc)

Finding the optimal number of trees which gives the best result

In [None]:
# Randomised search over the number of trees, scored with 5-fold
# cross-validation. scipy's randint(1, 10) draws integers from 1 to 9
# (upper bound exclusive), so at most nine trees are ever tried.
param_dist = {"n_estimators": randint(1, 10),
              "criterion": ["entropy"],
              "random_state": [7]}
# random_state pins the sampled candidates so the reported best parameters are
# repeatable from run to run.
rf_cv = RandomizedSearchCV(rfcl, param_dist, cv=5, random_state=7)

# Tune on the training split only: searching on the full X, y would let the
# held-out test rows influence the chosen number of trees.
rf_cv.fit(train_set, train_labels)
print("Tuned Random forest Parameters:", rf_cv.best_params_)
print("Best score is ", rf_cv.best_score_)

# Regularized random forest with optimal number of trees


In [None]:
# Random forest rebuilt with seven trees. Note that `rfcl` is rebound here, and
# `r_rfcl` refers to the same fitted estimator because fit() returns it.
rfcl = RandomForestClassifier(
    n_estimators=7,
    criterion='entropy',
    random_state=7,
)
r_rfcl = rfcl.fit(X=train_set, y=train_labels)
print(r_rfcl)

In [None]:
# Test-set predictions and accuracy of the seven-tree forest.
y_predict_r_rfcl = rfcl.predict(X=test_set)
r_rfcl_test_score = rfcl.score(X=test_set, y=test_labels)
print("testscore", r_rfcl_test_score)

In [None]:
# Predicted (rows) versus actual (columns) counts for the tuned random forest.
cm_r_rfcl = pd.crosstab(y_predict_r_rfcl, test_labels, rownames=['Predicted'], colnames=['Actual'], margins=True)
print("Cross Tab:")
# Print the table built just above; the earlier `cm_rfcl` belongs to the
# baseline forest and would show stale numbers here.
print(cm_r_rfcl)

* True positives >> the values which we predicted as parkinsons and the actual is also parkinsons->45
* False positives (Type 1 error)>> the values which we predicted as parkinsons but the actual value is healthy->3
* True Negatives >> the values which we predicted as healthy and the actual is also healthy->9
* False Negatives (Type 2 error)>> the values which we predicted as healthy but the actual value is parkinsons->2

In [None]:
# classification_report expects (y_true, y_pred). The ground-truth labels go
# first; with the arguments swapped, precision and recall are reported under
# each other's names.
report_r_rfcl = classification_report(test_labels, y_predict_r_rfcl)
# The printed heading names the model actually being evaluated (a random forest).
print("Classification report for Random Forest Classifier")
print(report_r_rfcl)

In [None]:
# ROC points and AUC for the seven-tree forest, from predicted class labels.
fpr, tpr, thresholds = roc_curve(y_true=test_labels, y_score=y_predict_r_rfcl)
r_rfcl_model_auc = auc(x=fpr, y=tpr)
print(r_rfcl_model_auc)

# Summary

* Tuning the regularization parameters max_depth and min_samples_leaf with RandomizedSearchCV gave a slight improvement in model performance: the model score went from 0.88 before regularization to 0.91 after, and the decision-tree AUC went from 0.83 to 0.85. The best parameter values were max_depth = 3 and min_samples_leaf = 1.

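* Model score and AUC score after regularizing only the min_samples_leaf param: 0.8813559322033898, ≈0.8635 (AUC recomputed from the min_samples_leaf cross-tab as (42/47 + 10/12) / 2; the previously listed 0.8324468085106383 is the AUC of the unregularized tree)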

* Model score and auc score after regularizing max_depth param:0.847457627118644,0.7801418439716312

* Implementing the decision tree using Random Forest, we found that 7 is the optimal number of trees; the model score is 0.91 and the AUC is 0.85