In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the speech-feature CSV, relative to the notebook's working directory.
DATA_PATH = '../input/parkinsons-disease-speech-signal-features/pd_speech_features.csv'

# Load the whole file into a single DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [3]:
# Display (n_rows, n_columns) of the loaded frame.
df.shape

In [5]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [6]:
# Total number of missing cells across the whole frame
# (per-column counts first, then summed into one scalar).
df.isna().sum().sum()

#there are no null values

In [7]:
# Count plot of the target column: how many samples fall in each class.
# Global seaborn styling is applied first so it affects the figure below.
sns.set_style('whitegrid')
sns.set_context('paper')
sns.set_palette('GnBu_d')

a = sns.catplot(data=df, x='class', kind='count')
a.set(xlabel='Have Parkinson', ylabel='Number of Samples')
# y > 1 lifts the title above the axes so it does not overlap the plot area.
a.fig.suptitle('Number of Samples in Each Class', y=1.03)
plt.show()

- There are no correlated, duplicated, or constant features in this data, so let's do some PCA and see whether any constant feature values exist.

# PCA

In [8]:
# Fit a full PCA and plot the cumulative explained-variance ratio, to see
# how many components are needed before the curve flattens out.
#
# The target column 'class' is dropped first so the label is not part of the
# decomposition.
# NOTE(review): the features are not standardized at this point, so columns
# with large numeric ranges will dominate the leading components.
pca_features = df.drop(columns='class')
pca = PCA().fit(pca_features)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = cumulative_variance.shape[0]

plt.figure(figsize=(10,4))
# Plot against 1..n so the x value really is the number of components kept.
plt.plot(np.arange(1, n_components + 1), cumulative_variance)
# xlim takes (left, right) only; the upper bound follows the fitted PCA
# instead of a hard-coded column count.
plt.xlim(0, n_components)
plt.grid()
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [9]:
import plotly.express as px

# Separate the predictors from the target column.
x = df.drop('class', axis=1)
y = df['class']

# Fit PCA on the predictors only. Fitting on the full frame would include
# the 'class' label in the decomposition. A fresh PCA object is created here
# so this cell does not depend on state left behind by an earlier cell.
pca = PCA().fit(x)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

# Interactive area chart of cumulative explained variance vs. component count
# (x runs from 1 to the number of fitted components).
px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

In [10]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of the copied frame with a min-max scaler.
# The result `df_` is a NumPy array, not a DataFrame.
# NOTE(review): the scaler is fitted on all columns of `df_copy`, which
# includes the target column 'class', and on all rows (no train/test split
# has been made yet).
scaler = MinMaxScaler()
scaler.fit(df_copy)
df_ = scaler.transform(df_copy)

In [12]:
# Wrap the scaled array back into a DataFrame, reusing the original column
# names (the row index is a fresh default index, not `df.index`).
df_scaled = pd.DataFrame(data=df_, columns=df.columns)

In [13]:
# Preview the first five rows of the scaled data.
df_scaled.head(n=5)

# RESAMPLING

In [14]:
# Predictors: every scaled column except the target.
X = df_scaled.drop(columns='class')
# Target: the scaled 'class' column.
Y = df_scaled['class']

In [15]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class so both classes have the same number of rows.
# A fixed random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(random_state=41)
X_, y_ = smote.fit_resample(X, Y)

# The resampled data lives in `X_` / `y_`, which is what the following cells
# use. The previous `X, Y = pd.DataFrame(x, y_)` line was removed: it passed
# the resampled labels as an index for the unscaled frame `x` and then tried
# to unpack a DataFrame into two names, which raises instead of converting
# anything.
#
# NOTE(review): oversampling is done before the train/test split in the next
# cell, so synthetic training rows can be interpolated from rows that end up
# in the test set. Consider splitting first and resampling the training part
# only.

# Model Building

In [16]:
from sklearn.model_selection import train_test_split

# Stratified, shuffled split of the resampled data with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y_,
    random_state=41,
    shuffle=True,
    stratify=y_,
)

In [17]:
# Show how many rows each class has after resampling.
y_.value_counts()

In [19]:
pip install pycaret --quiet

In [22]:
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,recall_score,precision_score,cohen_kappa_score
from pycaret.classification import *

In [24]:
# Initialise the PyCaret classification experiment on the scaled frame, with
# 'class' as the target. A fixed session_id seeds PyCaret's internal random
# operations so model comparison and tuning results are reproducible.
# NOTE(review): this uses `df_scaled`, not the SMOTE-resampled `X_` / `y_`
# built earlier — confirm that is intended.
py = setup(data = df_scaled, target = 'class', session_id = 41)

Certain models are excluded from the comparison because of their longer run-time. To include them, set the turbo parameter to False.

In [25]:
# Run PyCaret's model comparison with its default settings and display the
# result.
compare_models()

In [27]:
# Repeat the comparison ranked by F1 and ask for the three best models.
compare_models(sort='F1', n_select=3)

The “create_model” function takes in just the model ID as a string and performs the task.

In [28]:
# Create the model registered under the ID 'xgboost'.
# NOTE(review): the variable name `xgboost` is the same as the xgboost
# package name and would shadow it if that package were imported here.
xgboost = create_model('xgboost')

In [29]:
# Hyper-parameter tuning of the XGBoost model created above.
# The interpretation plot is produced in the next cell, so it is not also
# generated here (it was previously computed twice in a row).
tuned_xg = tune_model(xgboost)

In [30]:
# Display PyCaret's interpretation output for the tuned model.
interpret_model(tuned_xg)

In [31]:
# Score the tuned model without passing explicit data.
# Presumably this evaluates on PyCaret's internal hold-out split — confirm
# against the PyCaret documentation.
predict_model(tuned_xg)

In [32]:
# Score the tuned model on the `X_test` features from the earlier
# train/test split.
# NOTE(review): `X_test` comes from the SMOTE-resampled data, while the model
# was trained inside PyCaret on `df_scaled`; rows may overlap with the
# training data, so treat these predictions with care.
predictions = predict_model(tuned_xg, data = X_test)

In [None]:
# Attach the predicted label to the rows that were actually scored.
#
# `df_` is the NumPy array returned by `scaler.fit_transform`, so it cannot
# take a string-keyed column, and the predictions only cover `X_test`.
# The labels are therefore added to a new frame built from `X_test`.
#
# PyCaret 2.x names the prediction column 'Label'; newer releases use
# 'prediction_label'. Accept either.
label_column = 'Label' if 'Label' in predictions.columns else 'prediction_label'

# Values are assigned positionally; assumes predict_model keeps the row order
# of the data it was given — TODO confirm.
X_test_scored = pd.DataFrame(X_test).assign(
    Status=predictions[label_column].to_numpy()
)
X_test_scored.head()