In [1]:
# %matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Parkinson speech dataset
df = pd.read_csv('train_data.txt') # 读取数据
df.columns = ['subject id', 'jitter_local', 'jitter_local_absolute',
               'jitter_rap', 'jitter_ppq5', 'jitter_ddp', 'shimmer_local',
               'shimmer_local_db', 'shimmer_apq3', 'shimmer_apq5',
               'shimmer_apq11', 'shimmer_data', 'AC', 'NTH', 'HTN',
               'median_pitch', 'mean_pitch', 'standard_dev_pitch',
               'min_pitch', 'max_pitch', 'num_pulses', 'num_periods',
               'mean_period','standard_dev_period', 'frac_locally_unvoiced_frames',
               'num_voice_breaks','degree_of_voice_breaks', 'UPDRS', 'class'] # 给数据赋列名

df = df.drop(columns=['subject id', 'class']) # 去掉列名为'subject id'和'class'的列

In [3]:
gps = [
    ['jitter_local', 'jitter_local_absolute', 'jitter_rap', 'jitter_ppq5', 'jitter_ddp'],
    ['shimmer_local', 'shimmer_local_db', 'shimmer_apq3', 'shimmer_apq5', 'shimmer_apq11', 'shimmer_data'],
    ['median_pitch', 'mean_pitch', 'standard_dev_pitch', 'min_pitch', 'max_pitch'],
    ['AC', 'NTH', 'HTN'],
    ['num_pulses', 'num_periods', 'mean_period','standard_dev_period'],
    ['frac_locally_unvoiced_frames', 'num_voice_breaks','degree_of_voice_breaks'],
] # 定义一个列表，列表中又包含多个列表，这些列表中的元素和df的列名相同。
UPDRS = '1', '5', '8', '11', '12', '16', '20', '23', '24', '26', '31', '32', '40', '46', '55'

In [4]:
X = df.iloc[:, :-1] # 取df除最后一列的所有列作为x
y = df.iloc[:, -1] # 取df的最后一列作为y
X = StandardScaler().fit_transform(X) # 将fit和transform放在一起 先fit后transform

In [5]:
def pca(X): # 定义pca函数
  scaler = StandardScaler() # 用于将数据转换为均值为0，方差为1的标准正态分布。
  X_std = scaler.fit_transform(X)
  pca = PCA(n_components=X_std.shape[1])
  pca.fit(X_std)
  evs = pca.explained_variance_ratio_
  tot_ev = 0
  cnt = 0
  for i in evs:
    tot_ev += i
    cnt += 1
    if tot_ev >= 0.8:
      break
  ret = pca.transform(X_std)
  print(f"for {list(X.columns)}, keep {cnt} features to get an explained_variance_ratio of {tot_ev}")
  return ret[:,:cnt]

In [6]:
dfs_pca = []
for gp in gps:
  dfs_pca.append(pca(df[gp]).copy())

for ['jitter_local', 'jitter_local_absolute', 'jitter_rap', 'jitter_ppq5', 'jitter_ddp'], keep 1 features to get an explained_variance_ratio of 0.8898027996784917
for ['shimmer_local', 'shimmer_local_db', 'shimmer_apq3', 'shimmer_apq5', 'shimmer_apq11', 'shimmer_data'], keep 1 features to get an explained_variance_ratio of 0.8121322932673971
for ['median_pitch', 'mean_pitch', 'standard_dev_pitch', 'min_pitch', 'max_pitch'], keep 2 features to get an explained_variance_ratio of 0.9404758200268596
for ['AC', 'NTH', 'HTN'], keep 1 features to get an explained_variance_ratio of 0.9464691832031871
for ['num_pulses', 'num_periods', 'mean_period', 'standard_dev_period'], keep 3 features to get an explained_variance_ratio of 0.9998312651771138
for ['frac_locally_unvoiced_frames', 'num_voice_breaks', 'degree_of_voice_breaks'], keep 2 features to get an explained_variance_ratio of 0.9246679755940611


In [7]:
df_pca = np.concatenate(dfs_pca, axis=1) 
print(df_pca.shape) # 使用pca剩下10个特征

(1039, 10)


In [9]:
# 过采样：对降维的数据进行过采样
from imblearn.over_sampling import SMOTE

# 创建SMOTE对象
smote = SMOTE(random_state=42)
# 进行过采样
X_, y_ = smote.fit_resample(df_pca, y)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
# 划分测试集和训练集
from sklearn.model_selection import train_test_split
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, random_state=47)

In [None]:
y_labels = list(df['UPDRS'].value_counts().index)
y_rates = [i/X.shape[0]*100 for i in list(df['UPDRS'].value_counts())]
print(y_rates)

In [None]:
print(X.shape[0])
print(sum(list(df['UPDRS'].value_counts())))

In [None]:
from imblearn.over_sampling import SMOTE

# 创建SMOTE对象
smote = SMOTE(random_state=42)
# 进行过采样
X, y = smote.fit_resample(X, y)

In [None]:
y_resample_rates = [i/X.shape[0]*100 for i in list(y.value_counts())]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 数据
categories = y_labels
data1 = y_rates
data2 = y_resample_rates

# 设置柱状图的宽度
bar_width = 0.35

# 创建一个范围用于横向调整柱状的位置
x = np.arange(len(categories))

# 绘制柱状图
plt.bar(x - bar_width/2, data1, width=bar_width, label='Raw data proportions')
plt.bar(x + bar_width/2, data2, width=bar_width, label='Oversampled data proportions')

# 添加标题和坐标轴标签
plt.title("Data distribution by category")
plt.xlabel("Category index")
plt.ylabel("Proportion (%)")

# 设置 x 轴刻度标签
plt.xticks(x, categories)

# 添加图例
plt.legend()

# 显示图像
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# print(classification_report(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='Blues', display_labels=UPDRS);

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=47)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=47)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='Blues', display_labels = UPDRS);

In [None]:
rf.feature_importances_

In [None]:
order = np.argsort(-rf.feature_importances_)

In [None]:
dfi = pd.DataFrame([tree.feature_importances_[order] for tree in rf.estimators_],
                   columns=df.columns[order])
dfi.head()

In [None]:
plt.figure(figsize=(15, 4))
xpos = range(dfi.shape[1])
plt.bar(xpos, dfi.mean(), yerr=dfi.std())
plt.xticks(xpos, dfi.columns)
plt.ylabel('Importance')
plt.xlabel('Feature');

In [None]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
linear = svm.SVC(kernel='linear',decision_function_shape ='ovo')
linear.fit(X_train, y_train)
linear_pred = linear.predict(X_test)
accuracy_score(y_test, linear_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, linear_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
rbf = svm.SVC(kernel='rbf', decision_function_shape ='ovo').fit(X_train, y_train)
rbf_pred = rbf.predict(X_test)
accuracy_score(y_test, rbf_pred)

In [None]:
sig = svm.SVC(kernel='sigmoid', decision_function_shape ='ovo').fit(X_train, y_train)
sig_pred = sig.predict(X_test)
accuracy_score(y_test, sig_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, sig_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
models = [knn, dt, rf, linear, rbf, sig]
names = ['K-Nearest Neighbors', 'Decision Tree', 'Random Forest', 'SVM (Linear)', 'SVM (RBF)', 'SVM (Sigmoid)']
scores = [accuracy_score(y_test, x.predict(X_test)) for x in models]
scores

In [None]:
plt.barh(range(6), [accuracy_score(y_test, x.predict(X_test)) for x in models])
accuracies = [accuracy_score(y_test, x.predict(X_test)) for x in models]
for index, value in enumerate(accuracies):
    plt.text(value, index, f'{value:.2f}')
plt.xticks()
plt.yticks(range(6), names)
plt.title('Prediction accuracy');

In [None]:
# 对pca降维后的数据进行分析
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split
knn = KNeighborsClassifier()
knn.fit(X_train_, y_train_)
y_pred = knn.predict(X_test_)
accuracy_score(y_test_, y_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test_, y_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
# DTC
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=47)
dt.fit(X_train_, y_train_)
y_pred = dt.predict(X_test_)
accuracy_score(y_test_, y_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test_, y_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
# RFC
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=47)
rf.fit(X_train_, y_train_)
y_pred = rf.predict(X_test_)
accuracy_score(y_test_, y_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test_, y_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
# SVC(Linear)
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
linear = svm.SVC(kernel='linear',decision_function_shape ='ovo')
linear.fit(X_train_, y_train_)
linear_pred = linear.predict(X_test_)
accuracy_score(y_test_, linear_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test_, linear_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
# SVC(RBF)
rbf = svm.SVC(kernel='rbf', decision_function_shape ='ovo').fit(X_train_, y_train_)
rbf_pred = rbf.predict(X_test_)
accuracy_score(y_test_, rbf_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test_, rbf_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
sig = svm.SVC(kernel='sigmoid', decision_function_shape ='ovo').fit(X_train_, y_train_)
sig_pred = sig.predict(X_test_)
accuracy_score(y_test_, sig_pred)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test_, sig_pred, cmap = 'Blues', display_labels=UPDRS);

In [None]:
models = [knn, dt, rf, linear, rbf, sig]
names = ['K-Nearest Neighbors', 'Decision Tree', 'Random Forest', 'SVM (Linear)', 'SVM (RBF)', 'SVM (Sigmoid)']
scores_ = [accuracy_score(y_test_, x.predict(X_test_)) for x in models]
scores_

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 数据
categories = names
data1 = scores
data2 = scores_

# 设置柱状图的宽度
bar_width = 0.35

# 创建一个范围用于横向调整柱状的位置
x = np.arange(len(categories))

# 绘制柱状图
plt.bar(x - bar_width/2, data1, width=bar_width, label='Original data')
plt.bar(x + bar_width/2, data2, width=bar_width, label='PCA-reduced data')

# 添加标题和坐标轴标签
plt.title("Accuracy of different models")
# plt.xlabel("Model name")
plt.ylabel("Accuracy")

# 设置 x 轴刻度标签
plt.xticks(x, categories)
plt.xticks(rotation=45)


# 添加图例
plt.legend()

# 显示图像
plt.show()