<a href="https://colab.research.google.com/github/anoukzwinkels/TM10007/blob/master/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this to use from colab environment
!pip install -q --upgrade git+https://github.com/anoukzwinkels/TM10007

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection    import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection    import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.feature_selection  import VarianceThreshold, RFECV, SelectKBest, f_classif
from sklearn.metrics            import accuracy_score
from sklearn.preprocessing      import RobustScaler
from sklearn.neighbors          import KNeighborsClassifier
from sklearn.svm                import SVC
from sklearn.decomposition      import PCA

from plotly.express             import scatter_3d, bar
from sklearn.ensemble           import RandomForestClassifier

  Building wheel for brats (setup.py) ... done


In [None]:
def def_data():
  '''
  Load the dataset and separate the input columns from the 'label' column.

  :return features:     dataframe holding every column except 'label'
  :return labels:       single-column dataframe holding the 'label' column
  '''

  # Local import of the project's data loader
  from hn.load_data import load_data

  dataset = load_data()
  label_column = 'label'

  labels = pd.DataFrame(dataset[label_column])          # the targets
  features = dataset.drop(columns=[label_column])       # all remaining columns (model input)

  return features, labels

In [None]:
def remove_outliers(X_train):
  '''
  Identify the outliers and replace them with the most extreme non-outlier value
  of the corresponding feature.

  A value counts as an outlier when it lies more than 1.5 times the interquartile
  range below the 25th or above the 75th percentile of its column. Outliers on the
  high side are replaced by the largest non-outlier value of that column, outliers
  on the low side by the smallest non-outlier value.

  :param X_train:      dataframe containing the (numeric) features of the traindata
  :return X_train:     new dataframe with the outliers corrected; the input is not modified
  '''

  X_train = X_train.copy()

  # percentiles are not meaningful without data, so there is nothing to correct
  if X_train.empty:
    return X_train

  # calculate interquartile range per feature (column)
  q25, q75 = np.percentile(X_train, [25, 75], axis=0)
  iqr = q75 - q25

  # calculate the outlier cutoff
  cut_off = iqr * 1.5
  lower, upper = q25 - cut_off, q75 + cut_off

  # identify outliers
  outliers = (X_train < lower) | (X_train > upper)

  # smallest and largest value per feature once the outliers are hidden
  inliers = X_train.mask(outliers)
  inlier_min, inlier_max = inliers.min(), inliers.max()

  # Every low outlier is below inlier_min and every high outlier is above
  # inlier_max, while all other values already lie inside that range, so a
  # column-wise clip changes exactly the outliers (in one vectorised pass).
  return X_train.clip(lower=inlier_min, upper=inlier_max, axis=1)

In [None]:
def scaling(X_train, X_test):
  '''
  Scale the train and test features with a RobustScaler that is fitted on the
  train features only.

  :param X_train:         dataframe containing the features of the traindata
  :param X_test:          dataframe containing the features of the testdata
  :return X_train:        dataframe containing the scaled features of the traindata
  :return X_test:         dataframe containing the scaled features of the testdata
  '''

  robust = RobustScaler()
  robust.fit(X_train)

  def _scaled_frame(frame):
    # rebuild a dataframe that carries the index and column labels of the input
    return pd.DataFrame(robust.transform(frame), index=frame.index, columns=frame.columns)

  return _scaled_frame(X_train), _scaled_frame(X_test)

In [None]:
def def_pca(features, labels, n_top=20):
  '''
  Fit a 3-component PCA on the features, plot the result and list the features
  with the largest loadings on each component.

  Shows two plotly figures (a bar chart of the explained variance ratio per
  component and a 3D scatter of the PCA scores coloured by label) and displays
  the returned table.

  :param features:                dataframe containing the features
  :param labels:                  dataframe containing the labels (column 'label'); it is not modified
  :param n_top:                   number of features listed per component (default 20)
  :return PC_features:            dataframe with the columns PC1, PC2 and PC3 holding the names of the
                                  n_top features with the largest loadings on that component
  '''

  pc_names = ['PC1', 'PC2', 'PC3']

  pca = PCA(n_components=len(pc_names))
  pca.fit(features)
  scores_df = pd.DataFrame(pca.transform(features), columns=pc_names)

  # Re-index a copy of the labels so they line up row by row with the scores.
  # Doing this in place would silently change the index of the caller's dataframe.
  labels = labels.reset_index(drop=True)
  df_scores = pd.concat([scores_df, labels], axis=1)

  # a leading zero is paired with the empty PC name '' below
  explained_variance = np.insert(pca.explained_variance_ratio_, 0, 0)
  # round after summing so the rounding errors do not accumulate
  cumulative_variance = np.round(np.cumsum(explained_variance), decimals=3)

  # combining the dataframe
  df_explained_variance = pd.DataFrame({
    'PC': [''] + pc_names,
    'Explained Variance': explained_variance,
    'Cumulative Variance': cumulative_variance,
  })

  fig = bar(df_explained_variance, x = 'PC', y='Explained Variance', text = 'Explained Variance', width = 800)
  fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
  fig.show()

  fig = scatter_3d(df_scores, x = 'PC1', y= 'PC2',z='PC3', color='label')
  fig.show()

  # one row per component, one column per feature
  para = pd.DataFrame(pca.components_, columns=features.columns, index=pc_names)

  # NOTE(review): nlargest ranks by signed loading, so features with a strongly
  # negative loading are not listed - rank on para.abs() if magnitude is intended.
  PC_features = pd.DataFrame({pc: para.loc[pc].nlargest(n_top).index for pc in pc_names})
  display(PC_features)

  return PC_features

In [None]:
features, labels = def_data()

# Fixed seed so that the split, and with it the PCA result, is reproducible
SEED = 42

# A single stratified train/test split is all that is used here
sss = StratifiedShuffleSplit(n_splits=1, random_state=SEED)
train_index, test_index = next(sss.split(features, labels))

X_train, X_test = features.iloc[train_index], features.iloc[test_index]
y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

# Outlier correction is currently switched off; uncomment to enable it
# X_train = remove_outliers(X_train)
X_train, X_test = scaling(X_train, X_test)
PC_features = def_pca(X_train, y_train)

Unnamed: 0,PC1,PC2,PC3
0,tf_NGTDM_Contrast,tf_GLSZM_ZoneVariance,tf_LBP_kurtosis_R15_P36
1,tf_NGTDM_Busyness,tf_NGTDM_Contrast,tf_Gabor_0.05A0.79mean
2,tf_NGTDM_Complexity,tf_GLSZM_LargeAreaHighGrayLevelEmphasis,tf_GLSZM_ZonePercentage
3,tf_GLSZM_ZonePercentage,tf_GLSZM_LargeAreaEmphasis,tf_GLRLM_ShortRunEmphasis
4,tf_GLRLM_GrayLevelVariance,tf_GLSZM_LargeAreaLowGrayLevelEmphasis,tf_GLRLM_RunLengthNonUniformityNormalized
5,tf_Gabor_0.05A0.79mean,tf_NGTDM_Busyness,tf_Gabor_0.05A0.79std
6,hf_std,hf_peak,tf_GLRLM_RunPercentage
7,tf_GLRLM_ShortRunEmphasis,sf_volume_2D,tf_Gabor_0.05A0.0kurt
8,hf_range,tf_NGTDM_Complexity,tf_Gabor_0.2A2.36skew
9,tf_Gabor_0.05A0.79skew,tf_Gabor_0.5A1.57std,tf_GLSZM_ZoneVariance


In [None]:
# Render the table of top features as LaTeX source, without the row index
PC_features.to_latex(index=False)


'\\begin{tabular}{lll}\n\\toprule\n                                       PC1 &                                      PC2 &                                        PC3 \\\\\n\\midrule\n                         tf\\_NGTDM\\_Contrast &                    tf\\_GLSZM\\_ZoneVariance &                    tf\\_LBP\\_kurtosis\\_R15\\_P36 \\\\\n                         tf\\_NGTDM\\_Busyness &                        tf\\_NGTDM\\_Contrast &                     tf\\_Gabor\\_0.05A0.79mean \\\\\n                       tf\\_NGTDM\\_Complexity &  tf\\_GLSZM\\_LargeAreaHighGrayLevelEmphasis &                    tf\\_GLSZM\\_ZonePercentage \\\\\n                   tf\\_GLSZM\\_ZonePercentage &               tf\\_GLSZM\\_LargeAreaEmphasis &                  tf\\_GLRLM\\_ShortRunEmphasis \\\\\n                tf\\_GLRLM\\_GrayLevelVariance &   tf\\_GLSZM\\_LargeAreaLowGrayLevelEmphasis &  tf\\_GLRLM\\_RunLengthNonUniformityNormalized \\\\\n                    tf\\_Gabor\\_0.05A0.79mean &                    