# CA4

### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


### Reading data

In [2]:

df = pd.read_csv('assets/train.csv', index_col=1)
df = df.drop(df.columns[0], axis=1)

### Data exploration and visualisation

In [3]:
# to get all the columns
pd.set_option('display.max_columns', None)
              
# Print the mean, median, and standard deviation of each column
print(df.head())
print(df.describe())


       AFP (ng/mL)    ALP (U/L)   ALT (U/L)  AST (U/L)  Age  Albumin (g/dL)  \
index                                                                         
1124     13.571425  1653.138489   40.405592  45.598427    4        4.477126   
1188     13.649342  1940.518305   21.336986  34.064095   55        3.190724   
530      10.898943  1557.369920   29.665496  16.044488   30        4.506351   
686      13.872275  1273.840525  142.418649  64.204257    0        3.665655   
296      10.102457  1461.622515   22.437303  23.940205   59        4.005109   

      Alcohol_Use (yes/no)  Bilirubin (mg/dL)  CRP (mg/L) Diabetes (yes/no)  \
index                                                                         
1124                   yes           0.541997    1.002121                no   
1188                   yes           1.199063    0.582746                no   
530                     no           0.740952    1.670375                no   
686                    yes           1.939879    1.

### Data cleaning

#### Turn categorical variables into numerical

In [4]:
# Turn the Categorical variables into number variables

label_to_number = LabelEncoder()
df['Alcohol_Use (yes/no)'] = label_to_number.fit_transform(df['Alcohol_Use (yes/no)'])
df['Diabetes (yes/no)'] = label_to_number.fit_transform(df['Diabetes (yes/no)'])
df['Obesity (yes/no)'] = label_to_number.fit_transform(df['Obesity (yes/no)'])
df['Gender'] = label_to_number.fit_transform(df['Gender'])

# Take this last to be able to transform back.
df['Diagnosis'] = label_to_number.fit_transform(df['Diagnosis']) # ".fit_transform" is short for ".fit" and then ".transform"

print(df.head())



       AFP (ng/mL)    ALP (U/L)   ALT (U/L)  AST (U/L)  Age  Albumin (g/dL)  \
index                                                                         
1124     13.571425  1653.138489   40.405592  45.598427    4        4.477126   
1188     13.649342  1940.518305   21.336986  34.064095   55        3.190724   
530      10.898943  1557.369920   29.665496  16.044488   30        4.506351   
686      13.872275  1273.840525  142.418649  64.204257    0        3.665655   
296      10.102457  1461.622515   22.437303  23.940205   59        4.005109   

       Alcohol_Use (yes/no)  Bilirubin (mg/dL)  CRP (mg/L)  Diabetes (yes/no)  \
index                                                                           
1124                      1           0.541997    1.002121                  0   
1188                      1           1.199063    0.582746                  0   
530                       0           0.740952    1.670375                  0   
686                       1           1.9

#### Remove missing values and outliers



In [5]:
# Check for nan values
print(df.isnull().values.any())

for i in df.columns:
    z_score = (df[i] - np.mean(df[i])) / np.std(df[i])    
    df.loc[np.abs(z_score) > 3, i] = df[i].mean()


print(df.describe())

False


  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()


       AFP (ng/mL)    ALP (U/L)   ALT (U/L)   AST (U/L)         Age  \
count   703.000000   703.000000  703.000000  703.000000  703.000000   
mean     15.329483  1608.100898   58.091552   46.276189   27.206259   
std      25.124926   485.933420   36.089107   29.855841   23.632185   
min     -22.836893   546.609117  -14.313734  -16.215535    0.000000   
25%       8.029213  1229.735749   29.939330   24.163363    5.000000   
50%      10.195074  1553.771408   48.857238   38.720602   23.000000   
75%      12.656547  1918.947294   84.356212   69.620080   45.000000   
max     186.999053  3150.963817  164.615953  129.741247   86.000000   

       Albumin (g/dL)  Alcohol_Use (yes/no)  Bilirubin (mg/dL)  CRP (mg/L)  \
count      703.000000            703.000000         703.000000  703.000000   
mean         4.167308              0.321479           1.044531    1.786713   
std          0.412962              0.467377           0.594682    1.675167   
min          2.948489              0.000000     

### Data preprocessing and visualisation

#### Split data

In [8]:
y = df['Diagnosis']
X = df.drop(columns=['Diagnosis'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3,
    stratify=y, random_state=43)

### Modelling

In [21]:
pipeline_svc = Pipeline([('scaler', StandardScaler()), 
                        ('pca', PCA()), # Choose the number of PCA components that cover 95% of the variance.
                        ('svc', SVC(max_iter=100000))])

C_range     = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] # For regularization parameter C.
gamma_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]         # For scaling parameter gamma in rbf-kernel.
random_state = [1, 10, 50, 100, 200, 500, 700, 1000]
kernal_range = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

PCA_range = [0.85, 0.90, 0.95]

# Does anyone know why we have split up the parameter-grid into a list of dictionaries here?
param_grid = [{'svc__C': C_range, 'svc__kernel': kernal_range, 'svc__gamma': gamma_range, 'svc__random_state': random_state, 
               'pca__n_components': PCA_range}]

# Set up the grid search
gs = GridSearchCV(estimator=pipeline_svc, 
                  param_grid=param_grid, 
                  scoring='f1_macro', 
                  cv=10,
                  n_jobs=-1,
                  verbose=1)

gs_res = gs.fit(X_train, y_train)
# Print results
print(gs_res.best_score_)
print(gs_res.best_params_)


"""
# Now with random forest
pipeline_rf = Pipeline([('scaler', StandardScaler()), 
                        ('pca', PCA(0.90)), # Choose the number of PCA components that cover 95% of the variance.
                        ('rf', RandomForestClassifier(random_state=1))])

param_grid = {'rf__n_estimators': [100, 200, 300, 400, 500],
                'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
gs = GridSearchCV(estimator=pipeline_rf, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=-1)

gs_res = gs.fit(X_train, y_train)
# Print results
print(gs_res.best_score_)
print(gs_res.best_params_)

# Now with knn
from sklearn.neighbors import KNeighborsClassifier
pipeline_knn = Pipeline([('scaler', StandardScaler()), 
                        ('pca', PCA(0.90)), # Choose the number of PCA components that cover 95% of the variance.
                        ('knn', KNeighborsClassifier())])

param_grid = {'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
gs = GridSearchCV(estimator=pipeline_knn, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=-1)

gs_res = gs.fit(X_train, y_train)

# Print results
print(gs_res.best_score_)
print(gs_res.best_params_)
"""



Fitting 10 folds for each of 6720 candidates, totalling 67200 fits


13440 fits failed out of a total of 67200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
896 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/bard/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bard/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/bard/Library/Python/3.9/lib/python/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/bard/Library/Python/3.9/lib/python/site-packages/sklearn/bas

0.7246116326478999
{'pca__n_components': 0.9, 'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf', 'svc__random_state': 1}




"\n# Now with random forest\npipeline_rf = Pipeline([('scaler', StandardScaler()), \n                        ('pca', PCA(0.90)), # Choose the number of PCA components that cover 95% of the variance.\n                        ('rf', RandomForestClassifier(random_state=1))])\n\nparam_grid = {'rf__n_estimators': [100, 200, 300, 400, 500],\n                'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}\ngs = GridSearchCV(estimator=pipeline_rf, \n                  param_grid=param_grid, \n                  scoring='accuracy', \n                  cv=10,\n                  n_jobs=-1)\n\ngs_res = gs.fit(X_train, y_train)\n# Print results\nprint(gs_res.best_score_)\nprint(gs_res.best_params_)\n\n# Now with knn\nfrom sklearn.neighbors import KNeighborsClassifier\npipeline_knn = Pipeline([('scaler', StandardScaler()), \n                        ('pca', PCA(0.90)), # Choose the number of PCA components that cover 95% of the variance.\n                        ('knn', KNeighborsClassifier

In [46]:
# Now with linear regression
from sklearn.linear_model import LogisticRegression


pipeline_lr = Pipeline([('scaler', StandardScaler()), 
                        ('pca', PCA()), # Choose the number of PCA components that cover 95% of the variance.
                        ('lr', LogisticRegression(max_iter=10000))])


multi_class = ['auto']
C =  [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0] # Regularization parameter
random_state = [1, 10, 50, 100, 200, 500, 700, 1000]
PCA_range = [0.80, 0.85, 0.90, 0.95]

param_grid = [{'lr__penalty': ['l2'],
                'lr__C': C,
                'lr__solver': ['lbfgs','newton-cg','newton-cholesky','sag'],
                'lr__multi_class': multi_class,
                'lr__random_state': random_state,
                'pca__n_components': PCA_range},
                {'lr__penalty': ['l1', 'l2'],
                'lr__C': C,
                'lr__solver': ['liblinear'],
                'lr__multi_class': multi_class,
                'lr__random_state': random_state,
                'pca__n_components': PCA_range},
                {'lr__penalty': ['elasticnet','l1', 'l2'],
                'lr__C': C,
                'lr__solver': ['saga'],
                'lr__multi_class': multi_class,
                'lr__random_state': random_state,
                'pca__n_components': PCA_range},]

gs = GridSearchCV(estimator=pipeline_lr, 
                  param_grid=param_grid, 
                  scoring='f1_macro', 
                  cv=10,
                  n_jobs=-1)

# Print results
gs_res = gs.fit(X_train, y_train)
print(gs_res.best_score_)
print(gs_res.best_params_)



In [37]:
pipeline_rf = Pipeline([('scaler', StandardScaler()), 
                        ('pca', PCA(0.9)),
                        ('rf', RandomForestClassifier(random_state=42))])

param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 5, 10],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=pipeline_rf, 
                           param_grid=param_grid, 
                           scoring='f1_macro', 
                           cv=10,
                           n_jobs=-1,
                           verbose=1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best validation score:", grid_search.best_score_)

Fitting 10 folds for each of 81 candidates, totalling 810 fits
Best Parameters: {'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}
Best validation score: 0.7256734693877551


### Final evaluation

In [22]:
df = pd.read_csv('assets/train.csv')
df = df.drop(df.columns[0], axis=1)
df.set_index('index', inplace=True)

# Turn the Categorical variables into number variables
label_to_number = LabelEncoder()
df['Alcohol_Use (yes/no)'] = label_to_number.fit_transform(df['Alcohol_Use (yes/no)'])
df['Diabetes (yes/no)'] = label_to_number.fit_transform(df['Diabetes (yes/no)'])
df['Obesity (yes/no)'] = label_to_number.fit_transform(df['Obesity (yes/no)'])
df['Gender'] = label_to_number.fit_transform(df['Gender'])

# Take this last to be able to transform back.
y = label_to_number.fit_transform(df['Diagnosis']) # ".fit_transform" is short for ".fit" and then ".transform"

X = df.drop(columns=['Diagnosis'])


for i in X.columns:
    z_score = (X[i] - np.mean(X[i])) / np.std(X[i])    
    X.loc[np.abs(z_score) > 3, i] = X[i].mean()


pipeline_svc = Pipeline([('scaler', StandardScaler()), 
                        ('pca', PCA(0.9)),
                         ('svc', SVC(random_state=1, max_iter=100000, C=100, gamma=0.001, kernel='rbf'))])

pipeline_svc.fit(X, y)
print(f'Train accuracy: {pipeline_svc.score(X, y)}')



Train accuracy: 0.8534850640113798


  X.loc[np.abs(z_score) > 3, i] = X[i].mean()
  X.loc[np.abs(z_score) > 3, i] = X[i].mean()
  X.loc[np.abs(z_score) > 3, i] = X[i].mean()
  X.loc[np.abs(z_score) > 3, i] = X[i].mean()
  X.loc[np.abs(z_score) > 3, i] = X[i].mean()


### Kaggle submission

In [23]:
df = pd.read_csv("assets/test.csv")
df = df.drop(df.columns[0], axis=1)
df.set_index('index', inplace=True)

# Turn the Categorical variables into number variables
gender_mapping = {'MALE': 0, 'FEMALE': 1}
df['Gender'] = df['Gender'].map(gender_mapping)

yes_no_mapping = {'no': 0, 'yes': 1}
df['Alcohol_Use (yes/no)'] = df['Alcohol_Use (yes/no)'].map(yes_no_mapping)
df['Diabetes (yes/no)'] = df['Diabetes (yes/no)'].map(yes_no_mapping)
df['Obesity (yes/no)'] = df['Obesity (yes/no)'].map(yes_no_mapping)

for i in df.columns:
    z_score = (df[i] - np.mean(df[i])) / np.std(df[i])    
    df.loc[np.abs(z_score) > 3, i] = df[i].mean()

y = pipeline_svc.predict(df)
y = label_to_number.inverse_transform(y)
df_results = pd.DataFrame(data=y, columns=["Diagnosis"])
df_results.index.names = ["index"]
df_results.to_csv('assets/results.csv')

  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
  df.loc[np.abs(z_score) > 3, i] = df[i].mean()
