<a href="https://colab.research.google.com/github/cconsta1/AgeEst/blob/main/age_estimation_dataset_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing and installing all the necessary libraries**

In [None]:
!pip install scikit-optimize git+https://github.com/hyperopt/hyperopt-sklearn.git

In [None]:
# !pip freeze

In [None]:
# Google colab

from google.colab import data_table
from google.colab import files

# Turn on google.colab's data_table formatter for DataFrame output (Colab-only call).
data_table.enable_dataframe_formatter()

# hyperopt

import hyperopt

from hyperopt import tpe
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hpsklearn.components import all_classifiers, all_preprocessing, any_classifier, any_preprocessing, \
any_regressor, all_regressors


# Hyperparameter optimization

import skopt
from skopt import BayesSearchCV

# system

import os
import io

# data analysis and plotting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from scipy.stats import zscore, shapiro
from random import randint

# data processing and model validation

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, Normalizer, MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.metrics import r2_score, explained_variance_score, confusion_matrix, \
accuracy_score, classification_report, log_loss, mean_absolute_error, mean_squared_error
from math import sqrt
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedStratifiedKFold, KFold, \
LeaveOneOut, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold

# classification libraries

from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, Matern, RationalQuadratic
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, \
ExtraTreesRegressor, ExtraTreesClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier, plot_importance

import lightgbm as lgb

# Importing imputation libs. 

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

# Missing data models

from itertools import combinations
from joblib import parallel_backend

# Export models into pickle
import pickle

# Various parameter settings

#%matplotlib inline

# To install sklearn type "pip install numpy scipy scikit-learn" to the anaconda terminal

# To change scientific numbers to float
#np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increases the size of sns plots
#sns.set(rc={'figure.figsize':(12,10)})

# import sys
# !conda list Check the packages installed

# Show every column when a DataFrame is displayed (None removes pandas' column limit).
# The row limit is left at its default; uncomment the second call to lift it as well.

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# **Importing and preparing the data for the analysis**

In [None]:
# Ask for a file upload through google.colab.files; the next cell indexes the result
# by file name ('age_dataset.csv') and reads the value as raw bytes.
uploaded = files.upload()

In [None]:
# Wrap the uploaded bytes in an in-memory buffer and parse them as CSV.
# The upload must be named exactly 'age_dataset.csv', otherwise the lookup raises KeyError.
csv_buffer = io.BytesIO(uploaded['age_dataset.csv'])
raw_data = pd.read_csv(csv_buffer)

In [None]:
# Preview the first rows of the raw sheet as read from the CSV.
raw_data.head()

In [None]:
# Positional indices of the raw-sheet columns kept for the analysis, in output order:
# 2, 4, 7, then 10-25, then 38 and 41, then 44-53, and finally column 3 moved to the end.
selected_columns = [2, 4, 7, *range(10, 26), 38, 41, *range(44, 54), 3]
df = raw_data.iloc[:, selected_columns]

In [None]:
# Row 2 of the selection holds the column names; the data starts at row 3.
# NOTE: this cell rebuilds df from df, so re-run the column-selection cell before re-running it.
header_row = df.iloc[2]
data_rows = df.values[3:]

df = pd.DataFrame(data_rows, columns=header_row).astype(int)

df

In [None]:
# Summary statistics for every column of the cleaned frame.
df.describe()

In [None]:
# Derive the classification target: bin 'Age' into three groups and store the bin index.
# With pd.cut's default right-closed intervals the edges give (10, 35] -> 0, (35, 50] -> 1,
# (50, 100] -> 2. An age outside (10, 100] becomes NaN and makes the int cast below fail.
age_bin_edges = [10, 35, 50, 100]
age_group_index = pd.cut(df['Age'], bins=age_bin_edges, labels=False)

df['Age_groups'] = age_group_index
df = df.astype(int)

df

In [None]:
# Render df through Colab's DataTable: index hidden, 10 rows per page, at most 40 columns.

data_table.DataTable(df, include_index=False, num_rows_per_page=10, max_columns=40)

# **Variables dictionary**

In [None]:
# List the column names; the feature-set dictionary below refers to columns by these exact strings.
df.columns

In [None]:
# Feature sets used by every modelling loop below: label -> list of df column names.
# The label is also used (spaces replaced by underscores) in the output file names.
# Column names are copied verbatim from the data sheet, including its spellings.
# NOTE: the name `vars` shadows the Python builtin; it is kept because later cells use it.
vars = {
    "Suchey Brooks 1990": [
        'Right Phase Suchey'
        ],
    "Meindl and Lovejoy": [
        'Right 1-midlamdoid',
        '2-lambda', 
        '3-obelion', 
        '4-anterior sagital',
        '5-bregma', 
        'Right 6-midcoronal', 
        'Right 7-pterion',
        'Right 8-sphenofrontal', 
        'Right 9-inferior sphenotemporal', 
        'Right 10-superior sphenotemporal'
        ],
    "Lovejoy et al": [
        "Right Phase"
    ],
    "Buckberry and Chamberlain": [
        'Right Transverse organization',
        'Right Surface texture',
        'Right Microposity', 
        'Right Macroporositty', 
        'Right Apical changes'
        ],
    "Suchey Brooks 1990 and Lovejoy et al": [
        'Right Phase Suchey',
        'Right Phase' 
    ],
    "Suchey Brooks 1990 and Buckberry Chamberlain": [
        'Right Transverse organization',
        'Right Surface texture',
        'Right Microposity', 
        'Right Macroporositty', 
        'Right Apical changes',
        'Right Phase Suchey'
    ],
    # Every feature exactly once. Repeating a name here would make dff[value] return the
    # same column several times and feed duplicated features to the models.
    "All": [
        'Right Phase Suchey',
        'Right 1-midlamdoid',
        '2-lambda', 
        '3-obelion', 
        '4-anterior sagital',
        '5-bregma', 
        'Right 6-midcoronal', 
        'Right 7-pterion',
        'Right 8-sphenofrontal', 
        'Right 9-inferior sphenotemporal', 
        'Right 10-superior sphenotemporal',
        "Right Phase",
        'Right Transverse organization',
        'Right Surface texture',
        'Right Microposity', 
        'Right Macroporositty', 
        'Right Apical changes'
    ]
} 


In [None]:
# Display the feature-set dictionary defined in the previous cell.
vars

# **Classification (sklearn)** 

In [None]:
# For each feature set: search for a classifier with hyperopt-sklearn, score it on a
# held-out split, refit the winning pipeline on all rows, run leave-one-out CV, and export
# the pipeline (.dat, pickle) together with a plain-text report (.txt).
# Only rows whose "sex" column equals 2 are used; the output names label them "women".
dff = df[df["sex"]==2]
y = dff['Age_groups'].values

for key, value in vars.items():

  X = dff[value].values

  # 75/25 split; stratify=y keeps the age-group proportions in both parts.
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, test_size=0.25, stratify=y)

  filename = 'classification_right_women_'+key.replace(" ","_")+".dat"
  infofilename = 'classification_right_women_'+key.replace(" ","_")+".txt"

  # TPE search over any classifier / any preprocessing: 75 evaluations, 30 s per trial.
  model = HyperoptEstimator(classifier=any_classifier('cla'), preprocessing=any_preprocessing('pre'), \
                          algo=tpe.suggest, max_evals=75, trial_timeout=30, continuous_loss_fn=False, loss_fn=mean_absolute_error)

  model.fit(X_train, y_train)

  # Held-out performance of the selected model.
  acc = model.score(X_test, y_test)
  cnfm = confusion_matrix(y_test, model.predict(X_test))

  # Rebuild the winner as a plain sklearn Pipeline so it can be refit and pickled.
  best = model.best_model()
  preprocs = best['preprocs']
  # Guard in case the search selected no preprocessing step: 'passthrough' keeps the step names.
  scaler_step = preprocs[0] if len(preprocs) > 0 else 'passthrough'
  pipe = Pipeline([('scaler', scaler_step), ('clf', best['learner'])])

  # The exported model is trained on every available row (train and test together).
  pipe.fit(X, y)

  result_loocv = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=LeaveOneOut(), error_score='raise')

  # Context managers close both files even if a write fails.
  with open(filename, "wb") as model_file:
    pickle.dump(pipe, model_file)

  with open(infofilename, "w") as file:
    file.write("---------------------------------\n")
    file.write(key + '\n')

    file.write("Dataset size: "+ str(len(X))+' '+ str(len(y))+'\n')

    file.write("Best classifier: " + str(best) + '\n')

    file.write("\nAccuracy: "+ str(acc) +'\n')

    file.write("\nConfusion matrix: \n" + str(cnfm) + '\n')

    file.write("LOOCV accuracy: " + str(result_loocv.mean()) + '\n')


In [None]:
# List the working directory to check that the .dat and .txt files were written.
!ls

In [None]:
# Print the report written for the "Suchey Brooks 1990" feature set.
report_path = 'classification_right_women_Suchey_Brooks_1990.txt'

with open(report_path, 'r') as report:
    print(report.read())

# **Regression (sklearn)**

In [None]:
# For each feature set: search for a regressor of 'Age' with hyperopt-sklearn, score it on
# a held-out split, refit the winning pipeline on all rows, and export the pipeline
# (.dat, pickle) together with a plain-text report (.txt).
# Only rows whose "sex" column equals 2 are used; the output names label them "women".
dff = df[df["sex"]==2]
y = dff['Age'].values

for key, value in vars.items():

  X = dff[value].values

  # Plain random 75/25 split (no stratification: the target is continuous).
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, test_size=0.25)

  filename = 'regression_right_women_'+key.replace(" ","_")+'.dat'
  infofilename = 'regression_right_women_'+key.replace(" ","_")+".txt"

  # TPE search over any regressor / any preprocessing, minimising mean absolute error.
  model = HyperoptEstimator(regressor=any_regressor('reg'), preprocessing=any_preprocessing('pre'), \
                          algo=tpe.suggest, max_evals=75, loss_fn=mean_absolute_error, trial_timeout=30,continuous_loss_fn=False)

  model.fit(X_train, y_train)

  # Held-out score of the selected model.
  # NOTE(review): for a regressor this is presumably R^2, not an accuracy - confirm in hpsklearn.
  acc = model.score(X_test, y_test)

  # Rebuild the winner as a plain sklearn Pipeline so it can be refit and pickled.
  best = model.best_model()
  preprocs = best['preprocs']
  # Guard in case the search selected no preprocessing step: 'passthrough' keeps the step names.
  scaler_step = preprocs[0] if len(preprocs) > 0 else 'passthrough'
  pipe = Pipeline([('scaler', scaler_step), ('clf', best['learner'])])

  # The exported model is trained on every available row (train and test together).
  pipe.fit(X, y)

  # Context managers close both files even if a write fails.
  with open(filename, "wb") as model_file:
    pickle.dump(pipe, model_file)

  with open(infofilename, "w") as file:
    file.write("---------------------------------\n")
    file.write(key + '\n')

    file.write("Dataset size: "+ str(len(X))+' '+ str(len(y))+'\n')

    file.write("Best regressor: " + str(best) + '\n')

    file.write("\nTest score: "+ str(acc) +'\n')


In [None]:
# List the working directory to check that the regression .dat and .txt files were written.
!ls

In [None]:
# Compare predicted and true ages on the held-out split.
# `model`, `X_test` and `y_test` are whatever the last iteration of the loop above left
# behind, so run this cell directly after the regression cell.
fig, ax = plt.subplots()
ax.plot(model.predict(X_test), 'ro', label='Predicted')
ax.plot(y_test, 'b*', label='Actual')
ax.set_xlabel('Test sample index')
ax.set_ylabel('Age')
ax.set_title('Predicted vs. actual age on the test split')
ax.legend();

# **Neural Network**

In [None]:
import tensorflow as tf
from tensorflow import keras

# **Classification (tensorflow)**

In [None]:
# For each feature set: train a small dense network on the age-group target, evaluate it
# on a held-out split, and export the model (.dat, pickle) plus a plain-text report (.txt).
# Only rows whose "sex" column equals 2 are used; the output names label them "women".
dff = df[df["sex"]==2]
y = dff['Age_groups'].values

for key, value in vars.items():

  X = dff[value].values

  # 75/25 split; stratify=y keeps the age-group proportions in both parts.
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, test_size=0.25, stratify=y)

  modelfilename = 'ann_classification_right_women_'+key.replace(" ","_")+".dat"

  infofilename = 'ann_classification_right_women_'+key.replace(" ","_")+".txt"

  # One hidden layer of 20 ReLU units with 20% dropout; the 3 softmax outputs match the
  # three age groups (labels 0, 1, 2).
  model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation='softmax')
    ])

  # Sparse loss because y holds integer class indices rather than one-hot vectors.
  model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
  )

  # The test split is passed as validation data for monitoring only; nothing is tuned on it.
  model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=500, verbose = 0)

  # evaluate() returns [loss, accuracy] because exactly one metric was compiled in.
  test_loss, acc = model.evaluate(X_test, y_test)

  # argmax over the softmax outputs turns class probabilities into predicted labels.
  cnfm = confusion_matrix(y_test, model.predict(X_test).argmax(axis=-1))

  # Leave-one-out CV is not computed for the Keras models.

  # Write to this loop's own modelfilename. The previous code used `filename`, a variable
  # left over from an earlier cell, and so overwrote one of the sklearn model files.
  # NOTE(review): pickling a Keras model depends on the installed version - confirm it loads back.
  with open(modelfilename, "wb") as model_file:
    pickle.dump(model, model_file)

  with open(infofilename, "w") as file:
    file.write('\n\n')
    file.write("---------------------------------\n")
    file.write(key + '\n')

    file.write("Dataset size: "+ str(len(X))+' '+ str(len(y))+'\n')

    file.write("\nLoss: "+ str(test_loss) +'\n')

    file.write("\nAccuracy: "+ str(acc) +'\n')

    file.write("\nConfusion matrix: \n" + str(cnfm))
  


In [None]:
# List the working directory to check that the ann_classification files were written.
!ls

In [None]:
# Print the neural-network classification report for the "All" feature set.
report_path = 'ann_classification_right_women_All.txt'

with open(report_path, 'r') as report:
    print(report.read())

# **Regression (tensorflow)**

In [None]:
# For each feature set: train a small dense network to predict 'Age', evaluate it on a
# held-out split, and export the model (.dat, pickle) plus a plain-text report (.txt).
# Only rows whose "sex" column equals 2 are used; the output names label them "women".
dff = df[df["sex"]==2]
y = dff['Age'].values

for key, value in vars.items():

  X = dff[value].values

  # Plain random 75/25 split (no stratification: the target is continuous).
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, test_size=0.25)

  modelfilename = 'ann_regression_right_women_'+key.replace(" ","_")+".dat"

  infofilename = 'ann_regression_right_women_'+key.replace(" ","_")+".txt"

  # One hidden layer of 20 ReLU units with 20% dropout and a single linear output.
  model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1)
    ])

  # Track mean absolute error next to the MSE loss; 'accuracy' carries no meaning for a
  # continuous target.
  model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error']
  )

  # The test split is passed as validation data for monitoring only; nothing is tuned on it.
  model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=500)

  # evaluate() returns [loss, metric], i.e. [MSE, MAE] with the compile settings above.
  test_mse, test_mae = model.evaluate(X_test, y_test)

  # Write to this loop's own modelfilename. The previous code used `filename`, a variable
  # left over from an earlier cell, and so overwrote one of the sklearn model files.
  # NOTE(review): pickling a Keras model depends on the installed version - confirm it loads back.
  with open(modelfilename, "wb") as model_file:
    pickle.dump(model, model_file)

  with open(infofilename, "w") as file:
    file.write('\n\n')
    file.write("---------------------------------\n")
    file.write(key + '\n')

    file.write("Dataset size: "+ str(len(X))+' '+ str(len(y))+'\n')

    file.write("\nTest MSE: "+ str(test_mse) +'\n')

    file.write("Test MAE: "+ str(test_mae) +'\n')

In [None]:
# List the working directory to check that the ann_regression files were written.
!ls

In [None]:
# Print the neural-network regression report for the "Lovejoy et al" feature set.
# The loop above writes 'ann_regression_right_women_*' files only; a '..._men_...' name
# does not exist on a fresh run and raises FileNotFoundError.
with open('ann_regression_right_women_Lovejoy_et_al.txt', 'r') as f:
    print(f.read())

In [None]:
# NOTE(review): bare attribute reference - nothing is called here, the cell only displays
# the `files.view` object. Probably a leftover; confirm and remove, or call it with a path.
files.view

In [None]:
# Show the current working directory (the zip step below uses absolute /content paths).
!pwd

In [None]:
# Recursively archive everything under /content into /content/file.zip.
!zip -r /content/file.zip /content/*


In [None]:
# Hand the archive created by the zip cell to google.colab.files for download.
files.download("/content/file.zip")
