In [11]:
import os

import os.path as osp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [12]:
# Read both sheets of the workbook into data frames. The context manager
# closes the underlying Excel file handle as soon as the sheets are loaded.
with pd.ExcelFile('metric.xls') as xls:
    df1 = pd.read_excel(xls, sheet_name='DEMOGRAPHICS')
    df2 = pd.read_excel(xls, sheet_name='MEASUREMENTS')

# Summarize the demographics sheet. DataFrame.info() prints the summary itself
# and returns None, so wrapping it in print() would append a stray "None".
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2391 entries, 0 to 2390
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Subject Number        2391 non-null   int64         
 1   Country               2391 non-null   object        
 2   Site                  2391 non-null   object        
 3   Date                  2391 non-null   datetime64[ns]
 4   Time                  2391 non-null   object        
 5   Civilian              2391 non-null   object        
 6   Date of Birth         2388 non-null   datetime64[ns]
 7   Age (years)           2388 non-null   float64       
 8   Birth State           2391 non-null   object        
 9   Occupation            2391 non-null   object        
 10  Education             2391 non-null   object        
 11  Number of Children    2391 non-null   object        
 12  Fitness               2391 non-null   object        
 13  Car Make          

In [13]:
# Summarize the measurements sheet. DataFrame.info() prints the summary itself
# and returns None, so wrapping it in print() would append a stray "None".
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2391 entries, 0 to 2390
Data columns (total 49 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Subject Number                                         2391 non-null   int64  
 1   Gender                                                 2391 non-null   object 
 2   Recorder                                               2391 non-null   object 
 3   Measurer                                               2391 non-null   object 
 4   Acromial Height, Sitting (mm)                          2391 non-null   int64  
 5   Ankle Circumference (mm)                               2391 non-null   int64  
 6   Spine-to-Shoulder (mm)                                 2391 non-null   int64  
 7   Spine-to-Elbow (mm)                                    2391 non-null   float64
 8   Arm Length (Spine to Wrist) (mm)                

In [14]:
# Flatten the self-reported columns of the demographics sheet into 1-D arrays.
subject = df1['Subject Number'].to_numpy().reshape(-1)
heights = df1['Reported Height (mm)'].to_numpy().reshape(-1)
weights = df1['Reported Weight (kg)'].to_numpy().reshape(-1)

# Number of non-missing values plus NaN-aware mean / std / median per column.
n_heights = np.count_nonzero(~np.isnan(heights))
height_stats = (np.nanmean(heights), np.nanstd(heights), np.nanmedian(heights))

n_weights = np.count_nonzero(~np.isnan(weights))
weight_stats = (np.nanmean(weights), np.nanstd(weights), np.nanmedian(weights))

print("The shape of heights is: ", n_heights)
print("Height mean: {:.3f}. Height std: {:.3f}. Height Median: {:.3f}.\n".format(*height_stats))

print("The shape of weights is: ", n_weights)
print("Weight mean: {:.3f}. Weight std: {:.3f}. Weight Median: {:.3f}.".format(*weight_stats))

The shape of heights is:  2387
Height mean: 1716.437. Height std: 107.988. Height Median: 1714.500.

The shape of weights is:  2383
Weight mean: 76.135. Weight std: 19.460. Weight Median: 72.562.


In [15]:
# Only the raw measurements are exported; the derived ratios are not needed.

# The columns below are combined purely by row position, taking some values from
# the DEMOGRAPHICS sheet (df1) and others from the MEASUREMENTS sheet (df2).
# Fail loudly if the two sheets do not list the subjects in the same order,
# otherwise measurements would silently be attached to the wrong subject.
if not np.array_equal(df1['Subject Number'].to_numpy(), df2['Subject Number'].to_numpy()):
    raise ValueError("DEMOGRAPHICS and MEASUREMENTS rows are not in the same 'Subject Number' order")

arm_lengths = df2['Arm Length (Shoulder to Wrist) (mm)'].to_numpy().reshape(-1)
crotch_heights = df2['Crotch Height (mm)'].to_numpy().reshape(-1)
chest_circumferences = df2['Chest Circumference (mm)'].to_numpy().reshape(-1)
hip_circumferences = df2['Hip Circumference, Maximum (mm)'].to_numpy().reshape(-1)
waist_circumferences = df2['Waist Circumference, Pref (mm)'].to_numpy().reshape(-1)

# (plural label, singular label, values) for the per-measurement summary lines.
measurement_summaries = [
    ("arm lengths", "Arm Length", arm_lengths),
    ("crotch heights", "Crotch Height", crotch_heights),
    ("chest circumferences", "Chest Circumference", chest_circumferences),
    ("hip circumferences", "Hip Circumference", hip_circumferences),
    ("waist circumferences", "Waist Circumference", waist_circumferences),
]

# Report the number of non-missing values and the NaN-aware mean / std / median.
for plural_label, singular_label, values in measurement_summaries:
    print("The shape of {} is: ".format(plural_label), np.count_nonzero(~np.isnan(values)))
    print("{} mean: {:.3f}. std: {:.3f}. Median: {:.3f}.\n".format(
        singular_label, np.nanmean(values), np.nanstd(values), np.nanmedian(values)))


d = {'subject': subject, 'gender': df1['Gender'], 'heights': heights, 'arm_lengths': arm_lengths,
     'crotch_heights': crotch_heights, 'chest_circumferences': chest_circumferences,
     'hip_circumferences': hip_circumferences, 'waist_circumferences': waist_circumferences}

df = pd.DataFrame(data=d)
# Encode gender numerically; any value other than "Male"/"Female" becomes NaN.
df["gender"] = df["gender"].map({"Male": 0, "Female": 1})
# Turn the subject number into its mesh file name. "{:04d}" zero-pads numbers
# below 1000 and leaves longer numbers unchanged, so no size check is needed.
df["subject"] = df["subject"].map("csr{:04d}a.ply".format)
df.to_csv('my_metric.csv', index=False)

The shape of arm lengths is:  2391
Arm Length mean: 612.611. std: 45.986. Median: 612.000.

The shape of crotch heights is:  2390
Crotch Height mean: 773.540. std: 55.731. Median: 771.000.

The shape of chest circumferences is:  2391
Chest Circumference mean: 996.745. std: 124.099. Median: 978.000.

The shape of hip circumferences is:  2389
Hip Circumference mean: 1050.170. std: 113.026. Median: 1031.000.

The shape of waist circumferences is:  2388
Waist Circumference mean: 848.005. std: 144.338. Median: 832.000.



In [16]:
# plt.hist(heights, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of Heights')
# plt.savefig('histogram/heights.pdf')
# plt.show()

# plt.hist(weights, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of Weights')
# plt.savefig('histogram/weights.pdf')
# plt.show()

# plt.hist(arm_lengths, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of Arm Length')
# plt.savefig('histogram/arm_lengths.pdf')
# plt.show()

# plt.hist(crotch_heights, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of crotch heights')
# plt.savefig('histogram/crotch_heights.pdf')
# plt.show()

# plt.hist(chest_circumferences, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of Chest Circumferences')
# plt.savefig('histogram/chest_circumferences.pdf')
# plt.show()

# plt.hist(hip_circumferences, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of hip circumferences')
# plt.savefig('histogram/hip_circumferences.pdf')
# plt.show()

# plt.hist(waist_circumferences, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of waist circumferences')
# plt.savefig('histogram/waist_circumferences.pdf')
# plt.show()

# plt.hist(waist_hip_ratios, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of waist hip ratios')
# plt.savefig('histogram/waist_hip_ratios.pdf')
# plt.show()

# plt.hist(waist_chest_ratios, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of waist chest ratios')
# plt.savefig('histogram/waist_chest_ratios.pdf')
# plt.show()

# plt.hist(hip_chest_ratios, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of hip chest ratios')
# plt.savefig('histogram/hip_chest_ratios.pdf')
# plt.show()

# plt.hist(leg_ratios, bins=30, color='blue', edgecolor='black', alpha=0.7)
# plt.title('Histogram of leg ratios')
# plt.savefig('histogram/leg_ratios.pdf')
# plt.show()


# Pick the columns and build a new dataframe
A few things to note:
* The data frame contains 2391 entries in total. However, running `ls | wc -l` shows that `Models20K` contains only 2169 files. For example, the file `csr2897a.ply` does not exist in the dataset (it was missing even before the delete operation).

In [17]:
def select_columns(type='standardization', input_csv='my_metric.csv',
                   mesh_dir='../data/human-body2/raw/Models20K/', delete_meshes=True):
    """
    Drop incomplete rows from the metric table and save it as 'metric_<type>.csv'.

    Rows containing any missing value are printed and removed. For each such
    row, the mesh file named in its 'subject' column is deleted from
    `mesh_dir` (unless `delete_meshes` is False). With
    type='standardization' the six measurement columns are additionally
    rescaled with StandardScaler before saving.

    Parameters
    ----------
    type : str
        'standardization' or 'original'. Also used in the output file name.
        (The name shadows the builtin but is kept so existing keyword calls work.)
    input_csv : str
        Path of the CSV file to read.
    mesh_dir : str
        Directory holding the mesh files named in the 'subject' column.
    delete_meshes : bool
        If False, incomplete rows are still dropped from the table but no
        file on disk is touched.

    Raises
    ------
    ValueError
        If `type` is not one of the two supported values.
    """
    if type not in ['standardization', 'original']:
        raise ValueError('type must be standardization or original')

    # read the csv file into a data frame
    df = pd.read_csv(input_csv)

    # find the rows that have at least one missing value
    null_df = df[df.isna().any(axis=1)]
    if null_df.shape[0] != 0:
        print("The following is the null data frame:")
        print(null_df)
        if delete_meshes:
            # delete the meshes corresponding to the null data; a row whose
            # subject itself is missing has no file name to look up
            for subject_number in null_df['subject'].dropna():
                file_name = os.path.join(mesh_dir, str(subject_number))

                if os.path.exists(file_name):
                    os.remove(file_name)
                    print("The file {} has been removed".format(subject_number))
                else:
                    print("The file {} does not exist".format(subject_number))
        # drop the null rows; the explicit copy makes the column assignments
        # below operate on an independent frame
        df = df.dropna().copy()
    else:
        print("There is no null data frame")

    # do the standardization or keep the original data
    if type == 'standardization':
        feature_columns = ['heights', 'arm_lengths', 'crotch_heights',
                           'chest_circumferences', 'hip_circumferences', 'waist_circumferences']
        # StandardScaler standardizes every column independently, so a single
        # fit over all feature columns replaces one fit per column
        scaled = StandardScaler().fit_transform(df[feature_columns])
        for position, column in enumerate(feature_columns):
            df[column] = scaled[:, position]

    # save the data frame
    df.to_csv('metric_{}.csv'.format(type), index=False)

In [18]:
# Produce both exports: the standardized table first, then the unscaled one.
for mode in ('standardization', 'original'):
    select_columns(type=mode)

The following is the null data frame:
           subject  gender  heights  arm_lengths  crotch_heights  \
315   csr0424a.ply       0   1752.6          624           752.0   
417   csr0535a.ply       0      NaN          670           779.0   
421   csr0539a.ply       0      NaN          608           737.0   
503   csr0632a.ply       0      NaN          646           798.0   
708   csr1212a.ply       1   1727.2          625             NaN   
1294  csr1802a.ply       1   1574.8          538           714.0   
1297  csr1805a.ply       1   1701.8          596           809.0   
1298  csr1806a.ply       1   1562.1          581           714.0   
1527  csr2036a.ply       1   1701.8          579           778.0   
2262  csr2897a.ply       1      NaN          658           829.0   

      chest_circumferences  hip_circumferences  waist_circumferences  
315                   1143              1099.0                   NaN  
417                   1154              1095.0                 963.0  


# Plot the scatter plot of the data

In [19]:
def scatter_plot(x_label, y_label):
    """
    Draw a scatter plot of two columns of 'metric_original.csv', colored by gender.

    Rows with gender 0 are drawn in blue as 'Male' and rows with gender 1 in
    red as 'Female'. The figure is saved as 'scatter/<x_label> vs <y_label>.pdf'
    and then shown.

    The labels must be columns of 'metric_original.csv', for example:
    ['heights', 'arm_lengths', 'crotch_heights',
     'chest_circumferences', 'hip_circumferences', 'waist_circumferences']
    (reported weights are not written to 'my_metric.csv', so 'weights' is
    not available unless that export is extended).

    Raises
    ------
    KeyError
        If either label is not a column of 'metric_original.csv'.
    """

    df = pd.read_csv('metric_original.csv')

    # Fail with a message that lists the valid choices instead of a bare KeyError.
    unknown = [label for label in (x_label, y_label) if label not in df.columns]
    if unknown:
        raise KeyError("Unknown column(s) {}; available columns are {}".format(unknown, list(df.columns)))

    x = df[x_label].to_numpy().reshape(-1)
    y = df[y_label].to_numpy().reshape(-1)
    gender = df['gender'].to_numpy().reshape(-1)

    plt.scatter(x=x[gender==0], y=y[gender==0], s=3, c='b', label='Male')
    plt.scatter(x=x[gender==1], y=y[gender==1], s=3, c='r', label='Female')

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()

    title = "Scatter Plot of {} and {}".format(x_label, y_label)
    plt.title(title)

    # savefig does not create missing directories, so make sure 'scatter' exists.
    os.makedirs('scatter', exist_ok=True)
    file_name = "{} vs {}.pdf".format(x_label, y_label)
    fp = osp.join('scatter', file_name)
    plt.savefig(fp)
    plt.show()

In [20]:
# scatter_plot('heights', 'weights')
# scatter_plot('heights', 'arm_lengths')
# scatter_plot('heights', 'crotch_heights')
# scatter_plot('heights', 'chest_circumferences')
# scatter_plot('heights', 'hip_circumferences')
# scatter_plot('heights', 'waist_circumferences')
# scatter_plot('chest_circumferences', 'hip_circumferences')
# scatter_plot('chest_circumferences', 'waist_circumferences')
# scatter_plot('hip_circumferences', 'waist_circumferences')
# scatter_plot('weights', 'hip_circumferences')
# scatter_plot('arm_lengths', 'waist_circumferences')