In [None]:
import math

import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib.ticker import MaxNLocator
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from scipy import stats
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the 'prep' sheet of the workbook; its 'Year', 'Used Techniques' and
# 'Field of Data' columns are what the following overview charts plot.
df = pd.read_excel('data111.xlsx', sheet_name='prep')

In [None]:
# Figure typography: 12 pt Times New Roman for every text element.
plt.rcParams.update({'font.family': 'Times New Roman', 'font.size': 12})

# Number of papers per publication year (rows without a year are not counted).
year_count = df.groupby('Year')['Year'].count()

# Line chart on a 12x6 inch canvas with one x tick per year present in the data.
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
trend_ax.plot(year_count.index, year_count.values)
trend_ax.set_xticks(year_count.index)
trend_ax.set_title('Number of Worked papers')
trend_ax.set_ylabel('Number of papers')
trend_ax.set_xlabel('Year')
plt.show()

In [None]:
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 12

# Papers per year, re-indexed onto the full 2005-2022 span so that years
# without any paper still show up as zero-height bars. astype(int) keeps the
# counts integral; it does not touch the year index.
year_count = (
    df.groupby('Year')['Year'].count()
    .astype(int)
    .reindex(range(2005, 2023), fill_value=0)
)

fig, ax = plt.subplots(figsize=(12, 6))

# Shade each bar by its height: the counts are normalised to [0, 1] and looked
# up in the 'Blues' colormap. plt.get_cmap is used because
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9.
norm = mcolors.Normalize(vmin=year_count.min(), vmax=year_count.max())
cmap = plt.get_cmap('Blues')
bars = ax.bar(
    year_count.index,
    year_count.values,
    color=cmap(norm(year_count.values)),
    alpha=0.8,  # slight transparency for better visibility
)

ax.set_xticks(year_count.index)
ax.set_xlabel('Year')
ax.set_ylim(bottom=0)

# Counts are whole numbers: request integer tick positions from the locator
# instead of flooring the automatic ticks, which can yield unevenly spaced or
# duplicated ticks.
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_ylabel('Number of papers')
ax.set_title('Number of Worked papers by year')

plt.show()

In [None]:
# Papers per decade: each year is floored to the first year of its decade
# (e.g. 2013 -> 2010) before counting.
decade_count = df.groupby((df['Year'] // 10) * 10)['Year'].count()

# A single axes filling the figure; the earlier plt.subplot(1, 2, 2) call was a
# leftover from a two-panel layout and left the left half of the canvas empty.
fig, ax = plt.subplots(figsize=(8, 6))

# Decades are 10 units apart on the x axis, so the default bar width of 0.8
# would render as hairlines; 8 leaves a small gap between neighbouring bars.
ax.bar(decade_count.index, decade_count.values, width=8)
ax.set_xlabel('Decade')
ax.set_ylabel('Number of papers')
ax.set_title('Number of Worked papers by decade')

# One tick per decade that actually occurs, labelled "2000s", "2010s", ...
ax.set_xticks(decade_count.index)
ax.set_xticklabels([f'{int(decade)}s' for decade in decade_count.index])

fig.tight_layout()
plt.show()

In [None]:
# Set the font before any text artist is created so it applies to this figure.
plt.rcParams['font.family'] = 'Times New Roman'

# 'Used Techniques' holds '-'-separated names; flatten them into one Series
# with one entry per mention and count the mentions.
technique_df = df['Used Techniques'].str.split('-', expand=True)
technique_series = technique_df.stack().reset_index(drop=True)
technique_counts = technique_series.value_counts()

# Techniques mentioned exactly once are collapsed into one "Other Techniques"
# wedge and listed by name in the legend. A boolean mask is used instead of
# technique_counts[i], because integer keys on a label-indexed Series are a
# deprecated positional fallback in pandas.
mentioned_once = technique_counts == 1
other_techniques = technique_counts.index[mentioned_once].tolist()
other_techniques_count = int(mentioned_once.sum())
technique_counts = technique_counts[~mentioned_once]
if other_techniques:
    technique_counts['Other Techniques'] = other_techniques_count

# Create the figure and axes
pie, ax = plt.subplots(figsize=[8, 8])

# Create the pie chart; labels are pulled inside the wedges (labeldistance < 1)
wedges, texts, autotexts = ax.pie(
    technique_counts.values,
    labels=technique_counts.index,
    autopct='%1.1f%%', labeldistance=0.80
)

# Customize the text labels inside the pie chart
for text in texts:
    text.set_horizontalalignment('center')
    text.set_fontname('Times New Roman')

# Customize the percent labels
for autotext in autotexts:
    autotext.set_fontstyle('italic')
    autotext.set_fontname('Times New Roman')
    autotext.set_fontsize(14)

# The legend is only a text list of the collapsed names. Each name gets its own
# invisible proxy handle; borrowing the pie wedges as handles would silently
# drop names whenever there are more singleton techniques than wedges.
if other_techniques:
    name_handles = [plt.Line2D([], [], linestyle='none') for _ in other_techniques]
    ax.legend(name_handles, other_techniques, title="Other Techniques:", loc="best",
              bbox_to_anchor=(1, 0, 0.5, 1), handlelength=0)

ax.set_title('Used Techniques', fontsize=20)
plt.show()

In [None]:
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 12

# 'Field of Data' holds '-'-separated field names; flatten them into one Series
# with one entry per mention and count the mentions.
fields_df = df['Field of Data'].str.split('-', expand=True)
fields_series = fields_df.stack().reset_index(drop=True)
fields_counts = fields_series.value_counts()

# Fields mentioned exactly once are collapsed into one "Other Fields" wedge and
# listed by name in the legend. The boolean mask avoids fields_counts[i], a
# deprecated positional lookup on a label-indexed Series.
mentioned_once = fields_counts == 1
other_fields = fields_counts.index[mentioned_once].tolist()
other_fields_count = int(mentioned_once.sum())
fields_counts = fields_counts[~mentioned_once]
if other_fields:
    fields_counts['Other Fields'] = other_fields_count

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(fields_counts.values, labels=fields_counts.index, autopct='%1.1f%%')

# Text-only legend of the collapsed names, built from invisible proxy handles.
# The previous version passed `wedges`, which belonged to the pie drawn in the
# preceding cell (this cell never captured its own), so it failed on a fresh
# kernel and could truncate the list.
if other_fields:
    name_handles = [plt.Line2D([], [], linestyle='none') for _ in other_fields]
    ax.legend(name_handles, other_fields, title="Other Fields:", loc="best",
              bbox_to_anchor=(1, 0, 0.5, 1), handlelength=0)

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

ax.set_title('Field of Data', fontsize=20)
plt.show()

In [None]:
#Regression analysis

# Load the 'data' sheet of the workbook. `data` is bound to the same DataFrame
# object as `df2` (no copy is made), so a change made through one name is
# visible through the other.
df2 = pd.read_excel('data111.xlsx', sheet_name='data')
data = df2

In [None]:
# Columns that are each regressed on stature (these are the responses).
variables = ['SH', 'SUH', 'EH', 'AD', 'BPL', 'PH', 'KH', 'TH', 'HW']

# Single predictor shared by all nine models: the 'S' column.
X = data['S']

# Responses y1..y9, each kept as a one-column DataFrame, in `variables` order.
y1, y2, y3, y4, y5, y6, y7, y8, y9 = (data[[column]] for column in variables)

# One OLS fit per response: response = const + slope * S.
# sm.add_constant prepends the intercept column named 'const'.
model1, model2, model3, model4, model5, model6, model7, model8, model9 = (
    sm.OLS(response, sm.add_constant(X)).fit()
    for response in (y1, y2, y3, y4, y5, y6, y7, y8, y9)
)

In [None]:
def report_stature_regression(label, model, y, X, test_size=0.2, random_state=42):
    """Print the full-sample OLS summary and a hold-out check for one response.

    Parameters
    ----------
    label : str
        Name used on the left-hand side of the printed equation.
    model : fitted statsmodels OLS results
        Fit of ``y`` on ``sm.add_constant(X)``; its parameters are read by the
        labels 'const' and 'S'.
    y : one-column pandas.DataFrame
        Response values.
    X : pandas.Series
        Predictor values (the 'S' column).
    test_size, random_state
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(reg, y_pred)``: the scikit-learn model fitted on the training split
        and its predictions for the test split.
    """
    # --- full-sample OLS summary ---
    print('{} = {:.3f} + {:.3f} * Stature'.format(label, model.params['const'], model.params['S']))
    print('R-squared: {:.3f}'.format(model.rsquared))
    print('Adj. R-squared: {:.3f}'.format(model.rsquared_adj))
    # Standard error of the regression is the square root of the residual mean
    # square. The earlier code printed model.bse['const'], which is the
    # standard error of the intercept estimate.
    print('Standard error of the regression: {:.3f}'.format(math.sqrt(model.mse_resid)))

    # --- hold-out evaluation with scikit-learn ---
    print('Linear regression:')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # scikit-learn expects 2-D arrays of shape (n_samples, n_columns)
    X_train = X_train.values.reshape(-1, 1)
    X_test = X_test.values.reshape(-1, 1)
    y_train = y_train.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)

    reg = LinearRegression().fit(X_train, y_train)
    print('Coefficients:', reg.coef_)
    print('Intercept:', reg.intercept_)

    y_pred = reg.predict(X_test)

    # R-squared on the test split, then adjusted for the number of predictors
    r_squared = reg.score(X_test, y_test)
    print('R-squared:', r_squared)
    n_test = len(X_test)
    n_predictors = X_train.shape[1]
    adj_r_squared = 1 - (1 - r_squared) * (n_test - 1) / (n_test - n_predictors - 1)
    print('Adjusted R-squared:', adj_r_squared)

    return reg, y_pred


# One report per response; this replaces nine copy-pasted cells.
stature_fits = [
    ('Shoulder Height', model1, y1),
    ('Subscapular Height', model2, y2),
    ('Elbow Height', model3, y3),
    ('Abdominal Depth', model4, y4),
    ('Buttock Popliteal Length', model5, y5),
    ('Popliteal Height', model6, y6),
    ('knee Height', model7, y7),
    ('Thigh Thickness', model8, y8),
    ('Hip Width', model9, y9),
]

# label -> (fitted LinearRegression, test-split predictions)
holdout_results = {}
for label, ols_model, response in stature_fits:
    holdout_results[label] = report_stature_regression(label, ols_model, response, X)
    print()

In [None]:
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 12

# Responses (one panel each) and the shared predictor
y = data[['SH', 'SUH', 'EH', 'AD', 'BPL', 'PH', 'KH', 'TH', 'HW']]
X = data['S']
design = sm.add_constant(X)  # same design matrix for every panel

fig, axs = plt.subplots(3, 3, figsize=(16, 16))
axs = axs.flatten()

# Scatter plot with fitted line, equation and adjusted R2 for each response
for ax, feature in zip(axs, y.columns):
    model = sm.OLS(y[feature], design).fit()
    # Parameters are read by label; params[0] / params[1] relied on a
    # positional fallback that pandas has deprecated for labelled Series.
    intercept = round(float(model.params['const']), 3)
    slope = round(float(model.params['S']), 3)
    r2_adj = round(float(model.rsquared_adj), 3)
    eq = f'y = {intercept} + {slope}x'

    sns.regplot(x='S', y=feature, data=data, ax=ax, scatter_kws={'alpha': 0.2})
    ax.set_xlabel('Stature')
    ax.set_ylabel(feature)
    # Annotations are placed in axes coordinates (top-left corner of the panel)
    ax.text(0.05, 0.9, eq, transform=ax.transAxes)
    ax.text(0.05, 0.8, f'Adj R2: {r2_adj}', transform=ax.transAxes)

fig.suptitle('Scatter plot of Stature and Anthropometry metrics', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
#Clustering

# Read the 'data' sheet and discard every row that has a missing value.
df4 = pd.read_excel('data111.xlsx', sheet_name='data').dropna()

# 'Sex' becomes integer codes via LabelEncoder; 'Degree' is mapped to an
# ordinal 1-3 scale (labels missing from the mapping become NaN).
le = LabelEncoder()
degree_map = {'Bachelor': 1, 'Master': 2, 'Phd': 3}
df4 = df4.assign(
    Sex=le.fit_transform(df4['Sex']),
    Degree=df4['Degree'].map(degree_map),
)

In [None]:
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 12

# select the features to cluster on
features = ['S', 'SH', 'SUH', 'EH', 'AD', 'BPL', 'PH', 'KH', 'TH', 'HW', 'Age', 'Weight']

# Standardise each feature (zero mean, unit variance) so none dominates the
# Euclidean distances used by k-means.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df4[features])

# Within-cluster sum of squares (WCSS, KMeans.inertia_) for k = 1..9.
# n_init is set explicitly because its default changed between scikit-learn
# releases (10 restarts -> 'auto'), which would otherwise change the curve.
cluster_range = range(1, 10)
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k; the bend ("elbow") suggests a suitable cluster count
plt.plot(cluster_range, wcss, marker='o', linestyle='-', color='b')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 12

# Candidate cluster counts to compare side by side
k_values = [2, 3, 4]

# One panel per k
fig, axes = plt.subplots(1, len(k_values), figsize=(12, 4))

for ax, k in zip(axes, k_values):
    # n_init is pinned so the result does not depend on the scikit-learn
    # version's default number of restarts.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_data)

    # Mean distance from each sample to its nearest cluster centre
    labels = kmeans.labels_
    distances = pairwise_distances(scaled_data, kmeans.cluster_centers_)
    avg_distances = np.mean(np.min(distances, axis=1))

    # Only the first two standardised features are drawn; the clustering itself
    # uses every column of scaled_data.
    ax.scatter(scaled_data[:, 0], scaled_data[:, 1], c=labels, cmap='viridis')
    ax.set_xlabel(f'{features[0]} (standardised)')
    ax.set_ylabel(f'{features[1]} (standardised)')
    ax.set_title(f'k = {k}, Avg Distance = {avg_distances:.2f}')

# Adjust spacing between subplots
plt.tight_layout()

# Save the combined plot as a single image (before show(), which may clear it)
plt.savefig('combined_scatter_plots.png')

plt.show()

In [None]:
# apply k-means clustering (n_init pinned so results do not depend on the
# scikit-learn version's default number of restarts)
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
kmeans.fit(scaled_data)

# apply hierarchical clustering. Ward linkage only supports Euclidean
# distance, which is also the default, so the distance argument is omitted:
# `affinity=` was deprecated in scikit-learn 1.2 and removed in 1.4.
hierarchical = AgglomerativeClustering(n_clusters=3, linkage='ward')
hierarchical.fit(scaled_data)

# add the cluster labels to the original dataframe and save it to a new file
df4['kmeans_cluster'] = kmeans.labels_
df4['hierarchical_cluster'] = hierarchical.labels_
# to_csv returns None when given a path, so its result is not assigned
# (the earlier `data = df4.to_csv(...)` set `data` to None).
df4.to_csv('clustered_data.csv', index=False)

In [None]:
# Rebind `data` to the table stored in 'clustered_data.csv'; the cells below
# read its 'kmeans_cluster' column.
data = pd.read_csv('clustered_data.csv')

In [None]:
# One panel per clustered feature; inside a panel, one box per k-means cluster,
# so the distribution of that feature can be compared across clusters.

plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 12

fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(16, 18))

cluster_ids = (0, 1, 2)
cluster_of_row = data['kmeans_cluster']

for idx, ax in enumerate(axs.flatten()):
    feature = features[idx]
    # Values of this feature, split by cluster id
    per_cluster = [data[cluster_of_row == cid][feature] for cid in cluster_ids]
    ax.boxplot(per_cluster)
    ax.set_xticklabels(['Cluster 0', 'Cluster 1', 'Cluster 2'])
    ax.set_ylabel(feature)

fig.suptitle('K-Means Clustering Results', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# plot the Correlation heatmap

# sns.set() resets Matplotlib's rcParams, including font.family, so the font is
# passed to seaborn directly; setting plt.rcParams beforehand had no effect.
sns.set(font='Times New Roman', font_scale=1.2)

# `corr` was never defined anywhere in the notebook.
# NOTE(review): assumed to be the pairwise correlation of the clustered
# features — confirm that this is the intended set of columns.
corr = data[features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f', annot_kws={"size": 10},
            xticklabels=corr.columns, yticklabels=corr.columns, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
#Classification

# Load the 'data' sheet again for the classification section.
# NOTE: this rebinds `df`, and `df` and `df5` are the same DataFrame object,
# so columns added to `df` in the cells below also appear on `df5`.
df5 = pd.read_excel('data111.xlsx', sheet_name='data')
df = df5

In [None]:
# encode 'Sex'
# LabelEncoder replaces every distinct value of the column with an integer
# code; the column is overwritten, so the original labels are only recoverable
# through `le` (e.g. le.inverse_transform).
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

# create a function to categorize BMI into 4 categories
def categorize_bmi(bmi):
    """Return the weight-status category for a body-mass-index value.

    Categories use half-open intervals so that every value is covered:
    below 18.5 'Underweight', [18.5, 25) 'Healthy Weight',
    [25, 30) 'Overweight', 30 and above 'Obesity'.

    The earlier bounds (<= 24.9, >= 25.0, <= 29.9) left gaps, so an unrounded
    value such as 24.95 or 29.95 fell through to 'Obesity'.

    Note: a NaN input fails every comparison and is therefore still reported
    as 'Obesity'; filter missing BMI values before calling if that matters.
    """
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Healthy Weight'
    if bmi < 30:
        return 'Overweight'
    return 'Obesity'

# apply the function to create a new column called BMI Category
# (categorize_bmi is called once per row on the 'BMI' column; the resulting
# string labels are the classification target used below)
df['BMI Category'] = df['BMI'].apply(categorize_bmi)

In [None]:
# Target label, candidate predictors, and the seed reused by later cells
target = 'BMI Category'
features = ['SH', 'SUH', 'EH', 'AD', 'BPL', 'PH', 'KH', 'TH', 'HW', 'Age', 'Sex']
n = 42

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.2, random_state=n
)

# Standardise the predictors: scaling statistics are learned from the training
# rows only and then applied to both splits, so nothing leaks from the test set.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Rank predictors by the impurity-based importances of a decision tree fitted
# on the training split, and keep the five highest-ranked names.
dt = DecisionTreeClassifier(random_state=n)
dt.fit(X_train, y_train)
importance = dt.feature_importances_

# feature name -> importance score, in the order of `features`
feature_importance = {name: score for name, score in zip(features, importance)}

ranked_names = sorted(feature_importance, key=lambda name: feature_importance[name], reverse=True)
important_features = ranked_names[:5]


In [None]:
# get the indices of important features in the original features list
important_feature_indices = [features.index(f) for f in important_features]

# select the important features for X_train and X_test
# NOTE: this overwrites X_train / X_test with the reduced arrays, so the cell is
# not idempotent — the indices refer to the columns of the full `features`
# list, and running the cell a second time would index into the already-reduced
# arrays. Re-run the split/scale cell first if this cell must be repeated.
X_train = X_train[:, important_feature_indices]
X_test = X_test[:, important_feature_indices]

In [None]:
# Horizontal bar chart of the importance score of every candidate feature,
# followed by the raw scores (same order as the bars).
importance_names = list(feature_importance.keys())
importance_scores = list(feature_importance.values())

sns.barplot(x=importance_scores, y=importance_names, orient='h')
plt.title('Feature Importance Plot')
plt.xlabel('Relative Importance')
plt.ylabel('Features')
print (importance_scores)

In [None]:
# Metrics collected during cross-validation. Precision, recall and F1 are
# support-weighted averages over the classes; zero_division=1 makes a metric
# that is undefined for a class count as 1.
weighted_kwargs = {'average': 'weighted', 'zero_division': 1}
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, **weighted_kwargs),
    'recall': make_scorer(recall_score, **weighted_kwargs),
    'f1_score': make_scorer(f1_score, **weighted_kwargs),
}

In [None]:
# Refit the decision tree on the selected features, predict the hold-out set
# (used by the confusion-matrix cell), and run 5-fold CV on the training split.
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
dt_cv_results = cross_validate(dt, X_train, y_train, cv=5, scoring=scoring)

# Each line shows the mean over the folds followed by the per-fold scores.
print("Decision Tree Classifier Cross Validation Evaluation Scores:")
for title, key in (("Accuracy", 'test_accuracy'),
                   ("Precision", 'test_precision'),
                   ("Recall", 'test_recall'),
                   ("F1-Score", 'test_f1_score')):
    fold_scores = dt_cv_results[key]
    print(title + ":", np.mean(fold_scores), ':', fold_scores)

In [None]:
# plot the confusion matrix for decision tree classifier
# Predictions are taken from `dt` directly rather than from the shared `y_pred`
# variable, which every classifier cell overwrites. Rows and columns are pinned
# to dt.classes_ and labelled with the class names so the matrix is readable.
class_names = dt.classes_
conf_mat = confusion_matrix(y_test, dt.predict(X_test), labels=class_names)

plt.figure(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='g',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Decision Tree Classifier Confusion Matrix")
plt.show()

In [None]:
# Support vector classifier with default settings: fit on the training split,
# predict the hold-out set (used by the confusion-matrix cell), and run
# 5-fold CV on the training split.
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
svm_cv_results = cross_validate(svm, X_train, y_train, cv=5, scoring=scoring)

# Each line shows the mean over the folds followed by the per-fold scores.
print("Support Vector Machine Cross Validation Evaluation Scores:")
for title, key in (("Accuracy", 'test_accuracy'),
                   ("Precision", 'test_precision'),
                   ("Recall", 'test_recall'),
                   ("F1-Score", 'test_f1_score')):
    fold_scores = svm_cv_results[key]
    print(title + ":", np.mean(fold_scores), ':', fold_scores)

In [None]:
# plot the confusion matrix for support vector machine classifier
# Predictions are taken from `svm` directly rather than from the shared
# `y_pred` variable, which every classifier cell overwrites. Rows and columns
# are pinned to svm.classes_ and labelled with the class names.
class_names = svm.classes_
conf_mat = confusion_matrix(y_test, svm.predict(X_test), labels=class_names)

plt.figure(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='g',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Support Vector Machine Classifier Confusion Matrix")
plt.show()

In [None]:
# Logistic regression seeded with `n`: fit on the training split, predict the
# hold-out set (used by the confusion-matrix cell), and run 5-fold CV on the
# training split.
lr = LogisticRegression(random_state=n)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr_cv_results = cross_validate(lr, X_train, y_train, cv=5, scoring=scoring)

# Each line shows the mean over the folds followed by the per-fold scores.
print("Logistic Regression Cross Validation Evaluation Scores:")
for title, key in (("Accuracy", 'test_accuracy'),
                   ("Precision", 'test_precision'),
                   ("Recall", 'test_recall'),
                   ("F1-Score", 'test_f1_score')):
    fold_scores = lr_cv_results[key]
    print(title + ":", np.mean(fold_scores), ':', fold_scores)

In [None]:
# plot the confusion matrix for logistic regression model
# Predictions are taken from `lr` directly rather than from the shared `y_pred`
# variable, which every classifier cell overwrites. Rows and columns are pinned
# to lr.classes_ and labelled with the class names.
class_names = lr.classes_
conf_mat = confusion_matrix(y_test, lr.predict(X_test), labels=class_names)

plt.figure(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='g',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Logistic Regression Classifier Confusion Matrix")
plt.show()

In [None]:
#Association Rule Learning/Mining

# NOTE(review): this cell reads 'dataset11.xlsx', whereas every other cell
# reads 'data111.xlsx' — confirm the file name is intentional.
# Rebinds `df` for the association-rule section.
df = pd.read_excel('dataset11.xlsx', sheet_name='data')

# create a function to categorize BMI into 4 categories
def categorize_bmi(bmi):
    """Return the weight-status category for a body-mass-index value.

    Categories use half-open intervals so that every value is covered:
    below 18.5 'Underweight', [18.5, 25) 'Healthy Weight',
    [25, 30) 'Overweight', 30 and above 'Obesity'.

    The earlier bounds (<= 24.9, >= 25.0, <= 29.9) left gaps, so an unrounded
    value such as 24.95 or 29.95 fell through to 'Obesity'.

    Note: a NaN input fails every comparison and is therefore still reported
    as 'Obesity'; filter missing BMI values before calling if that matters.
    """
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Healthy Weight'
    if bmi < 30:
        return 'Overweight'
    return 'Obesity'

# Label every row with its BMI category ...
df['BMI Category'] = df['BMI'].map(categorize_bmi)

# ... and add one 0/1 indicator column per category, named after the category.
for category in ('Underweight', 'Healthy Weight', 'Overweight', 'Obesity'):
    df[category] = (df['BMI Category'] == category).astype('int64')

In [None]:
# Measurements that are binned into quartile indicator columns
columns_to_calculate_quartiles = ['SH', 'SUH', 'EH', 'AD', 'BPL', 'PH', 'KH', 'TH', 'HW']

# For every measurement add four 0/1 columns 'Q1 <col>' .. 'Q4 <col>' marking
# the quartile a row falls into (upper bound inclusive, lower bound exclusive).
# The three cut points are computed once per column; the earlier version
# re-evaluated df[col].quantile(...) inside the lambda for every row.
for col in columns_to_calculate_quartiles:
    values = df[col]
    q25, q50, q75 = values.quantile([0.25, 0.5, 0.75])

    df[f'Q1 {col}'] = (values <= q25).astype(int)
    df[f'Q2 {col}'] = ((values > q25) & (values <= q50)).astype(int)
    df[f'Q3 {col}'] = ((values > q50) & (values <= q75)).astype(int)
    df[f'Q4 {col}'] = (values > q75).astype(int)

In [None]:
# Selecting the variables of interest: BMI-category indicators, quartile
# indicators, and three further columns of the sheet.
# NOTE(review): 'Neck', 'BSH' and 'Back' are presumably 0/1 columns already
# present in the workbook — the check below fails loudly if they are not.
vars_to_use = ['Underweight', 'Healthy Weight', 'Overweight', 'Obesity',
                    'Q1 SH', 'Q1 SUH', 'Q1 EH', 'Q1 TH', 'Q1 AD', 'Q1 BPL', 'Q1 PH', 'Q1 KH', 'Q1 HW',
                    'Q2 SH', 'Q2 SUH', 'Q2 EH', 'Q2 TH', 'Q2 AD', 'Q2 BPL', 'Q2 PH', 'Q2 KH', 'Q2 HW',
                    'Q3 SH', 'Q3 SUH', 'Q3 EH', 'Q3 TH', 'Q3 AD', 'Q3 BPL', 'Q3 PH', 'Q3 KH', 'Q3 HW',
                    'Q4 SH', 'Q4 SUH', 'Q4 EH', 'Q4 TH', 'Q4 AD', 'Q4 BPL', 'Q4 PH', 'Q4 KH', 'Q4 HW',
                    'Neck', 'BSH', 'Back']

# The indicators live on `df`; the earlier code read from `df1`, a name that is
# never defined in this notebook.
indicator_table = df[vars_to_use]
assert indicator_table.isin([0, 1]).all().all(), "apriori needs a 0/1 (one-hot) table"

# mlxtend works on boolean one-hot tables and warns about other dtypes.
df2 = indicator_table.astype(bool)

# Generate frequent itemsets (present in at least 15% of the rows)
frequent_itemsets = apriori(df2, min_support=0.15, use_colnames=True)

# Generate association rules with confidence of at least 0.6
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)

In [None]:
# Display the table of mined association rules.
rules

In [None]:
# Flatten every rule into one tuple: all antecedent items followed by all
# consequent items. NOTE: the boundary between the two groups is not kept, so
# for a rule with several antecedents, element [1] is another antecedent rather
# than a consequent.
rule_list = [tuple(rule['antecedents']) + tuple(rule['consequents']) for _, rule in rules.iterrows()]

In [None]:
r = 13  # seed for the graph layout
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 12

# Build the directed graph: one edge from every antecedent item to every
# consequent item of each rule. (Linking the first two entries of the flattened
# rule tuple would connect two antecedents whenever a rule has more than one.)
# Items are sorted so nodes are inserted in a reproducible order.
G = nx.DiGraph()
for antecedents, consequents in zip(rules['antecedents'], rules['consequents']):
    for source in sorted(antecedents):
        for target in sorted(consequents):
            G.add_edge(source, target)

# Fixed colour per item; items missing from this table are drawn in the
# fallback colour instead of raising a KeyError.
color_map = {'Q1 SH': 'red', 'Q1 SUH': 'green', 'Q1 EH': 'blue', 'Q1 TH': 'purple', 'Q1 AD': 'orange',
             'Q1 BPL': 'cyan', 'Q1 PH': 'magenta', 'Q1 KH': 'yellow', 'Q1 HW': 'brown',
             'Q2 TH': 'gray', 'Q2 AD': 'indigo',
             'Q2 BPL': 'olive', 'Q2 HW': 'darkgray',
             'Q3 EH': 'peru', 'Q3 AD': 'darkorange', 'Q3 HW': 'saddlebrown',
             'Underweight': 'darkslateblue', 'Overweight': 'limegreen',
             'Healthy Weight': 'blueviolet'}
fallback_color = 'lightgray'
node_colors = {node: color_map.get(node, fallback_color) for node in G.nodes()}

# Create the plot. The layout is the only random step, so the seed is passed
# to spring_layout directly instead of reseeding NumPy's global generator.
plt.figure(figsize=(12, 8))  # larger canvas for better visibility
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=r)
nx.draw_networkx_nodes(G, pos, node_size=2700, node_color=[node_colors[n] for n in G.nodes()])
nx.draw_networkx_edges(G, pos, width=1.5, arrowstyle='->', arrowsize=45)
nx.draw_networkx_labels(G, pos, font_size=10, font_family="Times New Roman")
plt.axis("off")
plt.title('Association Rules Network Graph', fontsize=20)

# Legend: one marker per node that is actually drawn, built from proxy artists
# so no extra data is added to the axes.
patches = [plt.Line2D([], [], marker="o", ms=10, ls="", color=color, label=node)
           for node, color in node_colors.items()]
plt.legend(handles=patches, fontsize=10, loc='best')

# Show the plot
plt.show()