In [None]:
from prevelop import preparation, exploration, clustering, evaluation

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

__Data Preparation and Preprocessing__

In [None]:
# file = '/Users/kaspar/Documents/FIR/Prevelop/data/Simus/csvfolding-2025-04-08_12-21-53.csv'

In [None]:
# Path to the Simus CAD export (CSV).
# Defaults to the original workstation path; set the environment variable
# PREVELOP_SIMUS_CSV to point at the file when running on another machine.
file = os.environ.get(
    'PREVELOP_SIMUS_CSV',
    'c:/Users/IM-KI/Documents/PrEvelOp_Docs/Export_classmate/csvfolding-2025-04-08_12-21-53.csv',
)

In [None]:
### load cad-data
# load_simus_data reads the export at `file` and returns three objects, unpacked here as
# the CAD table and two column-name collections (numerical / categorical, judging by the
# variable names — confirm against prevelop.preparation).
cad_data, num_columns_cad, cat_columns_cad = preparation.load_simus_data(file)

In [None]:
# display the loaded CAD data as the cell output
cad_data

In [None]:
# load the data
# process_data = preparation.load_data('/Users/kaspar/Documents/FIR/Prevelop/data/Roemheld/Römheld_prozess_2.xlsx')
# link_data = preparation.load_data('/Users/kaspar/Documents/FIR/Prevelop/data/Roemheld/roemheld_teile_zeichnungen_3.xlsx')

In [None]:
# load the process workbook ('Projektmappe Arbeitspläne.xlsx') and the part/drawing link
# workbook ('Projektmappe Teile-Zeichnungen.xlsx')
# The folder defaults to the original workstation path; set the environment variable
# PREVELOP_PROCESS_DIR to read the same two files from another location.
process_dir = os.environ.get('PREVELOP_PROCESS_DIR', 'c:/Users/IM-KI/Documents/PrEvelOp_Daten_CAPicard_2')
process_data = preparation.load_data(f'{process_dir}/Projektmappe Arbeitspläne.xlsx')
link_data = preparation.load_data(f'{process_dir}/Projektmappe Teile-Zeichnungen.xlsx')

In [None]:
# keep only rows that have a value in column 'Teil' (rows with NaN there are dropped);
# the identical filter is applied to the link table and to the process table
link_data, process_data = (
    table.dropna(subset=['Teil']) for table in (link_data, process_data)
)

In [None]:
# display the process data (after dropping rows without 'Teil') as the cell output
process_data

In [None]:
# aggregate the process data using 'Teil' as key
# `columns` and `methods` are given in matching order:
#   'Aktivität' -> 'encode', 'Basisressource' -> 'encode', 'Produktionsmenge' -> 'mean'
# (presumably paired by position — confirm against preparation.aggregate_data)
# NOTE: `process_data` is replaced by the aggregated result, so this cell must not be re-run
# without reloading the raw process data first.
process_data, num_columns_process, cat_columns_process = preparation.aggregate_data(
    process_data,
    key='Teil',
    columns=['Aktivität', 'Basisressource', 'Produktionsmenge'],
    methods=['encode', 'encode', 'mean'],
)

In [None]:
### prepare the data
# combine the column collections of the CAD export with those of the aggregated process data
num_columns = num_columns_cad + num_columns_process
cat_columns = cat_columns_cad + cat_columns_process
# prepare_data receives the CAD table, the combined column collections and the process/link
# tables, and returns two objects: `data` and `data_preprocessed`.
# `data_preprocessed` is recomputed further below via preparation.preprocessing once the
# nomenclature features have been added to `data`.
data, data_preprocessed = preparation.prepare_data(cad_data, num_columns, cat_columns, process_data=process_data, link_data=link_data)

In [None]:
### extract features from the nomenclature encoded in the index of `data` (the part id)
# Fixed character positions of the part id carry the individual features.
part_ids = data.index.str
data = data.assign(**{
    'Material': part_ids[5:7],
    'Nr. of flights and direction': part_ids[7],
    'Design': part_ids[8],
    'Pitch': part_ids[10:13],
    'Length': part_ids[14:17],
})

# A row is kept only if
#   - 'Pitch' and 'Length' consist solely of digits, and
#   - 'Nr. of flights and direction' is one of '0'..'6', and
#   - 'Design' is one of '0'..'5'.
is_valid = (
    data['Pitch'].str.isnumeric()
    & data['Length'].str.isnumeric()
    & data['Nr. of flights and direction'].isin(['0', '1', '2', '3', '4', '5', '6'])
    & data['Design'].isin(['0', '1', '2', '3', '4', '5'])
)

# Take an explicit copy of the filtered rows before changing column dtypes; assigning to a
# filtered slice (as the previous version did) triggers pandas' SettingWithCopyWarning.
data = data.loc[is_valid].copy()

# make values in columns 'Pitch' and 'Length' numeric
data['Pitch'] = data['Pitch'].astype(float)
data['Length'] = data['Length'].astype(float)

In [None]:
### preprocess the data
# Register the features extracted from the nomenclature as numerical / categorical columns.
# Names are only added when missing, so re-running this cell does not duplicate entries
# (the plain `append` calls used before added the names again on every run).
num_columns.extend(name for name in ('Pitch', 'Length') if name not in num_columns)
cat_columns.extend(
    name
    for name in ('Material', 'Nr. of flights and direction', 'Design')
    if name not in cat_columns
)

# recompute the preprocessed representation from the extended `data`
data_preprocessed = preparation.preprocessing(data, num_columns, cat_columns)

__Exploration__

In [None]:
# show boxplots of the columns listed in num_columns (the numerical columns)
exploration.boxplots(data, num_columns)

In [None]:
# show violin plots of the columns listed in num_columns (the numerical columns)
exploration.violinplots(data, num_columns)

In [None]:
# show distributions of the columns listed in num_columns
# NOTE(review): the previous comment said "categorical columns", but num_columns is what is
# passed here — confirm which column list is intended.
exploration.distributions(data, num_columns)

In [None]:
# show bar plots of the categorical columns (currently disabled)
# exploration.barplots(data, cat_columns)

In [None]:
# show a heatmap for the columns listed in num_columns
# (presumably a correlation heatmap — confirm against exploration.heatmap)
exploration.heatmap(data, num_columns)

In [None]:
# z-score analysis of the columns listed in num_columns
exploration.z_score_analysis(data, num_columns)

In [None]:
# apply isolation forest to detect outliers, using the columns listed in num_columns
# NOTE(review): isolation forests are randomized — confirm that exploration.isolation_forest
# fixes a seed if reproducible output is required.
exploration.isolation_forest(data, num_columns)

In [None]:
# apply tsne_visualization to visualize the data
# Unlike the cells above, this call receives `data_preprocessed` rather than `data`.
# NOTE(review): t-SNE is randomized — confirm that exploration.tsne_visualization fixes a seed.
exploration.tsne_visualization(data_preprocessed, num_columns)

__Adjustments based on EDA__

In [None]:
# drop the columns 'Lrot' and 'Da max.' from both the raw and the preprocessed frame
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already gone
# and a plain drop would raise a KeyError.
# NOTE(review): num_columns / cat_columns are not updated here and may still list these names.
columns_to_drop = ['Lrot', 'Da max.']
data = data.drop(columns=columns_to_drop, errors='ignore')
data_preprocessed = data_preprocessed.drop(columns=columns_to_drop, errors='ignore')

__Clustering__

In [None]:
### calculate distance matrix
# The distance is computed on `data` (not on `data_preprocessed`).
distance_matrix = clustering.gower_distance(data)

In [None]:
### plot the elbow plot for agglomerative clustering
# NOTE(review): the previous comment said "first 20 clusters", but the arguments are 100 and
# 202 — presumably the range of cluster counts to evaluate; confirm against
# clustering.elbow_plot_agglomerative.
clustering.elbow_plot_agglomerative(data_preprocessed, distance_matrix, 100, 202)

In [None]:
### plot the dendrogram
# built from the distance matrix, labelled with the index of `data`, oriented to the left
clustering.plot_dendrogram(distance_matrix, labels=data.index, orientation='left')

In [None]:
### find clusters with k-medoids and apply the elbow and silhouette methods
# elbow plot; the arguments 300 and 500 are presumably the range of cluster counts — TODO confirm
clustering.elbow_plot_kmedoids(data_preprocessed, 300, 500)

In [None]:
# silhouette scores for k-medoids, called with the same arguments (300, 500) as the elbow plot
clustering.silhouette_score_kmedoids(data_preprocessed, 300, 500)

__Evaluation__

In [None]:
# agglomerative clustering on the precomputed distance matrix
# (33 is presumably the number of clusters — TODO confirm)
labels = clustering.agglomerative_clustering(distance_matrix, 33)

In [None]:
# # export the results to an Excel file: a DataFrame with columns 'ID' and 'Cluster'
# df = pd.DataFrame(data={'ID': data.index, 'Cluster': labels})
# df.to_excel('results/results_roemheld_aggl_66.xlsx', index=False)

In [None]:
### Evaluate the clustering results
# evaluates the assignment currently held in `labels` against the preprocessed data
evaluation.evaluate_clustering(data_preprocessed, labels)

In [None]:
### visualize the feature importance
# uses the preprocessed data together with the assignment currently held in `labels`
evaluation.feature_importance(data_preprocessed, labels)

In [None]:
### visualize the results
# the preprocessed data is handed over via `.values`, together with the current `labels`
evaluation.plot_results_2d(data_preprocessed.values, labels)

In [None]:
# k-medoids clustering on the same precomputed distance matrix
# (50 is presumably the number of clusters — TODO confirm)
# NOTE: this overwrites `labels` from the agglomerative run; the evaluation cells below
# therefore refer to the k-medoids result.
labels = clustering.kmedoids_clustering(distance_matrix, 50)

In [None]:
# # export the results to an Excel file: a DataFrame with columns 'ID' and 'Cluster'
# df = pd.DataFrame(data={'ID': data.index, 'Cluster': labels})
# df.to_excel('results/results_roemheld_kmedoids_50.xlsx', index=False)

In [None]:
### Evaluate the clustering results
# evaluates the assignment currently held in `labels` against the preprocessed data
evaluation.evaluate_clustering(data_preprocessed, labels)

In [None]:
### visualize the feature importance
# uses the preprocessed data together with the assignment currently held in `labels`
evaluation.feature_importance(data_preprocessed, labels)

In [None]:
### visualize the results
# the preprocessed data is handed over via `.values`, together with the current `labels`
evaluation.plot_results_2d(data_preprocessed.values, labels)