In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
import nltk

In [5]:
# Load the input CSV into `df`.
# NOTE(review): the following cells operate on `df_agg`, which is not assigned
# in any visible cell -- confirm how it is derived from `df`.
df = pd.read_csv(filepath_or_buffer='with_tfidf.csv')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that get an integer-coded '<name>_encoded' companion column.
# NOTE(review): `df_agg` is not assigned in any visible cell -- confirm it
# exists on a fresh kernel before this cell runs.
categorical_columns = [
    'BRAND',
    'ID',
    'SECTOR',
    'PRODUCT CATEGORY',
    'COUNTRY',
    'BUSINESS MODEL',
    'CONGLOMERATE/Gruppo di riferimento',
    'Market segment',
    'Core business',
    'Business segment',
    'Targets',
    'Listing',
    'Country',
    'Business model',
    'Conglomerate/ Company/ Gruppo di riferimento',
    'Non-financial disclosure',
    'Environmental Goal Set',
]

# One record per (column, integer code, original value) triple.
label_value_mapping = []

for col in categorical_columns:
    # Missing entries become the literal string 'NaN' so they receive a code too.
    df_agg[col] = df_agg[col].fillna('NaN')

    # A fresh encoder per column: codes are only meaningful within that column.
    le = LabelEncoder()
    df_agg[col+'_encoded'] = le.fit_transform(df_agg[col])

    # `classes_` is ordered by code, so its position is the encoded label.
    for encoded_label, original_value in enumerate(le.classes_):
        label_value_mapping.append({
            'Column': col,
            'Encoded_Label': encoded_label,
            'Original_Value': original_value,
        })

label_value_mapping_df = pd.DataFrame(label_value_mapping)

print(label_value_mapping_df)


In [None]:
# Show every column name of df_agg as a plain Python list.
df_agg.columns.tolist()

In [None]:
# Working frame for modelling: the encoded categorical columns plus
# 'Employees', 'Total Goals' and the raw 'Aggregated_Goals' text.
#
# An explicit .copy() is taken because later cells assign new/updated columns
# on df_encoded; without it those writes target a selection of df_agg and
# pandas emits SettingWithCopyWarning (and may or may not write through).
df_encoded = df_agg[[
    'BRAND_encoded',
    'ID_encoded',
    'SECTOR_encoded',
    'PRODUCT CATEGORY_encoded',
    'COUNTRY_encoded',
    'BUSINESS MODEL_encoded',
    'CONGLOMERATE/Gruppo di riferimento_encoded',
    'Market segment_encoded',
    'Employees',
    'Core business_encoded',
    'Business segment_encoded',
    'Targets_encoded',
    'Listing_encoded',
    'Country_encoded',
    'Business model_encoded',
    'Conglomerate/ Company/ Gruppo di riferimento_encoded',
    'Non-financial disclosure_encoded',
    'Environmental Goal Set_encoded',
    'Total Goals',
    'Aggregated_Goals',
]].copy()

In [None]:
# Quick look at the first two rows of the selected columns.
df_encoded.head(n=2)

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Replace missing 'Employees' values with the column median.
imputer = SimpleImputer(strategy='median')

# The imputer works on 2-D input, so present the column as shape (n_rows, 1).
column_values = np.reshape(df_encoded['Employees'].values, (-1, 1))
df_encoded['Employees'] = imputer.fit_transform(X=column_values)

# Preview the result.
df_encoded.head(n=2)


In [None]:
# Cast 'Employees' to an integer dtype (the cast drops any fractional part).
employees_int = df_encoded['Employees'].astype(int)
df_encoded['Employees'] = employees_int


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used below (stopword lists and the tokenizer model).
nltk.download('stopwords')
nltk.download('punkt')

english_stopwords = set(stopwords.words('english'))
italian_stopwords = set(stopwords.words('italian'))

# Tokens found in either language's stopword list are discarded.
all_stopwords = english_stopwords.union(italian_stopwords)

def preprocess_text(text):
    """Tokenize `text`, drop stopwords and non-alphanumeric tokens, lowercase.

    Parameters
    ----------
    text : str or any
        Raw text. Anything that is not a `str` (for example the float NaN
        pandas uses for missing cells) is treated as empty text.

    Returns
    -------
    str
        The kept tokens, lowercased and joined by single spaces; '' when the
        input is not a string or no token survives the filters.
    """
    # word_tokenize raises on non-string input, and 'Aggregated_Goals' is not
    # null-filled anywhere in the visible cells, so guard here.
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text)
    filtered_tokens = [
        word.lower()
        for word in tokens
        # isalnum() removes punctuation-only tokens; the stopword test is
        # case-insensitive because the stopword lists are compared lowercased.
        if word.isalnum() and word.lower() not in all_stopwords
    ]
    return ' '.join(filtered_tokens)

df_encoded['Aggregated_Goals_processed'] = df_encoded['Aggregated_Goals'].apply(preprocess_text)

In [None]:
# First two rows, now including 'Aggregated_Goals_processed'.
df_encoded.head(n=2)

In [None]:
# List the column names of the combined encoded + TF-IDF frame.
# NOTE(review): in the visible cells `df_encoded_with_tfidf_200` is first
# assigned in the TF-IDF cell further down, so on a fresh kernel with
# Run All this cell would raise NameError -- confirm the intended cell order.
list(df_encoded_with_tfidf_200.columns)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned goal text, with the vocabulary capped at 200 terms.
tfidf_vect = TfidfVectorizer(max_features=200)
X_tfidf_200 = tfidf_vect.fit_transform(df_encoded['Aggregated_Goals_processed'])

# Dense frame with one column per vocabulary term.
tfidf_df_200 = pd.DataFrame(
    data=X_tfidf_200.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)

# tfidf_df_200 carries a default 0..n-1 index; reset df_encoded to the same
# so the column-wise concat pairs rows by position.
df_encoded.reset_index(drop=True, inplace=True)

df_encoded_with_tfidf_200 = pd.concat(
    objs=[df_encoded, tfidf_df_200],
    axis='columns',
)


In [None]:
# Peek at the first row of the combined frame.
df_encoded_with_tfidf_200.head(n=1)

In [None]:
# Number of rows assigned to each cluster label.
# NOTE(review): in the visible cells the 'Cluster' column is first assigned in
# the KMeans cell below, so on a fresh kernel with Run All this lookup would
# raise KeyError -- confirm the intended cell order.
df_encoded_with_tfidf_200['Cluster'].value_counts()

In [None]:
from sklearn.cluster import KMeans

# Partition the rows into 5 clusters with a fixed seed.
# NOTE(review): `X_pca` is not assigned in any visible cell -- confirm where
# it is computed before this cell runs.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_pca)

# labels_ holds the cluster index of every fitted row.
clusters = kmeans.labels_

df_encoded_with_tfidf_200['Cluster'] = clusters


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np

# Predict the cluster label from the encoded company attributes only.
X = df_encoded_with_tfidf_200[[
    'SECTOR_encoded',
    'PRODUCT CATEGORY_encoded',
    'COUNTRY_encoded',
    'BUSINESS MODEL_encoded',
    'CONGLOMERATE/Gruppo di riferimento_encoded',
    'Market segment_encoded',
    'Employees',
    'Core business_encoded',
    'Business segment_encoded',
    'Targets_encoded',
    'Listing_encoded',
    'Country_encoded',
    'Business model_encoded',
    'Conglomerate/ Company/ Gruppo di riferimento_encoded',
    'Non-financial disclosure_encoded'
]]
y = df_encoded_with_tfidf_200['Cluster']

# Stratified 80/20 split so every cluster appears in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversample only the training part; the test part stays untouched.
ros = SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Candidate models. Every estimator with a stochastic fit is seeded with the
# same random_state used elsewhere in this notebook so reruns are comparable.
classifiers = {
    "k-NN": KNeighborsClassifier(),
    "Naïve Bayes": GaussianNB(),
    "J48 Decision Tree": DecisionTreeClassifier(random_state=42),
    "Multilayer Perceptron": MLPClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Random Forest" : RandomForestClassifier(random_state=42)
}

for name, classifier in classifiers.items():
    # NOTE(review): cross-validation runs on the already-resampled training
    # set, so synthetic SMOTE points derived from a fold's held-out rows can
    # end up in that fold's training data; these scores may be optimistic.
    # The last cell of the notebook wraps SMOTE in an imblearn Pipeline,
    # which resamples inside each fold instead.
    scores = cross_val_score(classifier, X_train_resampled, y_train_resampled, cv=5)

    print(f"Cross-validation scores for {name}:")
    print("Mean:", np.mean(scores))
    print("Standard Deviation:", np.std(scores))

    # Refit on the full resampled training set and score on the held-out test set.
    classifier.fit(X_train_resampled, y_train_resampled)

    y_pred = classifier.predict(X_test)

    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")


In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the forest hyperparameters.

    Returns
    -------
    float
        Mean of the cross-validation scores on the resampled training set
        (`X_train_resampled`, `y_train_resampled` from the previous cell).
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    # 'int' / 'float' are placeholders: they select which numeric form of
    # max_features is sampled next.
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 'int', 'float'])
    if max_features == 'int':
        # An absolute feature count cannot usefully exceed the number of
        # columns (older scikit-learn releases reject it outright), so cap
        # the original upper bound of 20 at the width of the training matrix.
        n_features = X_train_resampled.shape[1]
        max_features = trial.suggest_int('max_features_int', 1, min(20, n_features))
    elif max_features == 'float':
        max_features = trial.suggest_float('max_features_float', 0.1, 0.9)

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # NOTE(review): the folds are cut from already-resampled data, so these
    # scores may be optimistic (see the pipeline-based CV in the last cell).
    scores = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5)

    return scores.mean()

# Seed the sampler so the search (and the reported best trial) is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print('Best trial:')
trial = study.best_trial
print('  Value: {:.3f}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold


# Same predictors and target as in the model-comparison cell.
rf_feature_columns = [
    'SECTOR_encoded',
    'PRODUCT CATEGORY_encoded',
    'COUNTRY_encoded',
    'BUSINESS MODEL_encoded',
    'CONGLOMERATE/Gruppo di riferimento_encoded',
    'Market segment_encoded',
    'Employees',
    'Core business_encoded',
    'Business segment_encoded',
    'Targets_encoded',
    'Listing_encoded',
    'Country_encoded',
    'Business model_encoded',
    'Conglomerate/ Company/ Gruppo di riferimento_encoded',
    'Non-financial disclosure_encoded',
]
X = df_encoded_with_tfidf_200[rf_feature_columns]
y = df_encoded_with_tfidf_200['Cluster']

# Oversampling is a pipeline step, so it is refit on the training part of
# every fold rather than applied once to the whole data set.
smote_step = SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)

# Fixed hyperparameters for the forest.
# NOTE(review): presumably copied from the best Optuna trial above -- confirm.
forest_step = RandomForestClassifier(
    n_estimators=187,
    max_depth=11,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=0.845665859920012,
    random_state=42,
)

pipeline = Pipeline(steps=[
    ('SMOTE', smote_step),
    ('RandomForestClassifier', forest_step),
])

# Shuffled, seeded, stratified 5-fold split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation of accuracy:", scores.std())
