In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
import nltk

In [None]:
# Load the previously exported dataset into `df`.
# NOTE(review): `df` is rebound by the aziende/esgs merge a few cells below before
# anything reads it, so this load looks redundant - confirm before removing.
df = pd.read_csv('df_final.csv')

In [None]:
# Read the two raw input tables (aziende.csv and esgs.csv) in one statement.
aziende_df, esgs_df = (pd.read_csv(csv_name) for csv_name in ('aziende.csv', 'esgs.csv'))

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index written by an earlier to_csv).
# errors='ignore' keeps the cell idempotent: re-running it once the column is gone
# is a no-op instead of raising KeyError.
aziende_df = aziende_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Preview the first two rows of aziende_df.
aziende_df.head(2)

In [None]:
# Preview the first two rows of esgs_df (before it is transposed).
esgs_df.head(2)

In [None]:
# Swap rows and columns of the ESG table.
# (A previous version also dropped 'Unnamed: 0' here; that step is disabled.)
# NOTE(review): this rebinding is not idempotent - running the cell a second time
# restores the original orientation.
esgs_df = esgs_df.T

In [None]:
# Rename 'Azione' to 'BRAND' so the column matches the key used by the merge below.
esgs_df = esgs_df.rename(columns={'Azione': 'BRAND'})

In [None]:
# Replace the value 1 with 2030 and keep the result. DataFrame.replace returns a new
# frame, so without the assignment the substitution was only displayed and never
# reached the merge below.
esgs_df = esgs_df.replace(1, 2030)
esgs_df.head()

In [None]:
# Inner-join the two tables on BRAND: only brands present in both frames are kept.
df = aziende_df.merge(esgs_df, how='inner', on='BRAND')

# Show the first rows to sanity-check the join.
df.head()

In [None]:
# Persist the merged table to disk.
# NOTE(review): with the default arguments the row index is written as an extra
# unnamed first column; pass index=False if that column is not wanted on reload.
df.to_csv('df1.csv')

In [None]:
# Tally how many rows (companies) fall into each sector.
sector_counts = df['SECTOR'].value_counts()

# Horizontal bar chart: one bar per sector, bar length = number of companies.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=sector_counts.values, y=sector_counts.index, palette="viridis", ax=ax)
ax.set_title('Number of Companies per Sector')
ax.set_xlabel('Number of Companies')
ax.set_ylabel('Sector')
plt.show()

In [None]:
# Flag companies that have at least one non-null value in the columns at positions
# 10..19 (the slice end is exclusive).
# NOTE(review): treating these positions as the environmental-goal columns is a
# placeholder assumption carried over from the original cell - replace with the
# real column selection.
goal_flags = df.iloc[:, 10:20].notna()
df['Environmental Goal Set'] = goal_flags.any(axis=1)

# Per sector, count the companies that have the flag set.
env_goals_by_sector = df.groupby('SECTOR')['Environmental Goal Set'].sum()

# Pie chart of that per-sector count.
fig, ax = plt.subplots(figsize=(10, 7))
env_goals_by_sector.plot(kind='pie', autopct='%1.1f%%', startangle=140, ax=ax)
ax.set_title('Distribution of Companies with Environmental Goals by Sector')
ax.set_ylabel('')  # a pie chart needs no y-axis label
plt.show()


In [None]:
# Placeholder selection of the environmental-goal columns (positions 10..19).
goal_columns = df.columns[10:20]

# For every goal column, count the rows with a non-null entry; largest first.
companies_per_goal = df[goal_columns].notna().sum().sort_values(ascending=False)

# Horizontal bar chart with the count written at the end of each bar.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=companies_per_goal.values, y=companies_per_goal.index, palette="coolwarm", ax=ax)
ax.set_title('Number of Companies per Environmental Goal')
ax.set_xlabel('Number of Companies')
ax.set_ylabel('Environmental Goals')
for bar_position, company_count in enumerate(companies_per_goal.to_numpy()):
    ax.text(company_count, bar_position, str(company_count))
plt.show()

In [None]:
# 2. Number of goals per company (top 20)
# Row-wise count of non-null goal columns, stored on df for the later plots.
goals_per_company = df.loc[:, goal_columns].notna().sum(axis='columns')
df['Total Goals'] = goals_per_company

# The 20 rows with the highest goal count.
top_companies = df.nlargest(n=20, columns='Total Goals')

# Horizontal bar chart, one bar per brand, annotated with the goal count.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_companies, x='Total Goals', y='BRAND', palette="magma", ax=ax)
ax.set_title('Top 20 Companies by Number of Environmental Goals')
ax.set_xlabel('Number of Environmental Goals')
ax.set_ylabel('Company')
for bar_position, goal_total in enumerate(top_companies['Total Goals']):
    ax.text(goal_total, bar_position, str(goal_total))
plt.show()


In [None]:
# 3. Average number of goals per sector
# Mean of 'Total Goals' within each sector, highest average first.
sector_means = df.groupby('SECTOR')['Total Goals'].mean()
average_goals_per_sector = sector_means.sort_values(ascending=False)

# Horizontal bar chart annotated with the mean rounded to two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=average_goals_per_sector.values,
    y=average_goals_per_sector.index,
    palette="cubehelix",
    ax=ax,
)
ax.set_title('Average Number of Environmental Goals per Sector')
ax.set_xlabel('Average Number of Environmental Goals')
ax.set_ylabel('Sector')
for bar_position, sector_mean in enumerate(average_goals_per_sector.to_numpy()):
    ax.text(sector_mean, bar_position, f"{sector_mean:.2f}")
plt.show()


In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk
from collections import Counter
import numpy as np

import string  # stdlib; used below to strip punctuation from tokens

# Fetch the NLTK stopword lists (a no-op when they are already present).
nltk.download('stopwords')

# Union of the English and Italian stopword lists.
stopwords_combined = set(stopwords.words('english')) | set(stopwords.words('italian'))

# Goal labels are the column names from position 20 onwards. Columns this notebook
# derives itself are appended to the end of df by other cells and therefore fall
# into that slice; they are not goals, so drop them when present to keep their
# words out of the term statistics.
derived_columns = ['Environmental Goal Set', 'Total Goals', 'Aggregated_Goals', 'Cluster']
goals = df.columns[20:].drop(derived_columns, errors='ignore')

# Basic preprocessing: lowercase, split on whitespace, strip surrounding punctuation,
# then drop stopwords and tokens that consisted of punctuation only.
processed_text = ' '.join(map(str, goals)).lower()
tokens = [word.strip(string.punctuation) for word in processed_text.split()]
tokens = [word for word in tokens if word and word not in stopwords_combined]

if tokens:
    # Word cloud of the remaining terms.
    wordcloud = WordCloud(
        stopwords=stopwords_combined,
        background_color="white",
        max_words=100,
        width=800,
        height=400,
    ).generate(' '.join(tokens))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    # Bar chart of the ten most frequent terms.
    word_counts = Counter(tokens)
    most_common_words = word_counts.most_common(10)  # Adjust as needed
    words, frequencies = zip(*most_common_words)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=np.array(frequencies), y=np.array(words), palette="viridis")
    plt.title('Top Terms in Environmental Goals')
    plt.xlabel('Frequency')
    plt.ylabel('Terms')
    for index, value in enumerate(frequencies):
        plt.text(value, index, f" {value}", va='center')
    plt.show()
else:
    # Both WordCloud.generate and zip(*[]) unpacking fail on empty input, so stop here.
    print('No goal terms left after preprocessing; nothing to plot.')


In [None]:
# Copy of df without the columns whose values are all missing.
# NOTE(review): df_dropped is not read by any other visible cell.
df_dropped = df.dropna(axis='columns', how='all')

In [None]:
# Show the name of the column at position 20, where the `df.columns[20:]` slices
# used by neighbouring cells begin.
df.columns[20]

In [None]:
# Display the current state of df.
# NOTE(review): this renders the whole frame; df.head() would keep the output small.
df

In [None]:
# Fetch the NLTK stopword corpus (already requested by an earlier cell; repeating it
# is harmless) and keep the Italian list, which the TfidfVectorizer cell below
# passes as `stop_words`.
nltk.download('stopwords')
italian_stopwords = stopwords.words('italian')

In [None]:
# Goal columns start at position 20. Columns derived by this notebook are appended
# to the end of df and therefore fall into that slice; they are never null, so
# without this filter their names would be added to every brand's goal text and
# leak into the TF-IDF features and the clustering.
derived_columns = ['Environmental Goal Set', 'Total Goals', 'Aggregated_Goals', 'Cluster']
goal_columns = df.columns[20:].drop(derived_columns, errors='ignore')

# One comma-separated string per row listing the goal columns that are non-null.
goal_is_set = df[goal_columns].notna().to_numpy()
df['Aggregated_Goals'] = [
    ', '.join(map(str, goal_columns[row_mask])) for row_mask in goal_is_set
]
df.head()

In [None]:
# Turn each row's aggregated goal text into TF-IDF features, ignoring the Italian
# stopwords. X has one row per row of df and is the input to the KMeans cell below.
vectorizer = TfidfVectorizer(stop_words=italian_stopwords)
X = vectorizer.fit_transform(df['Aggregated_Goals'])
X

In [None]:
# Fit a 5-cluster KMeans (fixed seed) on the TF-IDF matrix and read the label
# assigned to each row from the fitted model.
kmeans = KMeans(n_clusters=5, random_state=42).fit(X)
clusters = kmeans.labels_
print(clusters)

In [None]:
# Store the KMeans label of each row on the frame.
df['Cluster'] = clusters

# Step 4: visualization - number of brands per cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Cluster', palette='Set2', ax=ax)
ax.set_title('Distribution of Brands Across Clusters')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Brands')
plt.show()

In [None]:
# Print the cluster sizes twice: first as absolute counts (normalize=False), then as
# proportions of all rows (normalize=True).
for as_proportion in (False, True):
    print(df['Cluster'].value_counts(normalize=as_proportion))


In [None]:
# For each cluster (in ascending label order) print the five most frequent
# aggregated-goal strings among its rows.
for cluster, cluster_data in df.groupby('Cluster'):
    print(f"\nCluster {cluster} common goals:")
    print(cluster_data['Aggregated_Goals'].value_counts().head(5))


In [None]:
# Scatter of every row: position in the frame on x, number of goals on y, coloured
# by cluster label.
fig, ax = plt.subplots()
sns.scatterplot(
    data=df,
    x=df.index,
    y='Total Goals',
    hue='Cluster',
    palette='viridis',
    legend='full',
    s=100,  # marker size
    ax=ax,
)
ax.set_title('Visual Map of Brands by Number of Goals and Cluster')
ax.set_xlabel('Brand Index')
ax.set_ylabel('Total Goals')
ax.legend(title='Cluster')
plt.show()

In [None]:
sns.set_style("whitegrid")  # global seaborn style for this and later plots

# Grouped count plot: one group per cluster, one bar per sector.
ax = sns.countplot(data=df, x='Cluster', hue='SECTOR')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.title('Distribution of Sectors in Clusters')
plt.xticks(rotation=45)
plt.legend(title='Sector')

# Write the count above each bar. Patches without a positive finite height
# (cluster/sector combinations with no brands, or placeholder patches) are skipped
# so the chart is not littered with "0" or "nan" labels.
for p in ax.patches:
    height = p.get_height()
    if not np.isfinite(height) or height <= 0:
        continue
    ax.annotate(format(height, '.0f'),
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center',
                xytext=(0, 9),
                textcoords='offset points')

plt.show()


In [None]:
import pandas as pd
# Reload the exported dataset from disk, replacing the in-memory `df`.
# NOTE(review): no visible cell writes 'df_final.csv' (only 'df1.csv' is written) -
# confirm where this file comes from and that it contains the 'Cluster' column used below.
df = pd.read_csv('df_final.csv')

In [None]:
# train_test_split is otherwise only imported in a later cell, so on a fresh
# top-to-bottom run this cell raised NameError.
from sklearn.model_selection import train_test_split

# Raw (not yet encoded) descriptive columns as features, cluster label as target.
X = df[['SECTOR', 'PRODUCT CATEGORY', 'COUNTRY', 'BUSINESS MODEL',
        'CONGLOMERATE/Gruppo di riferimento', 'Market segment',
        'Core business', 'Business segment', 'Targets', 'Listing',
        'Country', 'Business model', 'Conglomerate/ Company/ Gruppo di riferimento',
        'Employees', 'Fatturato (Euro) FY, 2022', 'Utile lordo FY, 2022',
        'Utile/perdita FY, 2022', 'Non-financial disclosure']]
y = df['Cluster']

# 80/20 split with a fixed seed.
# NOTE(review): X, y and the four split variables are all reassigned by the
# classifier-comparison cell further down before being used.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [None]:
# Number of missing values in each column of df.
df.isna().sum()

In [None]:
# Column names of df as a plain Python list, displayed for reference.
df_cols = df.columns.tolist()
df_cols

In [None]:
# Columns kept for the prediction frame; the last entry is the cluster label.
pred_cols = [
    'BRAND', 'ID', 'SECTOR', 'PRODUCT CATEGORY', 'COUNTRY', 'BUSINESS MODEL',
    'CONGLOMERATE/Gruppo di riferimento', 'Market segment', 'Core business',
    'Business segment', 'Targets', 'Listing', 'Country', 'Business model',
    'Conglomerate/ Company/ Gruppo di riferimento', 'Employees',
    'Fatturato (Euro) FY, 2022', 'Utile lordo FY, 2022', 'Utile/perdita FY, 2022',
    'Non-financial disclosure', 'Cluster',
]

# Select those columns (all rows) and preview the first two rows.
df_pred = df.loc[:, pred_cols]
df_pred.head(2)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns holding text or categorical data.
categorical_cols = df_pred.select_dtypes(include=['object', 'category']).columns

# Label-encode each of them (values are cast to str first so missing values and
# mixed types encode consistently). The encoded columns are attached via `assign`,
# which returns a new frame: df_pred was created as a selection from df, and
# assigning into it column by column triggers pandas' SettingWithCopyWarning.
df_pred = df_pred.assign(**{
    col: LabelEncoder().fit_transform(df_pred[col].astype(str))
    for col in categorical_cols
})


In [None]:
# Read 'df_final.csv' once more, this time into df1 (the same file is also loaded
# into `df` by earlier cells).
df1 = pd.read_csv('df_final.csv')

In [None]:
# Columns carried into the aggregated frame. The selection was previously commented
# out, which left df_agg undefined and made this and the following cells fail with
# NameError on a fresh kernel.
agg_columns = [
    'BRAND', 'ID', 'SECTOR', 'PRODUCT CATEGORY', 'COUNTRY', 'BUSINESS MODEL',
    'CONGLOMERATE/Gruppo di riferimento', 'Market segment', 'Core business',
    'Business segment', 'Targets', 'Listing', 'Country', 'Business model',
    'Conglomerate/ Company/ Gruppo di riferimento', 'Employees',
    'Fatturato (Euro) FY, 2022', 'Utile lordo FY, 2022', 'Utile/perdita FY, 2022',
    'Non-financial disclosure', 'Environmental Goal Set', 'Total Goals',
    'Aggregated_Goals',
]

# .copy() makes df_agg an independent frame, so the column assignments in the
# encoding cell below do not act on a view of df1.
df_agg = df1[agg_columns].copy()
df_agg.head(1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to label-encode.
categorical_columns = ['BRAND', 'ID', 'SECTOR', 'PRODUCT CATEGORY', 'COUNTRY', 'BUSINESS MODEL',
                       'CONGLOMERATE/Gruppo di riferimento', 'Market segment', 'Core business',
                       'Business segment', 'Targets', 'Listing', 'Country', 'Business model',
                       'Conglomerate/ Company/ Gruppo di riferimento', 'Non-financial disclosure',
                       'Environmental Goal Set']

# Work on an independent frame so the assignments below never hit a view.
df_agg = df_agg.copy()

# One record per (column, encoded label, original value), collected for inspection.
label_value_mapping = []

for col in categorical_columns:
    # Missing values become the literal string 'NaN' so they get their own label.
    df_agg[col] = df_agg[col].fillna('NaN')
    le = LabelEncoder()
    # Cast to str before encoding: a non-string column (numeric or boolean) that
    # had gaps is a mix of its own type and the 'NaN' string after fillna, which
    # LabelEncoder refuses to sort.
    df_agg[col + '_encoded'] = le.fit_transform(df_agg[col].astype(str))
    label_value_mapping.extend(
        {'Column': col, 'Encoded_Label': encoded_label, 'Original_Value': original_value}
        for encoded_label, original_value in enumerate(le.classes_)
    )

label_value_mapping_df = pd.DataFrame(label_value_mapping)

print(label_value_mapping_df)


In [None]:
# Show the column names of df_agg, including the '<name>_encoded' columns added by
# the encoding cell above.
list(df_agg.columns)

In [None]:
# Model-ready selection: the label-encoded columns plus 'Employees', 'Total Goals'
# and the raw goal text.
encoded_columns = [
    'BRAND_encoded',
    'ID_encoded',
    'SECTOR_encoded',
    'PRODUCT CATEGORY_encoded',
    'COUNTRY_encoded',
    'BUSINESS MODEL_encoded',
    'CONGLOMERATE/Gruppo di riferimento_encoded',
    'Market segment_encoded',
    'Employees',
    'Core business_encoded',
    'Business segment_encoded',
    'Targets_encoded',
    'Listing_encoded',
    'Country_encoded',
    'Business model_encoded',
    'Conglomerate/ Company/ Gruppo di riferimento_encoded',
    'Non-financial disclosure_encoded',
    'Environmental Goal Set_encoded',
    'Total Goals',
    'Aggregated_Goals',
]

# .copy() gives an independent frame: later cells assign new and modified columns
# to df_encoded, which on a plain selection from df_agg triggers pandas'
# SettingWithCopyWarning.
df_encoded = df_agg[encoded_columns].copy()

In [None]:
# Preview the first two rows of df_encoded.
df_encoded.head(2)

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Fill the gaps in 'Employees' with the column median.
imputer = SimpleImputer(strategy='median')

# SimpleImputer works on 2-D input, so pass the column as an (n, 1) array and
# flatten the result back to one value per row.
column_values = df_encoded[['Employees']].to_numpy()
df_encoded['Employees'] = imputer.fit_transform(column_values).ravel()

df_encoded.head(2)


In [None]:
# Cast 'Employees' to integers. The cast needs a column without missing values,
# which the median imputation in the preceding cell provides.
df_encoded['Employees'] = df_encoded['Employees'].astype(int)


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Resources for word_tokenize and stopword filtering. Recent NLTK releases load the
# Punkt sentence model from the 'punkt_tab' package while older ones use 'punkt',
# so request both; asking for a package an installation does not know only logs a
# message.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

english_stopwords = set(stopwords.words('english'))
# Left as a list, matching the earlier stopwords cell: the TfidfVectorizer cell
# passes this name as `stop_words`, and rebinding it to a set here would change
# what that cell receives when it is re-run.
italian_stopwords = stopwords.words('italian')

# Combined English + Italian stopword set used by preprocess_text.
all_stopwords = english_stopwords.union(italian_stopwords)

def preprocess_text(text):
    """Tokenize `text`, lowercase it and drop stopwords and non-alphanumeric tokens.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN that an empty goal
        string turns into after a CSV round-trip) is treated as empty text.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces; '' when nothing
        is left or the input was not a string.
    """
    if not isinstance(text, str):
        # word_tokenize raises TypeError on float NaN; treat missing text as empty.
        return ''
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in all_stopwords]
    return ' '.join(filtered_tokens)

# Cleaned goal text per row, used as input for the TF-IDF features below.
df_encoded['Aggregated_Goals_processed'] = df_encoded['Aggregated_Goals'].apply(preprocess_text)

In [None]:
# Preview df_encoded again, now with the 'Aggregated_Goals_processed' column.
df_encoded.head(2)

In [None]:
# Show the column names of the combined encoded + TF-IDF frame.
# NOTE(review): df_encoded_with_tfidf_200 is first assigned in the next cell, so on
# a fresh top-to-bottom run this cell raises NameError - move it below that cell.
list(df_encoded_with_tfidf_200.columns)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned goal text, capped at 200 features.
tfidf_vect = TfidfVectorizer(max_features=200)
X_tfidf_200 = tfidf_vect.fit_transform(df_encoded['Aggregated_Goals_processed'])

# Dense frame with one column per TF-IDF term.
tfidf_df_200 = pd.DataFrame(
    data=X_tfidf_200.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)

# Give df_encoded a fresh 0..n-1 index so it lines up row by row with the TF-IDF
# frame, then place the two side by side.
df_encoded = df_encoded.reset_index(drop=True)
df_encoded_with_tfidf_200 = pd.concat([df_encoded, tfidf_df_200], axis='columns')


In [None]:
# Preview the first row of the combined encoded + TF-IDF frame.
df_encoded_with_tfidf_200.head(1)

In [None]:
# Number of rows per cluster label.
# NOTE(review): the visible cells assign the 'Cluster' column of this frame only in
# the next cell, so on a fresh top-to-bottom run this lookup appears to fail with
# KeyError - move it below the clustering cell.
df_encoded_with_tfidf_200['Cluster'].value_counts()

In [None]:
from sklearn.cluster import KMeans

# Cluster the rows into 5 groups (fixed seed) and store the label on the frame.
# NOTE(review): X_pca is not defined in any visible cell - presumably it came from a
# PCA cell that was removed. Restore that step (or cluster on another matrix);
# otherwise this cell raises NameError on a fresh kernel.
kmeans = KMeans(n_clusters=5, random_state=42) 
clusters = kmeans.fit_predict(X_pca)

df_encoded_with_tfidf_200['Cluster'] = clusters


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np

# Sampler-aware pipeline: its sampler steps run during fit only, never on predict.
from imblearn.pipeline import Pipeline

# Features: the encoded descriptive columns plus 'Employees'. Target: cluster label.
X = df_encoded_with_tfidf_200[[
    'SECTOR_encoded',
    'PRODUCT CATEGORY_encoded',
    'COUNTRY_encoded',
    'BUSINESS MODEL_encoded',
    'CONGLOMERATE/Gruppo di riferimento_encoded',
    'Market segment_encoded',
    'Employees',
    'Core business_encoded',
    'Business segment_encoded',
    'Targets_encoded',
    'Listing_encoded',
    'Country_encoded',
    'Business model_encoded',
    'Conglomerate/ Company/ Gruppo di riferimento_encoded',
    'Non-financial disclosure_encoded'
]]
y = df_encoded_with_tfidf_200['Cluster']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversampled copy of the training set. Kept because other cells reference these
# names; the evaluation below resamples inside each training fold instead.
ros = SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Stochastic models get a fixed seed so the comparison is repeatable.
classifiers = {
    "k-NN": KNeighborsClassifier(),
    "Naïve Bayes": GaussianNB(),
    "J48 Decision Tree": DecisionTreeClassifier(random_state=42),
    "Multilayer Perceptron": MLPClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Random Forest" : RandomForestClassifier(random_state=42)
}

for name, classifier in classifiers.items():
    # SMOTE is a pipeline step so that, during cross-validation, synthetic samples
    # are generated from each training fold only. Oversampling before splitting
    # lets points interpolated from validation rows leak into training and
    # inflates the scores.
    # NOTE: with k_neighbors=2 every class needs at least 3 samples in each
    # training fold.
    model = Pipeline([
        ('smote', SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)),
        ('classifier', classifier),
    ])
    scores = cross_val_score(model, X_train, y_train, cv=5)

    print(f"Cross-validation scores for {name}:")
    print("Mean:", np.mean(scores))
    print("Standard Deviation:", np.std(scores))

    # Fit on the full (internally oversampled) training set, score on the
    # untouched test set.
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")


In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def objective(trial):
    """Optuna objective: mean 5-fold CV score of a random forest with sampled settings.

    Reads the notebook-level `X_train` / `y_train` produced by the train/test split.
    SMOTE runs as a pipeline step, so oversampling happens inside each training
    fold and the validation folds contain only real samples.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the five cross-validation scores (the estimator's default scorer).
    """
    from imblearn.pipeline import Pipeline  # sampler-aware pipeline

    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    # 'int' and 'float' are placeholders that select a numeric max_features below.
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 'int', 'float'])
    if max_features == 'int':
        # An integer max_features cannot usefully exceed the number of features.
        max_features = trial.suggest_int('max_features_int', 1, min(20, X_train.shape[1]))
    elif max_features == 'float':
        max_features = trial.suggest_float('max_features_float', 0.1, 0.9)

    clf = Pipeline([
        ('smote', SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)),
        ('classifier', RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42
        )),
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5)

    return scores.mean()

# Maximize the objective over 100 trials. The sampler is seeded so the search, and
# therefore the best parameters copied into the final model, can be reproduced.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Report the best trial: its score and the sampled parameters.
print('Best trial:')
trial = study.best_trial
print('  Value: {:.3f}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold


# Feature columns: the encoded descriptive attributes plus 'Employees'.
feature_columns = [
    'SECTOR_encoded',
    'PRODUCT CATEGORY_encoded',
    'COUNTRY_encoded',
    'BUSINESS MODEL_encoded',
    'CONGLOMERATE/Gruppo di riferimento_encoded',
    'Market segment_encoded',
    'Employees',
    'Core business_encoded',
    'Business segment_encoded',
    'Targets_encoded',
    'Listing_encoded',
    'Country_encoded',
    'Business model_encoded',
    'Conglomerate/ Company/ Gruppo di riferimento_encoded',
    'Non-financial disclosure_encoded',
]
X = df_encoded_with_tfidf_200[feature_columns]
y = df_encoded_with_tfidf_200['Cluster']

# Oversampling step followed by a random forest with fixed hyperparameters
# (presumably copied from a tuning run - confirm against the Optuna output).
smote_step = SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=42)
forest_step = RandomForestClassifier(
    n_estimators=187,
    max_depth=11,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=0.845665859920012,
    random_state=42,
)
pipeline = Pipeline([
    ('SMOTE', smote_step),
    ('RandomForestClassifier', forest_step),
])

# Shuffled, stratified 5-fold cross-validation on the whole data set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation of accuracy:", scores.std())
