<a href="https://colab.research.google.com/github/altaki/Data-Science-Projects/blob/main/Code_Batoul.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the survey responses from the Excel workbook
survey_data = pd.read_excel('Ba.xlsx')

# Tally the missing values per column and report them
nan_counts = survey_data.isnull().sum()
print("NaN counts in the dataset:", nan_counts, sep="\n")

NaN counts in the dataset:
N°Obs                                                        0
2. 1. Avez-vous déjà vu une campagne de produit partage ?    0
4. 3. Avez-vous déjà acheté un produit partage ?             0
6. Les produits de Volvic me rassurent                       0
7. J’ai confiance dans la qualité des produits de Volvic     0
                                                            ..
Genre                                                        0
Situation_familiale                                          0
Niveau_etude                                                 0
Profession                                                   0
Revenu_net_mensuel                                           0
Length: 76, dtype: int64


In [None]:
# Handle NaN values by imputing each column with its mean.
# numeric_only=True restricts the means to numeric columns: since pandas 2.0
# DataFrame.mean() no longer skips non-numeric columns silently and raises a
# TypeError if one is present. Non-numeric columns are left untouched.
if nan_counts.sum() > 0:
    survey_data = survey_data.fillna(survey_data.mean(numeric_only=True))

In [None]:

# Demographic attributes used as the model inputs
demographic_columns = ['Genre', 'Profession', 'Situation_familiale', 'Age', 'Niveau_etude']

# Every remaining column of the sheet is treated as a prediction target.
# NOTE(review): this also sweeps in columns such as 'N°Obs' and
# 'Revenu_net_mensuel' (both listed in the NaN report) — confirm that
# predicting them is intended.
is_question = ~survey_data.columns.isin(demographic_columns)
question_columns = survey_data.columns[is_question].tolist()

# How many synthetic respondents to create later on
new_individuals_count = 300

# Inputs (demographics) and targets (everything else)
X = survey_data.loc[:, demographic_columns]
y = survey_data.loc[:, question_columns]

# Fit a multi-output random forest with default hyper-parameters
regressor = RandomForestRegressor().fit(X, y)

In [None]:
# Target share of each category code, per demographic attribute only.
# NOTE(review): the 'Age' shares add up to 0.75 rather than 1.0 — confirm
# whether a category is missing from this table.
quotas = {
    'Age': {
        1: 0.08,
        2: 0.12,
        3: 0.12,
        4: 0.13,
        5: 0.13,
        6: 0.17,
    },
    'Genre': {
        1: 0.48,  # 1: Male
        2: 0.52,  # 2: Female
    },
    'Situation_familiale': {
        1: 0.4,
        2: 0.4,
        3: 0.2,
        4: 0.0,
    },
    'Niveau_etude': {
        1: 0.0,
        2: 0.0,
        3: 0.2,
        4: 0.23,
        5: 0.27,
        6: 0.3,
    },
    'Profession': {
        1: 0.18,
        2: 0.12,
        3: 0.13,
        4: 0.0,
        5: 0.15,
        6: 0.25,
        7: 0.0,
        8: 0.12,
        9: 0.05,
    },
}

In [None]:
# Function to generate new individuals with specified quotas
def generate_new_individuals(quotas, total_count):
    """Build synthetic demographic profiles whose columns follow the given quotas.

    Each column is filled independently: the shares of its categories are
    converted to whole head-counts that sum to exactly ``total_count``, the
    category values are repeated accordingly, and the column is shuffled so
    that the columns are not aligned with each other.

    Head-counts are allocated with the largest-remainder method: every
    category first receives the floor of its exact share, then the units
    still missing go to the categories with the largest fractional parts.
    This keeps every count within one unit of its exact share and never
    hands individuals to a category whose quota is 0.

    Args:
        quotas: Mapping ``column name -> {category: share}``. Shares must be
            non-negative. If a column's shares do not sum to 1 they are
            rescaled proportionally and a warning is issued.
        total_count: Number of individuals (rows) to generate; a
            non-negative integer.

    Returns:
        A ``pandas.DataFrame`` with one column per key of ``quotas`` and
        ``total_count`` rows.

    Raises:
        ValueError: If ``total_count`` is negative, or a column has no
            categories, a negative share, or shares that do not add up to a
            positive number.
    """
    import warnings

    if total_count < 0:
        raise ValueError(f"total_count must be non-negative, got {total_count}")

    new_data = {}
    for column, column_quotas in quotas.items():
        categories = list(column_quotas)
        weights = np.array([column_quotas[c] for c in categories], dtype=float)

        # `not ... > 0` (rather than `<= 0`) also rejects a NaN total.
        if weights.size == 0 or np.any(weights < 0) or not weights.sum() > 0:
            raise ValueError(
                f"Quotas for {column!r} must be non-negative and sum to a positive value"
            )

        weight_sum = weights.sum()
        if not np.isclose(weight_sum, 1.0):
            warnings.warn(
                f"Quotas for {column!r} sum to {weight_sum:.4g}, not 1; "
                "rescaling them proportionally",
                stacklevel=2,
            )

        # Exact (fractional) head-count of every category after rescaling.
        exact = weights / weight_sum * total_count
        counts = np.floor(exact).astype(int)

        # Hand out the units lost to flooring, largest fractional part first.
        # The stable sort breaks ties in favour of the earlier category.
        leftover = total_count - int(counts.sum())
        if leftover > 0:
            by_remainder = np.argsort(counts - exact, kind="stable")
            counts[by_remainder[:leftover]] += 1

        values = []
        for category, count in zip(categories, counts):
            values.extend([category] * int(count))

        # Shuffle each column on its own so the columns are not aligned.
        np.random.shuffle(values)
        new_data[column] = values

    return pd.DataFrame(new_data)

In [None]:
# Draw the synthetic demographic profiles according to the quota table
new_individuals_demographics = generate_new_individuals(
    quotas=quotas,
    total_count=new_individuals_count,
)

In [None]:
# Generate the new individuals based on quotas
new_individuals_demographics = generate_new_individuals(quotas, new_individuals_count)

# Predict the answers to the survey questions from the demographics alone.
# The columns are passed in the same order the model was trained on.
predicted_answers = regressor.predict(new_individuals_demographics[demographic_columns])

# Round to the nearest integer code before casting. A bare astype(int)
# truncates toward zero (3.9 -> 3), which would bias every answer downward.
predicted_codes = np.rint(predicted_answers).astype(int)

# Assemble the new individuals: demographic columns first, then the predicted
# answers. No random placeholder answers are created, because the predictions
# fill every question column anyway.
new_individuals = pd.concat(
    [
        new_individuals_demographics.reset_index(drop=True),
        pd.DataFrame(predicted_codes, columns=question_columns),
    ],
    axis=1,
)

# Append the new individuals below the original respondents
combined_data = pd.concat([survey_data, new_individuals], ignore_index=True)

# Sort the combined data by index (already 0..n-1 because of ignore_index=True;
# kept so that `combined_data_sorted` stays a separate object)
combined_data_sorted = combined_data.sort_index()

# Export the sorted combined data to an Excel sheet
combined_data_sorted.to_excel('combined_data_sorted.xlsx', index=False)

In [None]:
# Show the (rows, columns) dimensions of the combined dataset
combined_data_sorted.shape

# Correlation between features
To identify which variables or features are correlated with each other, especially in a large dataset with many questions, you can calculate the correlation matrix and then filter it to find pairs of variables with high correlation values. This can be done efficiently using a for loop or by leveraging pandas' capabilities to filter and sort the correlation matrix.

Here's how you can achieve this:

1. Calculate the correlation matrix.
2. Unstack the matrix to turn it into a long format.
3. Filter the pairs to find those with high correlation values.
4. Optionally, visualize the correlations for better understanding.

Below is the Python code to perform these steps:

In [None]:
# Short alias (same object, not a copy) for the combined dataset used by the
# correlation analysis that follows
df = combined_data_sorted

In [None]:
# Calculate the correlation matrix.
# numeric_only=True keeps this working on pandas >= 2.0, where corr() raises
# instead of silently skipping a non-numeric column.
corr_matrix = df.corr(numeric_only=True)

# The matrix is symmetric and its diagonal holds the self-correlations, so only
# the strict lower triangle is kept: each unordered pair then appears exactly
# once instead of twice, as (A, B) and (B, A), and self-pairs are excluded.
lower_triangle = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)

# Unstack the masked matrix to turn it into a long format with one row per pair
corr_pairs = corr_matrix.where(lower_triangle).unstack().reset_index()
corr_pairs.columns = ['Feature1', 'Feature2', 'correlation']

# Drop the masked-out cells, together with any undefined (NaN) correlations
corr_pairs = corr_pairs.dropna(subset=['correlation'])

# Find highly correlated pairs (correlation > 0.8 or correlation < -0.8)
high_corr_pairs = corr_pairs[corr_pairs['correlation'].abs() > 0.8]

# Sort by correlation value, strongest positive first
high_corr_pairs = high_corr_pairs.sort_values(by='correlation', ascending=False)

# Print the highly correlated pairs
print("Highly correlated pairs:")
print(high_corr_pairs)

(450, 76)