In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from math import pi
import pandas as pd
import os

In [None]:
# Set the working directory to the folder that contains your data files.
# Replace the placeholder path below with your own location.
os.chdir('path_to_your_data')

In [None]:
# Load the survey dataset into a DataFrame (the file is read as ISO-8859-1).
# NOTE(review): this is a relative path and the working directory was already
# changed with os.chdir(...) in the cell above, so the folder prefix here is
# resolved inside that folder - confirm the two placeholders do not double up.
data = pd.read_csv('path_to_your_data/DATA.csv', encoding='ISO-8859-1')

In [None]:
# Print the column names to confirm the file was read as expected
print(data.columns)

**PCA Analysis next**


Before performing Principal Component Analysis (PCA):

- It is important to ensure that your data is clean.
  
  This may include excluding non-numeric identifiers, such as the ID column.
  
- It is crucial to standardize the numeric data to ensure all variables are on a similar scale. PCA is sensitive to the magnitude of the variables, meaning that features with larger ranges could dominate the analysis and distort the results. Standardization transforms the data so that each variable has a mean of 0 and a standard deviation of 1, ensuring that all variables contribute equally to the analysis.

In the steps below, we use the StandardScaler from the sklearn library to standardize the numeric data. The process involves:

- Applying the scaler to transform the numeric dataset into a standardized format suitable for PCA.
This ensures that the subsequent PCA will be based purely on the relative relationships among variables, rather than being influenced by differing scales or units.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
from factor_analyzer import FactorAnalyzer

In [None]:
# Print, for every column, True if it contains at least one missing value
print(data.isnull().any())

In [None]:
# Report the number of missing values per column, then the column dtypes,
# before anything is converted.
print(data.isnull().sum())
print(data.dtypes)

# Force every column to a numeric dtype. Values that cannot be parsed as
# numbers are turned into NaN instead of raising an error.
data = data.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Count missing values again so newly coerced NaNs become visible.
print(data.isnull().sum())


Depending on the format of your data, you may need to exclude the first column (for example, an ID column) from the analysis. The cell below shows two ways to do this.

In [None]:
# Exclude the first column (e.g., an ID column) from the analysis.
# Use exactly ONE of the two options below. Running both would remove the ID
# column and then also drop the first real feature column.

# Option A (default): move the first column into the index. Every column that
# remains in `data` is then a feature.
data.set_index(data.columns[0], inplace=True)
data_features = data.copy()

# Option B: keep the default index and drop the first column by position.
# To use it, comment out the two Option A lines above and uncomment this line.
# data_features = data.iloc[:, 1:].copy()

In [None]:
# Standardize the numeric data so every variable has mean 0 and standard
# deviation 1 before PCA.
#
# Only drop the first column when it is still the ID column. If the ID was
# already moved into the index with set_index(...), the index is no longer the
# default RangeIndex and dropping by position would remove a real feature.
# The .copy() makes data2 an independent DataFrame, so columns can be added to
# it later without modifying (or warning about) `data`.
if isinstance(data.index, pd.RangeIndex):
    data2 = data.iloc[:, 1:].copy()  # Exclude the 'ID' column
else:
    data2 = data.copy()  # ID is already the index; keep every column
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data2)

In [None]:
# Apply PCA - test out how different numbers of components impact your results.
# The model is fitted on the standardized data (data_scaled), not on the raw
# values, so that variables with larger ranges do not dominate the components.
# The eigenvalues and explained-variance figures reported in the cells below
# come from this fit. (The varimax rotation is done in the next cell.)
pca = PCA(n_components=10)  # Adjust components as needed (start with 10 then change it to see the result)
pca_result = pca.fit_transform(data_scaled)

In [None]:


# Fit a 10-factor model with varimax rotation on the standardized data.
# NOTE(review): FactorAnalyzer is a factor-analysis estimator; unless a
# principal-component extraction method is requested explicitly, the "Rotated
# PC" column names below may not describe PCA loadings - confirm.
fa = FactorAnalyzer(n_factors=10, rotation='varimax')
fa.fit(data_scaled)

# Rotated loadings: one row per variable, one column per factor
rotated_loadings = fa.loadings_

# Label the rows with the variable names and the columns as Rotated PC1..PCn
rotated_loadings_df = pd.DataFrame(
    data=rotated_loadings,
    index=data2.columns,
    columns=[f'Rotated PC{k}' for k in range(1, rotated_loadings.shape[1] + 1)],
)

# Write the table to disk (variable names are kept as the index column)
rotated_loadings_df.to_csv('rotated_pca_loadings.csv', index=True)

# Show the table
print(rotated_loadings_df)


In [None]:
# Eigenvalue of each component (the variance it explains)
eigenvalues = pca.explained_variance_

# Share of total variance per component, and the running total of those shares
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

# Assemble one row per principal component
variance_df = pd.DataFrame(
    {
        'Principal Component': [f'PC{k}' for k in range(1, len(explained_variance) + 1)],
        'Eigenvalue': eigenvalues,
        'Explained Variance': explained_variance,
        'Cumulative Explained Variance': cumulative_explained_variance,
    }
)

# Show the table
print(variance_df)

# Write the table to disk without the row index
variance_df.to_csv('pca_variance_explained.csv', index=False)



**Scree Plot**


The Scree Plot is a key visualization tool in Principal Component Analysis (PCA) that helps determine the number of principal components to retain for meaningful analysis. It plots the eigenvalues of the components in descending order against their corresponding principal component numbers. Eigenvalues represent the amount of variance explained by each principal component, and the plot helps identify where the explained variance begins to level off, often referred to as the "elbow point." This point indicates the optimal number of components to retain, balancing dimensionality reduction with the preservation of data variance. In this section, we create a Scree Plot to guide the selection of principal components for further analysis.

In [None]:
# Scree plot: eigenvalue of each principal component, numbered from 1

plt.figure(figsize=(10, 6))
plt.plot(
    range(1, len(eigenvalues) + 1),
    eigenvalues,
    linestyle='--',
    marker='o',
)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()


**Cumulative Explained Variance Plot**


The Cumulative Explained Variance Plot is an essential tool in Principal Component Analysis (PCA) for understanding how much of the total variance in the data is captured by the principal components. It displays the cumulative proportion of variance explained as the number of components increases. This plot helps identify the minimum number of components required to capture a desired level of variance (e.g., 90% or 95%). By examining this plot, we can make informed decisions about the dimensionality reduction, ensuring that the selected components retain the majority of the information from the original dataset while reducing complexity. In this section, we calculate and visualize the cumulative explained variance to guide the selection of principal components.

In [None]:
# Running total of the explained-variance ratios
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Plot the running total against the number of components, numbered from 1
plt.figure(figsize=(10, 6))
plt.plot(
    range(1, len(cumulative_variance) + 1),
    cumulative_variance,
    linestyle='--',
    marker='o',
)
plt.title('Cumulative Explained Variance Plot')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
plt.show()


**The Biplot**

The Biplot is a powerful visualization in Principal Component Analysis (PCA) that combines information about both the observations and the variables. It provides insights into the relationships among variables, the distribution of observations, and how the variables contribute to the principal components.

In this biplot:

- The scores (dots) represent the projections of the observations onto the first two principal components (PC1 and PC2).
- The loadings (arrows) show the contributions of the original variables to these principal components. The direction and length of the arrows indicate the variable's influence and its relationship with the components.
  
This visualization helps interpret the principal components by linking them back to the original variables, making it easier to understand how the reduced dimensions capture the variability in the dataset. In this section, we construct a biplot for the first two principal components to explore these relationships visually. You can plot different components to get a better understanding of your data.

In [None]:


# Step 1: Refit the PCA on the standardized data and keep the scores of the
# first two components
pc_scores = pca.fit_transform(data_scaled)[:, :2]

# Step 2: Component weights of every variable on the first two components
# (one row per variable)
loadings = pca.components_[:2].T

# Step 3: Draw the observations
plt.figure(figsize=(12, 8))
plt.scatter(
    pc_scores[:, 0],
    pc_scores[:, 1],
    alpha=0.6,
    color='gray',
    edgecolor='k',
    label='Observations',
)

# Draw one arrow per variable from the origin, with the variable name placed
# slightly beyond the arrow tip
for variable, (x, y) in zip(data2.columns, loadings):
    plt.arrow(0, 0, x, y, color='red', alpha=0.7)
    plt.text(x * 1.15, y * 1.15, variable, color='red', ha='center', va='center')

# Axis labels, title, zero reference lines, grid and legend
plt.title('Biplot of the First Two Principal Components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.axhline(0, color='black', linewidth=0.5)
plt.axvline(0, color='black', linewidth=0.5)
plt.grid()
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Cosmetic factor that lengthens the arrows so they are easier to tell apart
scaling_factor = 3  # Increase to separate the arrows

# Minimum absolute (unscaled) loading a variable needs on PC1 or PC2 to be
# drawn. The filter is applied to the unscaled loadings so that changing
# scaling_factor only changes arrow length, never which variables are shown.
# 0.1 reproduces the previous selection (0.3 on loadings scaled by 3).
loading_threshold = 0.1

# Step 1: Extract PC scores (first two principal components)
pc_scores = pca.fit_transform(data_scaled)[:, :2]

# Step 2: Extract loadings, then scale a copy of them for drawing
raw_loadings = pca.components_.T[:, :2]
loadings = raw_loadings * scaling_factor

# Step 3: Create an improved biplot
plt.figure(figsize=(12, 8))
plt.scatter(pc_scores[:, 0], pc_scores[:, 1], alpha=0.6, color='gray', edgecolor='k', label='Observations')

# Plot the scaled loadings (arrows) and variable names
for i, ((raw_x, raw_y), (x, y)) in enumerate(zip(raw_loadings, loadings)):
    # Display only variables with high loadings on at least one of the two PCs
    if np.abs(raw_x) > loading_threshold or np.abs(raw_y) > loading_threshold:
        plt.arrow(0, 0, x, y, color='red', alpha=0.7)
        plt.text(x * 1.15, y * 1.15, data2.columns[i], color='red', ha='center', va='center')

# Customize the plot
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Improved Biplot of the First Two Principal Components')
plt.axhline(0, color='black', linewidth=0.5)
plt.axvline(0, color='black', linewidth=0.5)
plt.grid()
plt.legend()
plt.show()


**Principal Component Score Plot**

The Principal Component (PC) Score Plot is a visualization that displays the distribution of observations in the reduced dimensional space defined by the first two principal components (PC1 and PC2). Each point in the plot represents an observation, with its coordinates determined by the scores for PC1 and PC2.

This plot provides insights into:

- Clustering: Identifying groups or patterns among observations.
- Outliers: Detecting observations that are distinctly separated from others.
- Relationships: Exploring the spread and orientation of observations along the principal components.
  
In this section, we create a scatter plot of the first two principal components to analyze the distribution and relationships among the observations in the reduced space. This visualization serves as a foundation for interpreting the PCA results and understanding the structure of the data. Plot different components as needed to better understand your data.

In [None]:
# Step 1: Split the score matrix into its two columns (PC1 and PC2)
pc1, pc2 = pc_scores[:, 0], pc_scores[:, 1]

# Step 2: Scatter the observations in the PC1/PC2 plane with zero reference lines
plt.figure(figsize=(10, 6))
plt.scatter(pc1, pc2, alpha=0.6, color='blue', edgecolor='k')
plt.title('PC Score Plot: PC1 vs PC2')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.axhline(0, color='black', linewidth=0.5)
plt.axvline(0, color='black', linewidth=0.5)
plt.grid()
plt.show()


All the previous steps—data preparation, PCA, visualization of principal components, and exploration of relationships among variables—were designed to deepen our understanding of the dataset's structure and uncover potential patterns. These analyses provided valuable insights into the variance in the data, the relationships among variables, and the distribution of observations in the reduced dimensional space. By identifying patterns, clusters, or groupings within the data, we are now better equipped to perform a focused and meaningful cluster analysis. The next step will leverage these findings to group similar observations into clusters, facilitating a more detailed interpretation of underlying patterns and enabling data-driven decision-making.

**NEXT IS THE CLUSTER ANALYSIS**

In [None]:
# Build one composite score per construct of the extended TPB.
# Each construct is the row-wise mean of the survey items assigned to it
# (the items identified from the PCA). Adjust the item names so that they
# match the columns of your own data.
construct_items = {
    'Behavior': [
        'Behavior1 Coded', 'Behavior2 Coded', 'Behavior3 Coded',
        'Behavior4 Coded', 'Behavior5 Coded', 'Behavior6 Coded',
        'Behavior7 Coded', 'Behavior8 Coded', 'Behavior9 Coded',
    ],
    'Attitude': [
        'Attitude3 Coded', 'Attitude4 Coded', 'Attitude5 Coded',
        'Attitude6 Coded', 'Attitude7 Coded', 'Attitude8 Coded',
        'Attitude9 Coded',
    ],
    'Social_Norms': ['SNorms2 Coded', 'SNorms3 Coded'],
    'PBC': ['PBC1 Coded', 'PBC2 Coded', 'PBC3 Coded', 'PBC4 Coded'],
    'Moral_Norms': ['Moral1 Coded', 'Moral2 Coded', 'Moral3 Coded', 'Moral4 Coded'],
    'Perceived_Risk': [
        'RiskP2 Coded', 'RiskP3 Coded', 'RiskP4 Coded',
        'RiskP5 Coded', 'RiskP6 Coded',
    ],
    'Trust': ['Trust2 Coded', 'Trust3 Coded'],
    'Climate_Perc': [
        'CCPerception1 Coded', 'CCPerception2 Coded', 'CCPerception4 Coded',
        'CCPerception5 Coded', 'CCPerception6 Coded', 'CCPerception7 Coded',
        'CCPerception8 Coded', 'CCPerception9 Coded', 'CCPerception11 Coded',
        'CCPerception12 Coded', 'CCPerception13 Coded',
    ],
}

# Add the composite columns to data2 in the order listed above
for construct, items in construct_items.items():
    data2[construct] = data2[items].mean(axis=1)

# Optional: keep only the composite columns if the individual items are no longer needed
#data2 = data2[['Behavior', 'Attitude', 'Social_Norms', 'PBC', 'Moral_Norms', 'Perceived_Risk', 'Trust', 'Climate_Perc']]

In [None]:
# Preview the first rows, including the newly added composite columns
print(data2.head())

In [None]:
# Save data2 (with the composite columns) to CSV; the index is written as the first column
data2.to_csv('survey with structured data2.csv', index=True)

In [None]:
# Check for any remaining NaNs
print(data2.isna().sum())

# Ensure all columns are numeric
print(data2.dtypes)

# Convert all columns to numeric if necessary. The result is assigned back to
# data2 (not to `data`), so the re-check below inspects the converted values
# and the original `data` DataFrame is left untouched.
data2 = data2.apply(pd.to_numeric, errors='coerce')

# Re-check if any NaN values remain after the conversion
print(data2.isna().sum())


In [None]:
# Remove every column that contains at least one missing value
data2 = data2.dropna(axis='columns')

In [None]:
# Next: clustering and visualization.
# The clustering uses every column currently in data2.

# Step 1: Clustering
kmeans = KMeans(n_clusters=4, random_state=42)

# Leave out any 'cluster' column from a previous run of this cell, so the old
# labels are never used as an input feature when the cell is executed again.
cluster_features = data2.drop(columns='cluster', errors='ignore')
data2['cluster'] = kmeans.fit_predict(cluster_features)

# Step 2: Calculate the mean of every column within each cluster
cluster_means = data2.groupby('cluster').mean()



In [None]:
# Verify the new 'cluster' column
print(data2.head())  # Check the first few rows to confirm the column was added
print(data2['cluster'].value_counts())  # Number of observations in each cluster

In [None]:
# Check cluster assignments: number of observations per cluster label
print(data2['cluster'].value_counts())


In [None]:
# Save data2, now including the 'cluster' column, to CSV; the index is written as the first column
data2.to_csv('survey with structured data and clusters.csv', index=True)

In [None]:
# Step 3: Radar charts of the construct means, one chart per cluster
categories = ['Attitude', 'Social_Norms', 'PBC', 'Moral_Norms', 'Perceived_Risk', 'Trust', 'Climate_Perc']
num_vars = len(categories)

# 2 x 2 grid of polar axes, one per cluster
fig, axs = plt.subplots(2, 2, figsize=(10, 10), subplot_kw=dict(polar=True))
fig.suptitle("Radar Charts for Each Cluster")

# One evenly spaced angle per category; the first angle is repeated at the
# end so the outline closes
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles.append(angles[0])

# Largest cluster mean over all categories, used as the shared radial limit
max_value = cluster_means[categories].max().max()

for i, ax in enumerate(axs.flat):
    # Mean of each category for this cluster, closed by repeating the first value
    values = cluster_means.loc[i, categories].tolist()
    values.append(values[0])

    # Start at the top of the circle and go clockwise
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per category
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)

    # Same radial scale on every chart
    ax.set_ylim(0, max_value)
    ax.set_yticks([1, 2, 3, 4, max_value])

    # Outline and shaded area
    ax.plot(angles, values, linestyle='solid', linewidth=2)
    ax.fill(angles, values, alpha=0.4)

    # Clusters are numbered from 1 in the titles
    ax.set_title(f"Cluster {i+1}")

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from math import pi

# Constructs shown on the spokes, and the descriptive name given to each cluster
categories = ['Attitude', 'Social_Norms', 'PBC', 'Moral_Norms', 'Perceived_Risk', 'Trust', 'Climate_Perc']
cluster_labels = ['Risk-Averse Adopters', 'Socially Influenced Adopters', 'Innovative Adopters', 'Traditionalists']
num_vars = len(categories)

# One evenly spaced angle per category; the first angle is repeated at the
# end so the outline closes
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles.append(angles[0])

# Fixed radial limit shared by all charts
max_value = 5

# 2 x 2 grid of polar axes, one per cluster
fig, axs = plt.subplots(2, 2, figsize=(12, 12), subplot_kw=dict(polar=True))
fig.suptitle("Radar Charts for Each Cluster (Standardized Scale)")

for i, cluster_label in enumerate(cluster_labels):
    # Mean of each category for this cluster, closed by repeating the first value
    values = cluster_means.loc[i, categories].tolist()
    values.append(values[0])

    # Pick the subplot for this cluster (row-major order)
    ax = axs[i // 2, i % 2]

    # Start at the top of the circle and go clockwise
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per category
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)

    # Same radial scale on every chart
    ax.set_ylim(0, max_value)
    ax.set_yticks([0, 1, 2, 3, 4, max_value])

    # Outline and shaded area
    ax.plot(angles, values, linestyle='solid', linewidth=2)
    ax.fill(angles, values, alpha=0.4)

    # Title: cluster number (from 1) followed by its descriptive name
    ax.set_title(f"Cluster {i+1}: {cluster_label}")

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


**TRYING SANKEY DIAGRAM**


A Sankey Diagram is a powerful visualization tool used to depict flows or connections between different categories or variables. The width of each flow is proportional to its value, making it easy to understand the relative importance or magnitude of connections. Sankey diagrams are particularly useful for visualizing relationships, such as transitions, distributions, or resource flows, across multiple stages or categories. In this section, we will create a Sankey diagram to represent the relationships in our data (demographic variables to our clusters), using the appropriate Python libraries (e.g., plotly or matplotlib). By constructing this diagram, we can gain valuable insights into how different categories interact, helping to identify key pathways or dominant flows within the dataset.


In [None]:
import pandas as pd
import plotly.graph_objects as go

In [None]:
# Load the data for the Sankey diagrams. This replaces the `data` DataFrame
# used earlier in the notebook. The cells below read the columns 'Age',
# 'Highest level of education', 'Political affiliation' and 'cluster'.
# NOTE(review): relative path - it is resolved against the working directory
# set with os.chdir(...) at the top of the notebook; confirm the placeholder.
data = pd.read_csv("path-to-your-data/SANKEY.csv")

In [None]:
# Code -> display label for each variable shown in the Sankey diagrams.
# Codes are consecutive integers starting at 0, in the order listed.
age_map = dict(enumerate(['Young', 'Middle-aged', 'Senior']))
education_map = dict(enumerate([
    'Prefer not to say',
    '12th Grade or Less',
    'High School Diploma',
    'Trade/Vocational',
    'Bachelor’s Degree',
    'Post Graduate',
]))
political_map = dict(enumerate([
    'Prefer not to say',
    'Democrat',
    'Republican',
    'Independent',
]))
cluster_map = dict(enumerate([
    'Risk-Averse',
    'Socially Influenced',
    'Conditional',
    'Traditionalists',
]))

In [None]:
# Categorize Age into groups (labelled with the same names used in age_map)
data['Age Group'] = pd.cut(data['Age'], bins=[0, 35, 60, 100], labels=['Young', 'Middle-aged', 'Senior'])

# Count the observations for every (category, cluster) pair of each variable
age_flow = data.groupby(['Age Group', 'cluster']).size().reset_index(name='count')
education_flow = data.groupby(['Highest level of education', 'cluster']).size().reset_index(name='count')
political_flow = data.groupby(['Political affiliation', 'cluster']).size().reset_index(name='count')

# Node labels, in node order: age groups, education levels, political
# affiliations, then clusters
labels = (list(age_map.values()) +
          list(education_map.values()) +
          list(political_map.values()) +
          list(cluster_map.values()))

# Node positions are looked up per variable (by age label, or by code for the
# other variables) rather than by label text alone. 'Prefer not to say' occurs
# in both the education and the political maps; a single label -> position
# dictionary would send both to the same node.
education_offset = len(age_map)
political_offset = education_offset + len(education_map)
cluster_offset = political_offset + len(political_map)

age_index = {label: i for i, label in enumerate(age_map.values())}
education_index = {code: education_offset + i for i, code in enumerate(education_map)}
political_index = {code: political_offset + i for i, code in enumerate(political_map)}
cluster_index = {code: cluster_offset + i for i, code in enumerate(cluster_map)}

# Initialize lists for sources, targets, and values
sources = []
targets = []
values = []

# Add one link per (category, cluster) pair: Age Group, Education Level and
# Political Affiliation, each flowing into the clusters
for flow, column, source_index in (
    (age_flow, 'Age Group', age_index),
    (education_flow, 'Highest level of education', education_index),
    (political_flow, 'Political affiliation', political_index),
):
    for source_key, cluster_code, count in zip(flow[column], flow['cluster'], flow['count']):
        sources.append(source_index[source_key])
        targets.append(cluster_index[cluster_code])
        values.append(int(count))

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values
    )
))

# Update layout and show the diagram
fig.update_layout(title_text="Sankey Diagram of Farmer Clusters by Age, Education, and Political Affiliation", font_size=10)
fig.show()

In [None]:
# Categorize Age into groups again, this time with numeric labels, so the
# 'Age Group' values can be used as keys of age_map in the same way as the
# other coded columns.
data['Age Group'] = pd.cut(data['Age'], bins=[0, 35, 60, 100], labels=[0, 1, 2])  # Use numeric labels (0, 1, 2)
age_map = {0: 'Young', 1: 'Middle-aged', 2: 'Senior'}


In [None]:
# Create separate Sankey diagrams for each variable

def create_sankey(source_column, source_map, title):
    """Show a Sankey diagram of flows from one coded variable to the clusters.

    Reads the module-level ``data`` DataFrame (which must contain
    ``source_column`` and a ``'cluster'`` column) and the module-level
    ``cluster_map`` dictionary.

    Args:
        source_column: Name of the column in ``data`` holding the source codes.
        source_map: Dictionary mapping each code found in ``source_column`` to
            its display label.
        title: Title shown above the diagram.

    Raises:
        KeyError: If ``data`` contains a source code that is not a key of
            ``source_map``, or a cluster code that is not a key of
            ``cluster_map``.
    """
    # Count the observations for every (source code, cluster) pair
    flow_data = data.groupby([source_column, 'cluster']).size().reset_index(name='count')

    # Node order: source labels first, then cluster labels
    labels = list(source_map.values()) + list(cluster_map.values())

    # Resolve node positions by code instead of by label text, so two
    # categories that happen to share a label still get separate nodes.
    source_index = {code: i for i, code in enumerate(source_map)}
    cluster_index = {code: len(source_map) + i for i, code in enumerate(cluster_map)}

    # One link per (source code, cluster) pair
    sources = []
    targets = []
    values = []
    for source_code, cluster_code, count in zip(
        flow_data[source_column], flow_data['cluster'], flow_data['count']
    ):
        sources.append(source_index[source_code])
        targets.append(cluster_index[cluster_code])
        values.append(int(count))

    # Create the Sankey diagram
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    ))

    # Update layout and show the diagram
    fig.update_layout(title_text=title, font_size=10)
    fig.show()

# Draw one Sankey diagram per variable, in this order:
# age, political affiliation, education
for column, code_map, diagram_title in [
    ('Age Group', age_map, "Sankey Diagram: Age vs. Farmer Clusters"),
    ('Political affiliation', political_map, "Sankey Diagram: Political Affiliation vs. Farmer Clusters"),
    ('Highest level of education', education_map, "Sankey Diagram: Education vs. Farmer Clusters"),
]:
    create_sankey(column, code_map, diagram_title)

In [None]:


# Group the data based on Age, Education, and Political Affiliation with clusters
age_flow = data.groupby(['Age Group', 'cluster']).size().reset_index(name='count')
education_flow = data.groupby(['Highest level of education', 'cluster']).size().reset_index(name='count')
political_flow = data.groupby(['Political affiliation', 'cluster']).size().reset_index(name='count')

# Define source, target, and value lists for the Sankey diagram using coded values
sources = []
targets = []
values = []

# The coded values are used as node positions, but each variable gets its own
# block of positions. Without the per-variable offset, code 1 of age, code 1
# of education and code 1 of political affiliation would all be the same node.
# This cell needs the numeric 'Age Group' labels (0 = Young, 1 = Middle-aged,
# 2 = Senior) created in the cell above.
link_clusters = []
node_offset = 0
for column, flow in (
    ('Age Group', age_flow),
    ('Highest level of education', education_flow),
    ('Political affiliation', political_flow),
):
    if flow.empty:
        continue  # Nothing to draw for this variable
    codes = flow[column].astype(int)
    sources.extend([node_offset + int(code) for code in codes])
    link_clusters.extend([int(cluster) for cluster in flow['cluster']])
    values.extend([int(count) for count in flow['count']])
    # Reserve positions 0..max code for this variable before moving on
    node_offset += int(codes.max()) + 1

# Cluster nodes come after all source nodes, so they cannot overlap with them
targets = [node_offset + cluster for cluster in link_clusters]

# Create the Sankey diagram without explicit labels
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        # Empty labels list
        label=[]
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values
    )
))

fig.update_layout(title_text="Sankey Diagram Without Explicit Labels", font_size=10)
fig.show()
