In [8]:
import pandas as pd
import plotly.express as px

# Load the raw survey export.
# Sheets are addressed by zero-based position, so 2 selects the third worksheet.
sheet_name = 2
file_path = '../../../data/Incredibuild/Customer sales data/June 2023 Incredibuild Customer Survey All except China and Japan Results v1.xlsx'
survey_data = pd.read_excel(io=file_path, sheet_name=sheet_name)

# Preview the first rows; the real header text sits in row 0 of the data,
# which is why the parsed column labels are mostly "Unnamed: N".
survey_data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,14,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77
0,Respondent ID,Collector ID,Start Date,End Date,IP Address,Account 18 dig.,Account Name,Email Address,First Name,Last Name,...,Youremail,Account ARR,Industry,Country,Region,Tier,CSM,AE,IB10 Status,Account Age
1,,,,,,,,,,,...,Open-Ended Response,,,,,,,,,
2,118339270419,451177479,2023-06-07 20:15:38,2023-06-07 20:23:43,109.129.242.88,0012000000AgoWAAAZ,IMI Hydronic Enginnering,awa.ndaw@imi-hydronic.com,,,...,,416.4,Embedded Systems / Hardware,Belgium,EMEA,Tier 4,Hofit Shelly,Yaniv Gabay,Fully Converted,180
3,118339034703,451177479,2023-06-07 16:17:18,2023-06-07 16:17:26,37.76.35.95,0016900002tKM68AAG,Invictus Games,viktor.szilagyi@invictus.com,,,...,,9738,Game Development,Hungary,EMEA,Tier 4,Jacquelyn Cohen,Lothar Stuck,Pending Conversion,194
4,118338836935,451177479,2023-06-07 11:10:26,2023-06-07 11:14:47,141.135.121.126,0012000000FEM1NAAX,ICOS Vision Systems NV,kevin.cypers@kla.com,,,...,,8191.04,Embedded Systems / Hardware,Belgium,EMEA,Tier 4,Natalia Garcia,Yaniv Gabay,Partially,175


In [9]:
# Data Cleaning
# Row 0 of the raw sheet holds the real field/question names and row 1 holds a
# secondary label row (e.g. "Open-Ended Response" in the preview), so neither
# is a survey response.
column_names = survey_data.iloc[0]  # Use the first row as column names
survey_data_cleaned = survey_data.drop([0, 1])  # Drop first two rows

# Assign plain labels rather than the Series itself: a Series would pass its
# own name (the row label 0) on to the column Index, and that stray "0" then
# shows up as a header in every summary derived from the columns.
survey_data_cleaned.columns = column_names.tolist()

# Remove '0' entries in Tier. The explicit .copy() makes the result an
# independent frame, so later cells can add or overwrite columns without
# triggering SettingWithCopyWarning.
# NOTE(review): only the string '0' is removed; a numeric 0 would be kept.
# Confirm how the Tier placeholder is actually stored in the sheet.
survey_data_cleaned = survey_data_cleaned[survey_data_cleaned['Tier'] != '0'].copy()

# EDA: how many respondents fall into each Tier
tier_distribution = survey_data_cleaned['Tier'].value_counts()

# The index (tier labels) and the counts are passed as arrays; the labels
# mapping renames the auto-generated axis titles.
tier_distribution_fig = px.bar(
    tier_distribution,
    x=tier_distribution.index,
    y=tier_distribution.values,
    title='Distribution of Clients Across Tiers',
    labels=dict(y='Number of Clients', index='Tier'),
)
tier_distribution_fig.show()

def _tier_histogram(frame, column, title, horizontal=True):
    """Build a grouped count histogram of ``column`` with one bar group per Tier.

    Parameters
    ----------
    frame : DataFrame holding ``column`` and a 'Tier' column.
    column : name of the column whose values are counted.
    title : figure title.
    horizontal : when True the categories go on the y axis (horizontal bars);
        when False they go on the x axis.

    Returns
    -------
    The plotly figure produced by ``px.histogram`` (not yet shown).
    """
    category_axis = 'y' if horizontal else 'x'
    return px.histogram(
        frame,
        color="Tier",
        barmode='group',
        title=title,
        **{category_axis: column},
    )


# EDA: Relationship between Tier and Industry
industry_tier_fig = _tier_histogram(
    survey_data_cleaned, "Industry", 'Tier Distribution by Industry'
)
industry_tier_fig.show()

# EDA: Relationship between Tier and Country
country_tier_fig = _tier_histogram(
    survey_data_cleaned, "Country", 'Tier Distribution by Country'
)
country_tier_fig.show()

# EDA: Relationship between Tier and Region (categories on the x axis)
region_tier_fig = _tier_histogram(
    survey_data_cleaned, "Region", 'Tier Distribution by Region', horizontal=False
)
region_tier_fig.show()

# Additional exploration: 'Account ARR' and 'Account Age'
# Every column was read as generic objects because the sheet's header rows sat
# inside the data, so these two are converted to numbers before plotting.
# errors='coerce' turns anything unparseable into NaN instead of raising.
numeric_columns = ['Account ARR', 'Account Age']

# assign() returns a new frame instead of writing into the existing one, which
# avoids SettingWithCopyWarning when survey_data_cleaned came from a boolean
# filter, and keeps this cell safe to re-run.
survey_data_cleaned = survey_data_cleaned.assign(
    **{
        column: pd.to_numeric(survey_data_cleaned[column], errors='coerce')
        for column in numeric_columns
    }
)

account_arr_fig = px.box(survey_data_cleaned, y='Account ARR', title='Distribution of Account ARR')
account_arr_fig.show()

account_age_fig = px.box(survey_data_cleaned, y='Account Age', title='Distribution of Account Age')
account_age_fig.show()

In [14]:
# Clean up column names: header cells that were empty in the sheet arrive as
# NaN labels, so give them a recognisable placeholder name first.
survey_data_cleaned = survey_data_cleaned.rename(columns=lambda x: 'Unnamed' if pd.isna(x) else x)

# Remove the placeholder columns.
# na=False: a non-string label (number, timestamp, ...) makes str.contains
#   return NaN, and negating that with ~ raises TypeError; treat such labels
#   as "not a placeholder" and keep the column.
# regex=False: 'Unnamed' is matched as a literal substring, not a pattern.
is_placeholder = survey_data_cleaned.columns.str.contains('Unnamed', regex=False, na=False)
survey_data_cleaned = survey_data_cleaned.loc[:, ~is_placeholder]

# Identify the sparse columns (those with at least one missing value) and
# split them into categorical vs. free-text by their number of distinct values.
missing_values = survey_data_cleaned.isna().sum()
sparse_columns = missing_values[missing_values.gt(0)].index.tolist()

categorical_columns_info = []
free_text_columns_info = []

for col in sparse_columns:
    unique_values = survey_data_cleaned[col].nunique()
    # Up to 20 distinct answers is treated as categorical; anything above as free text.
    is_categorical = unique_values <= 20
    record = {
        'Column': col,
        'Total Missing': missing_values[col],
        'Unique Values': unique_values,
        'Type': 'Categorical' if is_categorical else 'Free Text',
    }
    target = categorical_columns_info if is_categorical else free_text_columns_info
    target.append(record)

# Tabular views of the two groups, for display only
categorical_columns_df = pd.DataFrame(categorical_columns_info)
free_text_columns_df = pd.DataFrame(free_text_columns_info)

print("\nCategorical Columns:\n", categorical_columns_df)
print("\nFree Text Columns:\n", free_text_columns_df)



Categorical Columns:
                                                Column  Total Missing  \
0                                          First Name            127   
1                                           Last Name            127   
2                                       Custom Data 1            127   
3   We're sorry to learn your experience could hav...            111   
4             How satisfied are you with our service?             50   
5   How satisfied are you with the current level o...             48   
6   What other workloads with long build times doe...             92   
7   Which of the following Unreal Engine technolog...             96   
8                What is your game's target platform?            102   
9   Would you consider using a fully managed accel...             51   
10  Why would you not consider a SaaS service for ...            122   
11     Are you using build caching in your production             51   
12  Anything else you want to add? (e.g. 

In [16]:
# Number of distinct non-null values in every column
unique_values_count = survey_data_cleaned.nunique()

# Keep the columns that actually vary (more than one distinct value);
# these are the candidates for categorical analysis.
varying_counts = unique_values_count.loc[unique_values_count.gt(1)]
categorical_columns = varying_counts.index.tolist()

# Summary table: one row per retained column with its distinct-value count.
# NOTE: this rebinds `categorical_columns_info`, which an earlier cell bound to
# a list of dicts; from here on the name refers to a DataFrame.
categorical_columns_info = pd.DataFrame(
    {
        'Column': categorical_columns,
        'Unique Values': unique_values_count.loc[categorical_columns],
    }
)

# Show the summary as the cell's rich output
categorical_columns_info


Unnamed: 0_level_0,Column,Unique Values
0,Unnamed: 1_level_1,Unnamed: 2_level_1
Respondent ID,Respondent ID,127
Start Date,Start Date,127
End Date,End Date,127
IP Address,IP Address,119
Account 18 dig.,Account 18 dig.,103
Account Name,Account Name,103
Email Address,Email Address,127
How likely is it that you would recommend Incredibuild to a friend or colleague?,How likely is it that you would recommend Incr...,11
We're sorry to learn your experience could have been better. Please help us improve by sharing some details:,We're sorry to learn your experience could hav...,16
How satisfied are you with our service?,How satisfied are you with our service?,4


In [15]:
import plotly.graph_objects as go

# Bar-chart helper for a single categorical column
def plot_categorical_column(data, column, title):
    """Display a bar chart of the value frequencies of one column.

    Parameters
    ----------
    data : DataFrame containing ``column``.
    column : label of the column to summarise; also used as the x-axis title.
    title : title shown above the chart.

    The figure is shown immediately and nothing is returned. Frequencies come
    from ``value_counts()``, which by default leaves missing values out.
    """
    counts = data[column].value_counts()
    bars = go.Bar(x=counts.index, y=counts.values)
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title=title,
        xaxis_title=column,
        yaxis_title='Count',
    )
    fig.show()

# Visualizing the categorical columns (excluding those with 0 unique values)

# `categorical_columns_info` is bound twice in this notebook: first to a list
# of dicts, later to a DataFrame with the same 'Column' / 'Unique Values'
# fields. Iterating a DataFrame directly yields its column *names*, so
# col_info['Column'] would raise TypeError; normalise both shapes to a list of
# per-column records first.
if isinstance(categorical_columns_info, pd.DataFrame):
    column_records = categorical_columns_info.to_dict('records')
else:
    column_records = list(categorical_columns_info)

# Same threshold the sparse-column analysis uses to separate categorical from
# free-text columns. It keeps identifier-like columns (respondent IDs, e-mail
# addresses, ...) out of the plots when the DataFrame form is used.
MAX_PLOT_CATEGORIES = 20

for col_info in column_records:
    col_name = col_info['Column']
    unique_values = col_info['Unique Values']
    # Skip columns with no values at all and columns with too many distinct values
    if 0 < unique_values <= MAX_PLOT_CATEGORIES:
        plot_title = f'Distribution of {col_name}'
        plot_categorical_column(survey_data_cleaned, col_name, plot_title)

# Note: Visualization of free text columns is excluded due to their complexity