In [53]:
import plotly.express as px
from scipy.stats import mannwhitneyu
import pandas as pd

from google.colab import drive
# Mount Google Drive under /content/drive; the survey CSV is read from there below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Exploratory Data Analysis

In [54]:
# Folder on the mounted Drive that holds the survey data. The trailing slash is
# required: the CSV file name is appended to this string directly.
data_dir = '/content/drive/My Drive/Codigo/finnish_survey/'

In [55]:
# Load the survey responses: fields are separated by ';', decimals are written
# with ',', and a cell holding a single space is read as a missing value.
df = pd.read_csv(
    data_dir + 'growth_Survey_data.csv',
    sep=';',
    decimal=',',
    na_values=' ',
)

In [56]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

Unnamed: 0,Growth_Firm,question_2_row_1_transformed,question_2_row_2_transformed,question_3_row_1,question_3_row_2,question_3_row_3,question_3_row_4,question_3_row_5,question_3_row_6,question_3_row_7,...,question_5_row_4,question_5_row_5,question_5_row_6,question_5_row_7,question_5_row_8,question_5_row_9,question_5_row_10,question_6_row_1,question_6_row_2,question_7_row_1
0,0,35.135135,50.750939,4,5,5,4,3,3,4,...,4,2,4,2,3,2.0,5.0,4,5,1
1,0,23.018043,51.1822,5,4,4,4,4,4,4,...,3,4,3,3,3,4.0,3.0,5,4,1
2,0,86.640472,62.932639,3,4,4,4,4,3,4,...,5,4,4,4,4,,,5,3,1
3,0,17.647059,39.130435,3,4,5,4,4,4,5,...,3,3,4,4,4,3.0,3.0,3,3,1
4,0,60.0,32.802125,4,4,4,4,3,4,4,...,4,2,4,2,3,3.0,4.0,5,2,2


In [57]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 36 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Growth_Firm                   120 non-null    int64  
 1   question_2_row_1_transformed  120 non-null    float64
 2   question_2_row_2_transformed  120 non-null    float64
 3   question_3_row_1              120 non-null    int64  
 4   question_3_row_2              120 non-null    int64  
 5   question_3_row_3              120 non-null    int64  
 6   question_3_row_4              120 non-null    int64  
 7   question_3_row_5              120 non-null    int64  
 8   question_3_row_6              120 non-null    int64  
 9   question_3_row_7              120 non-null    int64  
 10  question_3_row_8              120 non-null    int64  
 11  question_3_row_9              120 non-null    int64  
 12  question_3_row_10             120 non-null    int64  
 13  quest

In [58]:
# Total number of missing cells across the whole dataframe.
df.isnull().sum().sum()

74

In [59]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,Growth_Firm,question_2_row_1_transformed,question_2_row_2_transformed,question_3_row_1,question_3_row_2,question_3_row_3,question_3_row_4,question_3_row_5,question_3_row_6,question_3_row_7,...,question_5_row_4,question_5_row_5,question_5_row_6,question_5_row_7,question_5_row_8,question_5_row_9,question_5_row_10,question_6_row_1,question_6_row_2,question_7_row_1
count,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,...,120.0,120.0,120.0,120.0,120.0,83.0,83.0,120.0,120.0,120.0
mean,0.516667,203.334625,538.350618,3.991667,4.366667,4.308333,4.275,3.908333,4.075,4.1,...,3.966667,3.566667,4.0,3.9,3.616667,3.819277,4.060241,3.783333,3.508333,1.55
std,0.501817,244.200209,1639.814434,0.772379,0.672801,0.683489,0.721372,0.721761,0.757733,0.690877,...,0.839501,0.976474,0.879266,0.834145,0.94543,0.885446,0.901887,0.997335,1.028957,0.49958
min,0.0,-57.627119,-78.35654,2.0,2.0,2.0,1.0,2.0,2.0,2.0,...,2.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0
25%,0.0,50.0,68.312465,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,3.75,3.0,3.75,3.0,3.0,3.0,4.0,3.0,3.0,1.0
50%,1.0,118.98977,166.300042,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0
75%,1.0,267.066895,348.028183,4.0,5.0,5.0,5.0,4.0,5.0,5.0,...,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0,4.0,2.0
max,1.0,1251.351351,16103.052331,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0


## Preprocessing

In [60]:
# Maps each dataframe column name to the wording of the survey item it holds,
# so plots and tables can show the question text instead of the raw column name.
# The mapping was generated with ChatGPT (per the original author's note).

questions_dict = {
    "Growth_Firm": "Company classification to either 'Growth' or 'Non-Growth' company",
    "question_2_row_1_transformed": "Expected employee count in five years (as a percent from last available year)",
    "question_2_row_2_transformed": "Expected revenue in five years (as a percent from last available year)",
    "question_3_row_1": "Employees are encouraged to be creative",
    "question_3_row_2": "Managers are expected to be creative problem solvers",
    "question_3_row_3": "Employees' ability to function creatively is respected",
    "question_3_row_4": "We are constantly looking for ways to develop and offer new or improved products and services",
    "question_3_row_5": "Assistance in developing new ideas is readily available",
    "question_3_row_6": "Our organization is open and responsive to changes",
    "question_3_row_7": "Managers here are always searching for fresh, new ways of looking at problems",
    "question_3_row_8": "Our organization has a clear and inspiring set of future goals",
    "question_3_row_9": "We have ensured that all managers and employees share the same vision of the future",
    "question_3_row_10": "All departments and employees share a clear vision of the future",
    "question_3_row_11": "We believe that higher risks are worth taking for high payoff",
    "question_3_row_12": "We encourage innovative initiatives, knowing well that some will fail",
    "question_3_row_13": "We do not like to 'play it safe'",
    "question_3_row_14": "Managers are constantly seeking new opportunities for the organization",
    "question_3_row_15": "Managers take the initiative in an effort to shape the environment to the organization’s advantage",
    "question_3_row_16": "Managers usually take the initiative by introducing new administrative techniques",
    "question_4_row_1": "Our company's top management frequently has discussions on renewal, innovation, and growth with managers from other companies",
    "question_4_row_2": "We have learned important new information on markets, technologies, and administration from interactions with managers from other companies",
    "question_4_row_3": "Interactions with managers from other companies have helped us build our capabilities and skills",
    "question_4_row_4": "Active discussions with managers from other companies have generated collaborations leading to new innovations",
    "question_5_row_1": "We commercialize products and services that challenge our previous products/services",
    "question_5_row_2": "We develop and commercialize products and services that are completely new",
    "question_5_row_3": "We frequently utilize new opportunities in new markets",
    "question_5_row_4": "We develop our business model to stand out from our competitors",
    "question_5_row_5": "We use experiments to identify and evaluate new business opportunities",
    "question_5_row_6": "We frequently make small adjustments to our existing products and services",
    "question_5_row_7": "We improve the efficiency of our products/services",
    "question_5_row_8": "We increase economies of scales in existing markets",
    "question_5_row_9": "We introduce improved versions of existing products and services for our local market",
    "question_5_row_10": "Our organization expands our offering for existing clients",
    "question_6_row_1": "The market we operate in is undergoing intense changes",
    "question_6_row_2": "Our clients regularly ask for new products and services",
    "question_7_row_1": "Has the COVID-19 pandemic had a significant impact on your firm's actions related to the topics mentioned above during the previous year"
}


## Data Analysis

In [61]:
# Distribution of the expected five-year employee-count change, drawn as one
# histogram row per Growth_Firm value.
px.histogram(
    data_frame=df,
    facet_row='Growth_Firm',
    x='question_2_row_1_transformed',
    labels={
        'question_2_row_1_transformed': questions_dict['question_2_row_1_transformed'],
    },
)

In [62]:
# Compare the expected employee-count change of growth firms (Growth_Firm == 1)
# with that of non-growth firms (Growth_Firm == 0) using a Mann-Whitney U test.
df_growth = df.loc[df['Growth_Firm'].eq(1)]
df_no_growth = df.loc[df['Growth_Firm'].eq(0)]
mannwhitneyu(
    x=df_growth['question_2_row_1_transformed'],
    y=df_no_growth['question_2_row_1_transformed'],
)

# The p-value is below 0.05, so the difference between the two samples is
# statistically significant.

MannwhitneyuResult(statistic=2297.0, pvalue=0.008843587899143496)

In [63]:
# Distribution of the expected five-year revenue change across all firms.
px.histogram(
    data_frame=df,
    labels={'question_2_row_2_transformed': 'Expected revenue in 5 years'},
    x='question_2_row_2_transformed',
)

In [64]:
# Scatter of expected employee growth (y) against expected revenue growth (x),
# colored by the answer to question_3_row_1 ("Employees are encouraged to be creative").
# The answers are cast to str so Plotly treats them as discrete categories
# instead of a continuous color scale.
creativity_answers = df['question_3_row_1'].astype(str)

px.scatter(
    df,
    x= 'question_2_row_2_transformed',
    y= 'question_2_row_1_transformed',
    color= creativity_answers,
    labels= {
        'question_2_row_2_transformed': 'Revenue growth',
        'question_2_row_1_transformed': 'Employee growth',
        'color': 'Employee creativity expectations'
    },
    color_discrete_sequence=["#FF0000", "#FFA500", "#006400", "#0000FF"],
    # Without an explicit order Plotly hands out colors and legend entries in
    # the order the answers first appear in the data. Sorting the levels makes
    # the legend ascending and the color-to-answer mapping independent of row order.
    category_orders={'color': sorted(creativity_answers.unique())},
)

In [65]:
# prompt: I would like to extract the numbers from each column title of the
# dataframe in a list of tuples

import re

def extract_numbers_from_columns(df):
  """
  Collect the integers embedded in the column names of a DataFrame.

  The first three columns are skipped. Each remaining column whose name
  contains at least one run of digits contributes one tuple holding those
  runs converted to int, in the order they occur in the name. Column names
  without any digits are left out.

  Returns a list of tuples of ints, in column order.
  """
  digit_runs_per_column = (re.findall(r'\d+', name) for name in df.columns[3:])
  return [tuple(int(run) for run in runs) for runs in digit_runs_per_column if runs]

# Build the (question number, row number) pairs for every column after the first three.
questions_list = extract_numbers_from_columns(df)
questions_list

[(3, 1),
 (3, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (3, 10),
 (3, 11),
 (3, 12),
 (3, 13),
 (3, 14),
 (3, 15),
 (3, 16),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 4),
 (5, 1),
 (5, 2),
 (5, 3),
 (5, 4),
 (5, 5),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (5, 10),
 (6, 1),
 (6, 2),
 (7, 1)]

In [74]:
# Plot the answer distribution of one randomly chosen survey question

import random

# random.choice picks uniformly among the existing entries. The previous
# random.randint(0, len(questions_list)) includes its upper bound, so it could
# produce an index one past the end of the list and raise IndexError.
n_question, n_row = random.choice(questions_list)
column_name = f'question_{n_question}_row_{n_row}'

# Translate the numeric answer codes into their labels.
# NOTE(review): question_7_row_1 only takes the values 1 and 2 in df.describe(),
# so this five-point agreement scale probably does not apply to it; confirm
# its coding before trusting the labels on that plot.
value_mapping = {
    1: 'Strongly Disagree',
    2: 'Disagree',
    3: 'Neither',
    4: 'Agree',
    5: 'Strongly Agree'
}
new_col_values = list(value_mapping.values())
# An ordered categorical carries the full scale in order, including options
# nobody selected.
column_answers = pd.Categorical(df[column_name].map(value_mapping),
                                new_col_values, ordered=True)

fig = px.histogram(
    x= column_answers,
    title= f'Question: "{questions_dict[column_name]}"',
    labels= {
        'x': 'Selected option'
    },
    color_discrete_sequence=["#70A500"],
    category_orders= {'x': new_col_values}
)

# Add spacing between bars and center the title
fig.update_layout(bargap=0.2, title_x=0.5)
fig.show()

In [91]:
# Find the 5 survey items with the highest correlation with expected revenue growth
revenue_col = 'question_2_row_2_transformed'
employee_col = 'question_2_row_1_transformed'

# Correlation of every column with expected revenue growth, indexed by question
# text. rename() returns a new Series, so re-running the cell gives the same result.
growth_correlation_series = df.corr()[revenue_col].rename(questions_dict)

# Remove the target itself and the expected-employee-growth column by label
# instead of assuming they sort into the first two positions, then keep the top 5.
(
    growth_correlation_series
    .drop([questions_dict[revenue_col], questions_dict[employee_col]])
    .sort_values(ascending=False)
    .head(5)
)

Unnamed: 0,question_2_row_2_transformed
We frequently utilize new opportunities in new markets,0.242992
We develop our business model to stand out from our competitors,0.21091
We develop and commercialize products and services that are completely new,0.210686
We introduce improved versions of existing products and services for our local market,0.200296
We commercialize products and services that challenge our previous products/services,0.197801
