# Modeling

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



We start by modeling without the data for Turkey. First, we load the prepared dataset into a DataFrame and add to it all the variables defined in the previous steps.

In [2]:
# Workbook with the prepared data set (the "no_TR" file)
path = os.path.join("FINAL_no_TR.xlsx")

# Load the workbook into a DataFrame and preview its top rows
df = pd.read_excel(io=path)
df.head(5)

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),GDP (MIO_EUR),GVA (MIO_EUR),POP (THS)
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,24942.455371,23931.324019,586.6
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,25582.147076,24614.963819,595.84
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,28422.11381,27311.621095,604.17
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,32557.042431,31336.257699,614.88
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,33443.246005,32261.32085,622.82


Add the weighted programming activity metric:

In [3]:
# Weight given to each kind of contribution in the activity metric
weights = {
    'questions': 0.4,
    'answers': 0.4,
    'comments': 0.1,
    'upvotes': 0.05,
    'downvotes': 0.05,
}

# Count column each weight applies to, listed in the order the terms are added
count_columns = {
    'questions': 'questioncount',
    'answers': 'answercount',
    'comments': 'commentcount',
    'upvotes': 'upvotecount',
    'downvotes': 'downvotecount',
}

# Accumulate the weighted sum one term at a time
weighted_activity = None
for kind, column in count_columns.items():
    term = df[column] * weights[kind]
    weighted_activity = term if weighted_activity is None else weighted_activity + term

df['activity'] = weighted_activity
df.head()

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),GDP (MIO_EUR),GVA (MIO_EUR),POP (THS),activity
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,24942.455371,23931.324019,586.6,104.4
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,25582.147076,24614.963819,595.84,1613.75
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,28422.11381,27311.621095,604.17,3692.1
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,32557.042431,31336.257699,614.88,5460.9
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,33443.246005,32261.32085,622.82,7074.5


Add the activity per capita:

In [4]:
# Activity divided by the population column scaled by 1000
# (the column label reads "THS", presumably thousands of inhabitants)
df['activity_per_cap'] = df['activity'].div(df['POP (THS)'].mul(1000))
df.head()

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),GDP (MIO_EUR),GVA (MIO_EUR),POP (THS),activity,activity_per_cap
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,24942.455371,23931.324019,586.6,104.4,0.000178
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,25582.147076,24614.963819,595.84,1613.75,0.002708
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,28422.11381,27311.621095,604.17,3692.1,0.006111
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,32557.042431,31336.257699,614.88,5460.9,0.008881
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,33443.246005,32261.32085,622.82,7074.5,0.011359


Add the compound annual growth rate (CAGR) of the activity and of the activity per capita:

In [5]:
import math

# Function to calculate the CAGR
def calculate_cagr(end_value, start_value, num_years):
    """Return the compound annual growth rate between two values.

    The rate is ``(end_value / start_value) ** (1 / num_years) - 1``.

    Parameters
    ----------
    end_value : number
        Value at the end of the period.
    start_value : number
        Value at the start of the period.
    num_years : number
        Number of yearly periods between the two values.

    Returns
    -------
    float or None
        The growth rate, or ``None`` when it is undefined: a zero
        ``start_value``, a zero ``num_years``, or a negative growth ratio.
    """
    # Both would otherwise end in a division by zero
    if start_value == 0 or num_years == 0:
        return None

    growth_ratio = end_value / start_value

    # A negative base with a fractional exponent has no real root; plain
    # Python floats would silently produce a complex number here
    if growth_ratio < 0:
        return None

    return growth_ratio ** (1 / num_years) - 1

# Apply to all the NUTS3 regions: CAGR of the weighted activity from the first
# year with non-zero activity (2008 at the earliest) up to 2020
first_year, last_year = 2008, 2020

for region in df['nuts3_name'].unique():
    # Get the data for the region
    region_mask = df.nuts3_name == region
    region_data = df[region_mask]

    # Year -> activity lookup, keeping the first row found for each year
    activity_by_year = (
        region_data.drop_duplicates('year')
                   .set_index('year')['activity']
                   .sort_index()
    )

    # Baseline candidates: years before the end year with non-zero activity
    # (a zero baseline leaves the growth ratio undefined, so it is skipped)
    years = activity_by_year.index
    candidates = activity_by_year[(years >= first_year) & (years < last_year)]
    candidates = candidates[candidates != 0]

    if last_year in activity_by_year.index and not candidates.empty:
        start_value = candidates.iloc[0]
        end_value = activity_by_year.loc[last_year]
        # Count the years from the baseline actually used, not always from
        # 2008; otherwise a later baseline is annualised over too many years
        num_years = last_year - candidates.index[0]

        # Calculate CAGR
        cagr = calculate_cagr(end_value, start_value, num_years)
    else:
        # No usable baseline or no end-year row: the rate is undefined
        cagr = np.nan

    # Update DataFrame
    df.loc[region_mask, 'CAGR'] = cagr

df.head()

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),GDP (MIO_EUR),GVA (MIO_EUR),POP (THS),activity,activity_per_cap,CAGR
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,24942.455371,23931.324019,586.6,104.4,0.000178,0.384454
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,25582.147076,24614.963819,595.84,1613.75,0.002708,0.384454
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,28422.11381,27311.621095,604.17,3692.1,0.006111,0.384454
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,32557.042431,31336.257699,614.88,5460.9,0.008881,0.384454
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,33443.246005,32261.32085,622.82,7074.5,0.011359,0.384454


In [6]:
# Apply to all the NUTS3 regions: CAGR of the activity per capita from the
# first year with a non-zero value (2008 at the earliest) up to 2020
first_year, last_year = 2008, 2020

for region in df['nuts3_name'].unique():
    # Get the data for the region
    region_mask = df.nuts3_name == region
    region_data = df[region_mask]

    # Year -> activity-per-capita lookup, keeping the first row found for each year
    per_cap_by_year = (
        region_data.drop_duplicates('year')
                   .set_index('year')['activity_per_cap']
                   .sort_index()
    )

    # Baseline candidates: years before the end year with a non-zero value
    # (a zero baseline leaves the growth ratio undefined, so it is skipped)
    years = per_cap_by_year.index
    candidates = per_cap_by_year[(years >= first_year) & (years < last_year)]
    candidates = candidates[candidates != 0]

    if last_year in per_cap_by_year.index and not candidates.empty:
        start_value = candidates.iloc[0]
        end_value = per_cap_by_year.loc[last_year]
        # Count the years from the baseline actually used, not always from
        # 2008; otherwise a later baseline is annualised over too many years
        num_years = last_year - candidates.index[0]

        # Calculate CAGR
        cagr = calculate_cagr(end_value, start_value, num_years)
    else:
        # No usable baseline or no end-year row: the rate is undefined
        cagr = np.nan

    # Update DataFrame
    df.loc[region_mask, 'CAGR_per_cap'] = cagr

df.head()

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),GDP (MIO_EUR),GVA (MIO_EUR),POP (THS),activity,activity_per_cap,CAGR,CAGR_per_cap
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,24942.455371,23931.324019,586.6,104.4,0.000178,0.384454,0.365857
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,25582.147076,24614.963819,595.84,1613.75,0.002708,0.384454,0.365857
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,28422.11381,27311.621095,604.17,3692.1,0.006111,0.384454,0.365857
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,32557.042431,31336.257699,614.88,5460.9,0.008881,0.384454,0.365857
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,33443.246005,32261.32085,622.82,7074.5,0.011359,0.384454,0.365857


Add the estimated number of programmers:

In [7]:
# Estimate the number of programmers in each region from the activity counts,
# following the assumption made for each group
weeks_in_year = 52
months_in_year = 12

# Experts: answers divided by twice the number of weeks in a year
df["experts"] = df["answercount"].div(weeks_in_year * 2)

# Innovators: questions divided by the number of months in a year
df["innovators"] = df["questioncount"].div(months_in_year)

# Late adopters: votes and comments together, divided by 15 times the number of months
reactions = df["upvotecount"] + df["downvotecount"] + df["commentcount"]
df["late_adopters"] = reactions.div(months_in_year * 15)

# All developers: the three groups added up
df["programmers_total"] = df["experts"] + df["innovators"] + df["late_adopters"]

df.head()

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),...,GVA (MIO_EUR),POP (THS),activity,activity_per_cap,CAGR,CAGR_per_cap,experts,innovators,late_adopters,programmers_total
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,...,23931.324019,586.6,104.4,0.000178,0.384454,0.365857,1.605769,7.166667,0.177778,8.950214
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,...,24614.963819,595.84,1613.75,0.002708,0.384454,0.365857,22.028846,30.833333,42.805556,95.667735
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,...,27311.621095,604.17,3692.1,0.006111,0.384454,0.365857,39.163462,138.166667,114.083333,291.413462
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,...,31336.257699,614.88,5460.9,0.008881,0.384454,0.365857,58.067308,156.333333,179.638889,394.03953
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,...,32261.32085,622.82,7074.5,0.011359,0.384454,0.365857,60.192308,268.833333,261.605556,590.631197


Add the programmer density:

In [8]:
# Estimated programmers divided by the population column scaled by 1000
df['programmer_density'] = df["programmers_total"].div(df['POP (THS)'].mul(1000))
df.head()

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),...,POP (THS),activity,activity_per_cap,CAGR,CAGR_per_cap,experts,innovators,late_adopters,programmers_total,programmer_density
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,...,586.6,104.4,0.000178,0.384454,0.365857,1.605769,7.166667,0.177778,8.950214,1.5e-05
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,...,595.84,1613.75,0.002708,0.384454,0.365857,22.028846,30.833333,42.805556,95.667735,0.000161
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,...,604.17,3692.1,0.006111,0.384454,0.365857,39.163462,138.166667,114.083333,291.413462,0.000482
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,...,614.88,5460.9,0.008881,0.384454,0.365857,58.067308,156.333333,179.638889,394.03953,0.000641
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,...,622.82,7074.5,0.011359,0.384454,0.365857,60.192308,268.833333,261.605556,590.631197,0.000948


Add GDP and employment per capita:

In [9]:
# GDP column scaled by 1,000,000 over the population column scaled by 1000
df['GDP_per_cap'] = df["GDP (MIO_EUR)"].mul(1000000).div(df['POP (THS)'].mul(1000))

# Employment column over the population column, both scaled by 1000
df['EMP_per_cap'] = df["EMP (THS)"].mul(1000).div(df['POP (THS)'].mul(1000))

df.head()

Unnamed: 0,year,nuts3_name,country,questioncount,answercount,upvotecount,downvotecount,commentcount,nuts3_code,EMP (THS),...,activity_per_cap,CAGR,CAGR_per_cap,experts,innovators,late_adopters,programmers_total,programmer_density,GDP_per_cap,EMP_per_cap
0,2008,Aargau,CH,86,167,0,0,32,CH033,349.067,...,0.000178,0.384454,0.365857,1.605769,7.166667,0.177778,8.950214,1.5e-05,42520.38079,0.595068
1,2009,Aargau,CH,370,2291,4158,265,3282,CH033,349.067,...,0.002708,0.384454,0.365857,22.028846,30.833333,42.805556,95.667735,0.000161,42934.591629,0.58584
2,2010,Aargau,CH,1658,4073,12602,474,7459,CH033,349.067,...,0.006111,0.384454,0.365857,39.163462,138.166667,114.083333,291.413462,0.000482,47043.239172,0.577763
3,2011,Aargau,CH,1876,6039,16410,2362,13563,CH033,348.695,...,0.008881,0.384454,0.365857,58.067308,156.333333,179.638889,394.03953,0.000641,52948.611812,0.567094
4,2012,Aargau,CH,3226,6260,24401,4175,18513,CH033,355.344,...,0.011359,0.384454,0.365857,60.192308,268.833333,261.605556,590.631197,0.000948,53696.486955,0.57054


In [10]:
# Count the missing values in every column
df.isnull().sum(axis=0)

year                  0
nuts3_name            0
country               0
questioncount         0
answercount           0
upvotecount           0
downvotecount         0
commentcount          0
nuts3_code            0
EMP (THS)             0
GDP (MIO_EUR)         0
GVA (MIO_EUR)         0
POP (THS)             0
activity              0
activity_per_cap      0
CAGR                  0
CAGR_per_cap          0
experts               0
innovators            0
late_adopters         0
programmers_total     0
programmer_density    0
GDP_per_cap           0
EMP_per_cap           0
dtype: int64

In [11]:
# Write the extended DataFrame to an Excel workbook, leaving out the row index
df.to_excel(excel_writer="MODELING.xlsx", index=False)