In [None]:
import dapla as dp
import pandas as pd
import datetime
import random
import numpy as np
import requests, json
from pyjstat import pyjstat
from tqdm.notebook import tqdm_notebook as pbar

### Settings

In [None]:
# Inputs / Parameters
# Default input locations. The trailing `_<year>_<size>` part of the population
# path is parsed further down to obtain start_year and start_pop_num.
comp_in_path = '/felles/mock_sysselsatte/companies_2019_30000'
pop_in_path = '/felles/mock_sysselsatte/population_2019_30000'

# NOTE(review): both frames are read here, i.e. before the following
# "Parameters" cell reassigns comp_in_path / pop_in_path, so these reads
# always use the default paths above.
companies = dp.read_pandas(comp_in_path)
population = dp.read_pandas(pop_in_path)    

In [None]:
# Parameters
comp_in_path = "/felles/mock_sysselsatte/companies_2022_30000"
pop_in_path = "/felles/mock_sysselsatte/population_2022_30000"


In [None]:
# Reload the inputs from the paths that are in effect at this point. The reads
# in the settings cell happen before the parameters cell overrides
# comp_in_path / pop_in_path, so without this reload the frames would come from
# the default paths while start_year below is parsed from the overridden path.
companies = dp.read_pandas(comp_in_path)
population = dp.read_pandas(pop_in_path)

# The population path ends in `_<year>_<size>`; pull both numbers out of it.
path_parts = pop_in_path.split('_')
start_year = int(path_parts[-2])
start_pop_num = int(path_parts[-1])

# Actual sizes of the loaded frames (start_pop_num is only the nominal size).
pop_num = len(population)
comp_num = len(companies)
print('Start year:\t\t', start_year,'\nPopulation size:\t', pop_num, '\nCompanies number:\t', comp_num)

In [None]:
pop_num_ratio = pop_num / 5328212

In [None]:
#companies

In [None]:
#population

### Statbank function

In [None]:
def statbank_pandas(url, payload):
    """Run a query against a Statbank table and return the result as a DataFrame.

    Parameters
    ----------
    url : str
        Endpoint of the table to query.
    payload : dict
        Query definition; sent as the JSON body of a POST request.

    Returns
    -------
    pandas.DataFrame
        The result of pyjstat's ``Dataset.write('dataframe')`` for the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    resultat = requests.post(url, json = payload)
    # Fail loudly on an HTTP error instead of handing an error body to the parser.
    resultat.raise_for_status()
    # Read the response body as a pyjstat Dataset object
    dataset = pyjstat.Dataset.read(resultat.text)
    # Write the main result to a dataframe
    df = dataset.write('dataframe')
    return df

# Populace

In [None]:
# Original number of workers at start of year
original_workers = len(population[~population['work_id'].isnull()])

### Ageing

In [None]:
# Age reached during the simulated year, i.e. the year after start_year.
population['age'] = start_year + 1 - population['year_birth']

In [None]:
population

### Birth

In [None]:
# 54495 in 2019 with a population of 5328212
total_pop = 5328212
birth_ratio = 54495 / total_pop
born_num = int(int(pop_num) * birth_ratio)
born_num

In [None]:
# Male to female ratio based on numbers from 2019
male = 28042
female = 26453

male_sex_ratio =  male / (male+female) 
female_sex_ratio = 1 - male_sex_ratio

male_sex_ratio 

In [None]:
def fnr_single(year, sex):
    """Build a mock 11-digit personal id for someone born in ``year``.

    Layout: ``DDMMYY`` (a random day of ``year``) followed by five
    pseudo-random digits.

    Parameters
    ----------
    year : int
        Four-digit birth year.
    sex : str
        ``'Menn'`` forces the last digit to be even; for any other value the
        five trailing digits are left as drawn.

    Returns
    -------
    str
        The 11-character id.
    """
    first_day = datetime.date(year, 1, 1)
    last_day = datetime.date(year, 12, 31)

    # randrange excludes its upper bound, so add one to make the last day of
    # the year reachable (the previous version could never return 31 December).
    days_in_year = (last_day - first_day).days + 1
    random_date = first_day + datetime.timedelta(days=random.randrange(days_in_year))
    date = f'{str(random_date.day).zfill(2)}{str(random_date.month).zfill(2)}'

    # Digits 5-6 are the last two digits of the birth year
    fnr = f'{date}{str(year)[2:]}'

    # Last 5 digits are random; for males the last digit must be divisible by two.
    # The draw stops below 99999 so that the +1 below can never produce 100000.
    last5 = np.random.randint(0,99998)
    # Odd number and male: bump by one to make it even
    if last5 % 2 and sex == 'Menn':
        last5 += 1

    return fnr + str(last5).zfill(5)

In [None]:
####### CHANGE WHEN ITERATING ########
# Every newborn of this step is born in the year after start_year.
year_birth = int(start_year) + 1

# The first `male_count` newborns are male, the remaining ones female.
male_count = int(born_num * male_sex_ratio)

newborn_rows = []
for i in range(born_num):
    sex = 'Menn' if i < male_count else 'Kvinner'
    fnr = fnr_single(year_birth, sex)
    # Values are positional and have to line up with population.columns.
    newborn_rows.append([fnr, None, sex, 0.0, year_birth, float('NaN')])

# Add all newborns in one go: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every single call.
if newborn_rows:
    newborns = pd.DataFrame(newborn_rows, columns = population.columns)
    population = pd.concat([population, newborns], ignore_index = True)

In [None]:
population.tail()

### Death

In [None]:
url = 'https://data.ssb.no/api/v0/no/table/07995/'
payload = {
  "query": [
    {
      "code": "Kjonn",
      "selection": {
        "filter": "item",
        "values": [
          "1",
          "2"
        ]
      }
    },
    {
      "code": "Alder",
      "selection": {
        "filter": "vs:AlleAldre00B",
        "values": [
          "000",
          "001",
          "002",
          "003",
          "004",
          "005",
          "006",
          "007",
          "008",
          "009",
          "010",
          "011",
          "012",
          "013",
          "014",
          "015",
          "016",
          "017",
          "018",
          "019",
          "020",
          "021",
          "022",
          "023",
          "024",
          "025",
          "026",
          "027",
          "028",
          "029",
          "030",
          "031",
          "032",
          "033",
          "034",
          "035",
          "036",
          "037",
          "038",
          "039",
          "040",
          "041",
          "042",
          "043",
          "044",
          "045",
          "046",
          "047",
          "048",
          "049",
          "050",
          "051",
          "052",
          "053",
          "054",
          "055",
          "056",
          "057",
          "058",
          "059",
          "060",
          "061",
          "062",
          "063",
          "064",
          "065",
          "066",
          "067",
          "068",
          "069",
          "070",
          "071",
          "072",
          "073",
          "074",
          "075",
          "076",
          "077",
          "078",
          "079",
          "080",
          "081",
          "082",
          "083",
          "084",
          "085",
          "086",
          "087",
          "088",
          "089",
          "090",
          "091",
          "092",
          "093",
          "094",
          "095",
          "096",
          "097",
          "098",
          "099",
          "100",
          "101",
          "102",
          "103",
          "104",
          "105+"
        ]
      }
    },
    {
      "code": "Tid",
      "selection": {
        "filter": "item",
        "values": [
          "2019"
        ]
      }
    }
  ],
  "response": {
    "format": "json-stat2"
  }
}

In [None]:
death_per_age = statbank_pandas(url, payload)

In [None]:
# Turn the age labels into integers: drop the open-ended suffix first (it
# contains the plain ' år' suffix), then the plain suffix.
death_per_age['alder'] = (
    death_per_age['alder']
    .str.replace(' år eller eldre','')
    .str.replace(' år','')
    .astype(int)
)

In [None]:
death_per_age['value_pop_ratio'] = round(death_per_age['value'].astype(int) * pop_num_ratio)

In [None]:
# We want this number to match the total death ratio
print(death_per_age['value'].sum() / total_pop)
print(death_per_age['value_pop_ratio'].sum() / pop_num)

In [None]:
accuracy = 0.01

# Reference death rate and integer death counts; both are fixed here because
# the 'value' column is not modified by the loops below.
reference_death_rate = death_per_age['value'].sum() / total_pop
reference_deaths = death_per_age['value'].astype(int)

# Step the scale factor up by `accuracy` while the scaled-down rate is too low ...
while reference_death_rate > death_per_age['value_pop_ratio'].sum() / pop_num:
    pop_num_ratio = pop_num_ratio * (1 + accuracy)
    death_per_age['value_pop_ratio'] = round(reference_deaths * pop_num_ratio)

# ... then step it back down while the scaled-down rate is too high.
while reference_death_rate < death_per_age['value_pop_ratio'].sum() / pop_num:
    pop_num_ratio = pop_num_ratio * (1 - accuracy)
    death_per_age['value_pop_ratio'] = round(reference_deaths * pop_num_ratio)

In [None]:
# We want this number to match the total death ratio
print(death_per_age['value'].sum() / total_pop)
print(death_per_age['value_pop_ratio'].sum() / pop_num)

In [None]:
kill_list = death_per_age[death_per_age['value_pop_ratio'].astype(int) > 0]

In [None]:
kill_list

In [None]:
# For every (sex, age) row of kill_list, remove that many randomly chosen
# people of the same sex and age from the population.
for _, death_row in kill_list.iterrows():
    sex = death_row['kjønn']
    age = death_row['alder']
    kill_num = int(death_row['value_pop_ratio'])

    same_group = (population['age'] == age) & (population['sex'] == sex)
    to_shuffle = population[same_group].index.tolist()
    random.shuffle(to_shuffle)

    # The slice simply comes up short when fewer than kill_num people match.
    population = population.drop(to_shuffle[:kill_num])

In [None]:
# Kill everyone over 105
population = population[population['age'] <= 105]

In [None]:
#population

### Emigration & Immigration

In [None]:
url = 'https://data.ssb.no/api/v0/no/table/09203/'
payload = {
  "query": [
    {
      "code": "Kjonn",
      "selection": {
        "filter": "item",
        "values": [
          "1",
          "2"
        ]
      }
    },
    {
      "code": "Alder",
      "selection": {
        "filter": "item",
        "values": [
          "00-04",
          "05-09",
          "10-14",
          "15-19",
          "20-24",
          "25-29",
          "30-34",
          "35-39",
          "40-44",
          "45-49",
          "50-54",
          "55-59",
          "60-64",
          "65-69",
          "70-74",
          "75-79",
          "80+"
        ]
      }
    },
    {
      "code": "ContentsCode",
      "selection": {
        "filter": "item",
        "values": [
          "Innvandring",
          "Utvandring"
        ]
      }
    },
    {
      "code": "Tid",
      "selection": {
        "filter": "item",
        "values": [
          "2019"
        ]
      }
    }
  ],
  "response": {
    "format": "json-stat2"
  }
}

In [None]:
migrate_age_sex = statbank_pandas(url, payload)

In [None]:
# Split the migration table by direction. Each part is copied because new
# columns are assigned to it later; assigning onto a plain slice of
# migrate_age_sex triggers pandas' SettingWithCopyWarning.
emigrate_age_sex = migrate_age_sex[migrate_age_sex['statistikkvariabel'] == 'Utvandring'].copy()
immigrate_age_sex = migrate_age_sex[migrate_age_sex['statistikkvariabel'] == 'Innvandring'].copy()

In [None]:
emigrat_ratio = emigrate_age_sex['value'].sum() / total_pop
emi_target = int(pop_num * emigrat_ratio)
emi_target

In [None]:
emigrate_age_sex['value_ratio'] = emigrate_age_sex['value'].astype(int) * emigrat_ratio

In [None]:
round(emigrate_age_sex['value_ratio']).sum()

In [None]:
accuracy = 0.01

# Reference emigration rate and integer counts; fixed, since the 'value'
# column is not modified by the loops below.
reference_emi_rate = emigrate_age_sex['value'].sum() / total_pop
reference_emigrants = emigrate_age_sex['value'].astype(int)

# Step the factor up by `accuracy` while the scaled-down rate is too low ...
while reference_emi_rate > emigrate_age_sex['value_ratio'].sum() / pop_num:
    emigrat_ratio = emigrat_ratio * (1 + accuracy)
    emigrate_age_sex['value_ratio'] = round(reference_emigrants * emigrat_ratio)

# ... then step it back down while the scaled-down rate is too high.
while reference_emi_rate < emigrate_age_sex['value_ratio'].sum() / pop_num:
    emigrat_ratio = emigrat_ratio * (1 - accuracy)
    emigrate_age_sex['value_ratio'] = round(reference_emigrants * emigrat_ratio)

print(emigrat_ratio)

In [None]:
round(emigrate_age_sex['value_ratio']).sum()

In [None]:
emi_list = emigrate_age_sex[emigrate_age_sex['value_ratio'].astype(int) > 0]

In [None]:
# Old emigration, picks randomly
#emi_list = list(population.index)
#random.shuffle(emi_list)
#population = population.drop(emi_list[:emi_target])
#population

In [None]:
# Split the age-group label into a lower and an upper bound. A label of the
# form "<low>-<high> år" yields both numbers. A label without a "-" has no
# upper bound and previously made the int conversion fail.
# NOTE(review): the open-ended group is presumably labelled like
# "80 år eller eldre" (the death table uses that wording) - confirm against
# the API response.
open_ended_max_age = 105  # nobody older than 105 is kept by the death step
emi_bounds = emi_list['alder'].str.extract(r'(\d+)(?:-(\d+))?')

# assign() builds a new frame, so nothing is written onto a slice of
# emigrate_age_sex.
emi_list = emi_list.assign(
    alder1 = pd.to_numeric(emi_bounds[0]).astype(int),
    alder2 = pd.to_numeric(emi_bounds[1]).fillna(open_ended_max_age).astype(int),
).drop("alder", axis = 'columns')

In [None]:
emi_list.head(3)

In [None]:
# For every (sex, age group) row, remove that many randomly chosen people of
# that sex whose age lies within the group's bounds (both inclusive).
for _, group in emi_list.iterrows():
    sex = group['kjønn']
    num = int(group['value_ratio'])
    age1, age2 = group['alder1'], group['alder2']

    print('Emigrate', num, sex, 'between age', age1, 'and', age2 )
    in_group = population['age'].between(age1, age2) & (population['sex'] == sex)
    selection = population[in_group].index.tolist()
    random.shuffle(selection)

    # The slice simply comes up short when fewer than num people match.
    population = population.drop(selection[:num])

In [None]:
#population

### Immigration

In [None]:
immigrat_ratio = immigrate_age_sex['value'].sum() / total_pop
immi_target  = int(pop_num * immigrat_ratio)
immi_target

In [None]:
immigrate_age_sex['value_ratio'] = immigrate_age_sex['value'].astype(int) * immigrat_ratio

In [None]:
round(immigrate_age_sex['value_ratio']).sum()

In [None]:
accuracy = 0.01

# Reference immigration rate and integer counts; fixed, since the 'value'
# column is not modified by the loops below.
reference_immi_rate = immigrate_age_sex['value'].sum() / total_pop
reference_immigrants = immigrate_age_sex['value'].astype(int)

# Step the factor up by `accuracy` while the scaled-down rate is too low ...
while reference_immi_rate > immigrate_age_sex['value_ratio'].sum() / pop_num:
    immigrat_ratio = immigrat_ratio * (1 + accuracy)
    immigrate_age_sex['value_ratio'] = round(reference_immigrants * immigrat_ratio)

# ... then step it back down while the scaled-down rate is too high.
while reference_immi_rate < immigrate_age_sex['value_ratio'].sum() / pop_num:
    immigrat_ratio = immigrat_ratio * (1 - accuracy)
    immigrate_age_sex['value_ratio'] = round(reference_immigrants * immigrat_ratio)

print(immigrat_ratio)

In [None]:
round(immigrate_age_sex['value_ratio']).sum()

In [None]:
immi_list = immigrate_age_sex[immigrate_age_sex['value_ratio'].astype(int) > 0]

In [None]:
# Old emigration, picks randomly
#emi_list = list(population.index)
#random.shuffle(emi_list)
#population = population.drop(emi_list[:emi_target])
#population

In [None]:
# Split the age-group label into a lower and an upper bound. A label of the
# form "<low>-<high> år" yields both numbers. A label without a "-" has no
# upper bound and previously made the int conversion fail.
# NOTE(review): the open-ended group is presumably labelled like
# "80 år eller eldre" (the death table uses that wording) - confirm against
# the API response. Immigrant ages for that group are then drawn uniformly up
# to the cap below, which is a simplification.
open_ended_max_age = 105  # nobody older than 105 is kept by the death step
immi_bounds = immi_list['alder'].str.extract(r'(\d+)(?:-(\d+))?')

# assign() builds a new frame, so nothing is written onto a slice of
# immigrate_age_sex.
immi_list = immi_list.assign(
    alder1 = pd.to_numeric(immi_bounds[0]).astype(int),
    alder2 = pd.to_numeric(immi_bounds[1]).fillna(open_ended_max_age).astype(int),
).drop("alder", axis = 'columns')

In [None]:
immi_list['value_ratio'].sum()

In [None]:
population.head(3)

In [None]:
# Create the immigrants group by group and add them to the population in a
# single concat. DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
count = 0
immigrant_rows = []

for _, row in immi_list.iterrows():
    sex = row['kjønn']
    num = int(row['value_ratio'])
    age1 = row['alder1']
    age2 = row['alder2']

    print('Immigrate', num, sex, 'between age', age1, 'and', age2 )

    # Separate throwaway loop variable; the old inner loop reused the outer `i`.
    for _ in range(num):
        # randint excludes the upper bound, hence the +1 to include age2
        age = np.random.randint(age1, age2 + 1)

        ######## CHANGE WITH ITERATIONS ########
        year_birth = start_year - age + 1

        fnr = fnr_single(year_birth, sex)

        # Values are positional and have to line up with population.columns.
        immigrant_rows.append([fnr, None, sex, age, year_birth, float('NaN')])
        count += 1

if immigrant_rows:
    immigrants = pd.DataFrame(immigrant_rows, columns = population.columns)
    population = pd.concat([population, immigrants], ignore_index = True)

print(count)

In [None]:
population.tail(5)

#### Remove dupes on person-id

In [None]:
population.drop_duplicates('id', inplace = True)

# Companies

In [None]:
# Yearly figures for Norway, 2019-2021:
#   net increase in the number of businesses: 12 343
#   bankruptcies:                              4 557
# so the number of newly started companies is taken as 12 343 + 4 557 = 16 900
bankrupt_num = 4557
started_comp_num = 16900

# Share of the national population that the mock population represents
ratio_pop = pop_num / total_pop

# Despite the `_ratio` suffix these are whole-number counts scaled to the mock population.
bankrupt_ratio, started_comp_ratio = (
    int(national_count * ratio_pop)
    for national_count in (bankrupt_num, started_comp_num)
)
print(bankrupt_ratio, started_comp_ratio)

In [None]:
companies

### Bankrupt

In [None]:
# Prefer companies with zero employees, i.e. whose work_id no person refers to
zero_employees = companies[~companies['work_id'].isin(pd.unique(population['work_id']))]

# Then add the other companies in random order to fill the list, if necessary.
# The membership test has to use zero_employees.index: iterating a DataFrame
# yields its column labels, so passing the frame itself excluded nothing and
# the zero-employee companies ended up in the list twice.
selection = companies[~companies.index.isin(zero_employees.index)].index.tolist()
random.shuffle(selection)

# Combine the lists, with the zero-employee companies first
selection = zero_employees.index.tolist() + selection

# Drop from the top of the list
companies = companies.drop(selection[:bankrupt_ratio])

In [None]:
#companies

### New companies
Add one employee to each new company.

In [None]:
url = 'https://data.ssb.no/api/v0/no/table/11606/'
payload = {
  "query": [
    {
      "code": "Region",
      "selection": {
        "filter": "vs:Fylker1972m22",
        "values": [
          "01",
          "02",
          "03",
          "04",
          "05",
          "06",
          "07",
          "08",
          "09",
          "10",
          "11",
          "12",
          "14",
          "15",
          "50",
          "16",
          "17",
          "18",
          "19",
          "20",
          "21",
          "22",
          "23"
        ]
      }
    },
    {
      "code": "Alder",
      "selection": {
        "filter": "item",
        "values": [
          "15-74"
        ]
      }
    },
    {
      "code": "NACE2007",
      "selection": {
        "filter": "vs:NACE2007regsys5siff",
        "values": [
          "01.110",
          "01.120",
          "01.130",
          "01.140",
          "01.150",
          "01.160",
          "01.190",
          "01.210",
          "01.220",
          "01.230",
          "01.240",
          "01.250",
          "01.260",
          "01.270",
          "01.280",
          "01.290",
          "01.300",
          "01.410",
          "01.420",
          "01.430",
          "01.440",
          "01.451",
          "01.452",
          "01.460",
          "01.471",
          "01.479",
          "01.490",
          "01.500",
          "01.610",
          "01.620",
          "01.630",
          "01.640",
          "01.700",
          "01.000u",
          "02.100",
          "02.200",
          "02.300",
          "02.400",
          "02.000u",
          "03.111",
          "03.112",
          "03.120",
          "03.211",
          "03.212",
          "03.213",
          "03.221",
          "03.222",
          "03.223",
          "03.000u",
          "05.100",
          "05.200",
          "06.100",
          "06.200",
          "07.100",
          "07.210",
          "07.290",
          "08.111",
          "08.112",
          "08.113",
          "08.120",
          "08.910",
          "08.920",
          "08.930",
          "08.990",
          "09.101",
          "09.109",
          "09.900",
          "10.110",
          "10.120",
          "10.130",
          "10.201",
          "10.202",
          "10.203",
          "10.209",
          "10.310",
          "10.320",
          "10.390",
          "10.411",
          "10.412",
          "10.413",
          "10.420",
          "10.510",
          "10.520",
          "10.610",
          "10.620",
          "10.710",
          "10.720",
          "10.730",
          "10.810",
          "10.820",
          "10.830",
          "10.840",
          "10.850",
          "10.860",
          "10.890",
          "10.910",
          "10.920",
          "11.010",
          "11.020",
          "11.030",
          "11.040",
          "11.050",
          "11.060",
          "11.070",
          "12.000",
          "13.100",
          "13.200",
          "13.300",
          "13.910",
          "13.921",
          "13.929",
          "13.930",
          "13.940",
          "13.950",
          "13.960",
          "13.990",
          "14.110",
          "14.120",
          "14.130",
          "14.140",
          "14.190",
          "14.200",
          "14.310",
          "14.390",
          "15.110",
          "15.120",
          "15.200",
          "16.100",
          "16.210",
          "16.220",
          "16.231",
          "16.232",
          "16.240",
          "16.290",
          "17.110",
          "17.120",
          "17.210",
          "17.220",
          "17.230",
          "17.240",
          "17.290",
          "18.110",
          "18.120",
          "18.130",
          "18.140",
          "18.200",
          "19.100",
          "19.200",
          "20.110",
          "20.120",
          "20.130",
          "20.140",
          "20.150",
          "20.160",
          "20.170",
          "20.200",
          "20.300",
          "20.410",
          "20.420",
          "20.510",
          "20.520",
          "20.530",
          "20.590",
          "20.600",
          "21.100",
          "21.200",
          "22.110",
          "22.190",
          "22.210",
          "22.220",
          "22.230",
          "22.290",
          "23.110",
          "23.120",
          "23.130",
          "23.140",
          "23.190",
          "23.200",
          "23.310",
          "23.320",
          "23.410",
          "23.420",
          "23.430",
          "23.440",
          "23.490",
          "23.510",
          "23.520",
          "23.610",
          "23.620",
          "23.630",
          "23.640",
          "23.650",
          "23.690",
          "23.700",
          "23.910",
          "23.990",
          "24.101",
          "24.102",
          "24.200",
          "24.310",
          "24.320",
          "24.330",
          "24.340",
          "24.410",
          "24.421",
          "24.422",
          "24.430",
          "24.440",
          "24.450",
          "24.460",
          "24.510",
          "24.520",
          "24.530",
          "24.540",
          "25.110",
          "25.120",
          "25.210",
          "25.290",
          "25.300",
          "25.400",
          "25.500",
          "25.610",
          "25.620",
          "25.710",
          "25.720",
          "25.730",
          "25.910",
          "25.920",
          "25.930",
          "25.940",
          "25.990",
          "26.110",
          "26.120",
          "26.200",
          "26.300",
          "26.400",
          "26.510",
          "26.520",
          "26.600",
          "26.700",
          "26.800",
          "27.110",
          "27.120",
          "27.200",
          "27.310",
          "27.320",
          "27.330",
          "27.400",
          "27.510",
          "27.520",
          "27.900",
          "28.110",
          "28.120",
          "28.130",
          "28.140",
          "28.150",
          "28.210",
          "28.221",
          "28.229",
          "28.230",
          "28.240",
          "28.250",
          "28.290",
          "28.300",
          "28.410",
          "28.490",
          "28.910",
          "28.920",
          "28.930",
          "28.940",
          "28.950",
          "28.960",
          "28.990",
          "29.100",
          "29.200",
          "29.310",
          "29.320",
          "30.111",
          "30.112",
          "30.113",
          "30.114",
          "30.115",
          "30.116",
          "30.120",
          "30.200",
          "30.300",
          "30.400",
          "30.910",
          "30.920",
          "30.990",
          "31.010",
          "31.020",
          "31.030",
          "31.090",
          "32.110",
          "32.120",
          "32.130",
          "32.200",
          "32.300",
          "32.400",
          "32.500",
          "32.910",
          "32.990",
          "33.110",
          "33.120",
          "33.130",
          "33.140",
          "33.150",
          "33.160",
          "33.170",
          "33.190",
          "33.200",
          "35.111",
          "35.112",
          "35.113",
          "35.114",
          "35.119",
          "35.120",
          "35.130",
          "35.140",
          "35.210",
          "35.220",
          "35.230",
          "35.300",
          "36.000",
          "37.000",
          "38.110",
          "38.120",
          "38.210",
          "38.220",
          "38.310",
          "38.320",
          "39.000",
          "41.101",
          "41.109",
          "41.200",
          "42.110",
          "42.120",
          "42.130",
          "42.210",
          "42.220",
          "42.910",
          "42.990",
          "43.110",
          "43.120",
          "43.130",
          "43.210",
          "43.220",
          "43.221",
          "43.222",
          "43.223",
          "43.290",
          "43.310",
          "43.320",
          "43.330",
          "43.341",
          "43.342",
          "43.390",
          "43.911",
          "43.919",
          "43.990",
          "45.111",
          "45.112",
          "45.191",
          "45.192",
          "45.200",
          "45.310",
          "45.320",
          "45.401",
          "45.402",
          "45.403",
          "46.110",
          "46.120",
          "46.130",
          "46.140",
          "46.150",
          "46.160",
          "46.170",
          "46.180",
          "46.190",
          "46.210",
          "46.220",
          "46.230",
          "46.240",
          "46.310",
          "46.320",
          "46.330",
          "46.341",
          "46.349",
          "46.350",
          "46.360",
          "46.370",
          "46.381",
          "46.389",
          "46.390",
          "46.410",
          "46.421",
          "46.422",
          "46.431",
          "46.432",
          "46.433",
          "46.434",
          "46.435",
          "46.441",
          "46.442",
          "46.450",
          "46.460",
          "46.471",
          "46.472",
          "46.473",
          "46.481",
          "46.482",
          "46.491",
          "46.492",
          "46.493",
          "46.494",
          "46.495",
          "46.499",
          "46.510",
          "46.520",
          "46.610",
          "46.620",
          "46.630",
          "46.640",
          "46.650",
          "46.660",
          "46.691",
          "46.692",
          "46.693",
          "46.694",
          "46.710",
          "46.720",
          "46.731",
          "46.732",
          "46.733",
          "46.739",
          "46.740",
          "46.750",
          "46.761",
          "46.769",
          "46.770",
          "46.900",
          "47.111",
          "47.112",
          "47.190",
          "47.210",
          "47.220",
          "47.230",
          "47.241",
          "47.242",
          "47.251",
          "47.259",
          "47.260",
          "47.291",
          "47.292",
          "47.299",
          "47.300",
          "47.410",
          "47.420",
          "47.430",
          "47.510",
          "47.521",
          "47.522",
          "47.523",
          "47.524",
          "47.529",
          "47.531",
          "47.532",
          "47.533",
          "47.540",
          "47.591",
          "47.592",
          "47.593",
          "47.594",
          "47.599",
          "47.610",
          "47.620",
          "47.630",
          "47.641",
          "47.642",
          "47.650",
          "47.710",
          "47.721",
          "47.722",
          "47.730",
          "47.740",
          "47.750",
          "47.761",
          "47.762",
          "47.771",
          "47.772",
          "47.781",
          "47.782",
          "47.789",
          "47.791",
          "47.792",
          "47.799",
          "47.810",
          "47.820",
          "47.890",
          "47.911",
          "47.912",
          "47.913",
          "47.914",
          "47.915",
          "47.916",
          "47.917",
          "47.919",
          "47.990",
          "49.100",
          "49.200",
          "49.311",
          "49.312",
          "49.320",
          "49.391",
          "49.392",
          "49.393",
          "49.410",
          "49.420",
          "49.500",
          "50.101",
          "50.102",
          "50.109",
          "50.201",
          "50.202",
          "50.203",
          "50.204",
          "50.300",
          "50.400",
          "51.100",
          "51.210",
          "51.220",
          "52.100",
          "52.211",
          "52.212",
          "52.213",
          "52.214",
          "52.215",
          "52.216",
          "52.219",
          "52.221",
          "52.222",
          "52.223",
          "52.229",
          "52.230",
          "52.240",
          "52.291",
          "52.292",
          "52.293",
          "52.299",
          "53.100",
          "53.200",
          "55.101",
          "55.102",
          "55.201",
          "55.202",
          "55.300",
          "55.301",
          "55.302",
          "55.900",
          "56.101",
          "56.102",
          "56.210",
          "56.290",
          "56.301",
          "56.309",
          "58.110",
          "58.120",
          "58.130",
          "58.140",
          "58.190",
          "58.210",
          "58.290",
          "59.110",
          "59.120",
          "59.130",
          "59.140",
          "59.200",
          "60.100",
          "60.200",
          "61.100",
          "61.200",
          "61.300",
          "61.900",
          "62.010",
          "62.020",
          "62.030",
          "62.090",
          "63.110",
          "63.120",
          "63.910",
          "63.990",
          "64.110",
          "64.190",
          "64.201",
          "64.202",
          "64.301",
          "64.302",
          "64.303",
          "64.304",
          "64.305",
          "64.306",
          "64.308",
          "64.309",
          "64.910",
          "64.920",
          "64.990",
          "65.110",
          "65.120",
          "65.200",
          "65.300",
          "66.110",
          "66.120",
          "66.190",
          "66.210",
          "66.220",
          "66.290",
          "66.300",
          "68.100",
          "68.201",
          "68.209",
          "68.310",
          "68.320",
          "69.100",
          "69.201",
          "69.202",
          "69.203",
          "70.100",
          "70.210",
          "70.220",
          "71.111",
          "71.112",
          "71.113",
          "71.121",
          "71.122",
          "71.123",
          "71.129",
          "71.200",
          "72.110",
          "72.190",
          "72.200",
          "73.110",
          "73.120",
          "73.200",
          "74.101",
          "74.102",
          "74.103",
          "74.200",
          "74.300",
          "74.901",
          "74.902",
          "74.903",
          "74.909",
          "75.000",
          "77.110",
          "77.120",
          "77.210",
          "77.220",
          "77.290",
          "77.310",
          "77.320",
          "77.330",
          "77.340",
          "77.350",
          "77.390",
          "77.400",
          "78.100",
          "78.200",
          "78.300",
          "79.110",
          "79.120",
          "79.901",
          "79.902",
          "79.903",
          "79.909",
          "80.100",
          "80.200",
          "80.300",
          "81.101",
          "81.109",
          "81.210",
          "81.220",
          "81.291",
          "81.299",
          "81.300",
          "82.110",
          "82.190",
          "82.201",
          "82.202",
          "82.300",
          "82.910",
          "82.920",
          "82.990",
          "84.110",
          "84.120",
          "84.130",
          "84.210",
          "84.220",
          "84.230",
          "84.240",
          "84.250",
          "84.300",
          "85.100",
          "85.201",
          "85.202",
          "85.203",
          "85.310",
          "85.320",
          "85.410",
          "85.421",
          "85.422",
          "85.423",
          "85.424",
          "85.429",
          "85.510",
          "85.521",
          "85.522",
          "85.529",
          "85.530",
          "85.591",
          "85.592",
          "85.593",
          "85.594",
          "85.595",
          "85.596",
          "85.599",
          "85.601",
          "85.609",
          "86.101",
          "86.102",
          "86.103",
          "86.104",
          "86.105",
          "86.106",
          "86.107",
          "86.211",
          "86.212",
          "86.221",
          "86.222",
          "86.223",
          "86.224",
          "86.225",
          "86.230",
          "86.901",
          "86.902",
          "86.903",
          "86.904",
          "86.905",
          "86.906",
          "86.907",
          "86.909",
          "87.101",
          "87.102",
          "87.201",
          "87.202",
          "87.203",
          "87.301",
          "87.302",
          "87.303",
          "87.304",
          "87.305",
          "87.901",
          "87.909",
          "88.101",
          "88.102",
          "88.103",
          "88.911",
          "88.912",
          "88.913",
          "88.914",
          "88.991",
          "88.992",
          "88.993",
          "88.994",
          "88.995",
          "88.996",
          "88.997",
          "88.998",
          "88.999",
          "90.011",
          "90.012",
          "90.019",
          "90.020",
          "90.031",
          "90.032",
          "90.033",
          "90.034",
          "90.035",
          "90.039",
          "90.040",
          "91.011",
          "91.012",
          "91.013",
          "91.021",
          "91.022",
          "91.023",
          "91.029",
          "91.030",
          "91.040",
          "92.000",
          "93.110",
          "93.120",
          "93.130",
          "93.190",
          "93.210",
          "93.291",
          "93.292",
          "93.299",
          "94.110",
          "94.120",
          "94.200",
          "94.910",
          "94.920",
          "94.991",
          "94.992",
          "95.110",
          "95.120",
          "95.210",
          "95.220",
          "95.230",
          "95.240",
          "95.250",
          "95.290",
          "96.010",
          "96.020",
          "96.030",
          "96.040",
          "96.090",
          "97.000",
          "99.000",
          "00.000"
        ]
      }
    },
    {
      "code": "ContentsCode",
      "selection": {
        "filter": "item",
        "values": [
          "Sysselsatte"
        ]
      }
    },
    {
      "code": "Tid",
      "selection": {
        "filter": "item",
        "values": [
          '2019'
        ]
      }
    }
  ],
  "response": {
    "format": "json-stat2"
  }
}

In [None]:
df_nace_codes = statbank_pandas(url, payload)

# Codes in the order they were requested
region_codes = payload['query'][0]['selection']['values']
nace_codes = payload['query'][2]['selection']['values']

# One row per (region, NACE) combination is expected; the remaining query
# dimensions each select a single value.
expected_rows = len(region_codes) * len(nace_codes)
if len(df_nace_codes) != expected_rows:
    raise ValueError(
        f'Expected {expected_rows} rows (regions x NACE codes), got {len(df_nace_codes)}'
    )

# NOTE(review): assumes the rows follow the query's dimension order - region
# outermost, NACE code varying fastest within each region - TODO confirm by
# comparing against the label columns of the response.
# The previous loops wrote the NACE codes as if NACE were the outermost
# dimension and the region codes as if region were the outermost one; those
# two layouts contradict each other. Region-major is used for both here.
df_nace_codes['nace'] = np.tile(nace_codes, len(region_codes))
df_nace_codes['employee_points'] = 0
df_nace_codes['region_code'] = np.repeat(region_codes, len(nace_codes))

In [None]:
df_nace_codes = df_nace_codes[df_nace_codes['value'] > 0]

In [None]:
# Semi-shuffle of df_nace_codes: largest values first, then split the row labels
# into a "static" part that keeps its order and a part that is shuffled.
df_nace_codes = df_nace_codes.sort_values(by = 'value', ascending = False)

# Every `step`-th row (2 or 3), starting from row `start` (0, 1 or 2), stays static
step = np.random.randint(2, 4)
start = np.random.randint(0, 3)
static = df_nace_codes.index[start::step].tolist()

# All remaining labels are shuffled in place
to_shuffle = df_nace_codes.index[~df_nace_codes.index.isin(static)].tolist()
random.shuffle(to_shuffle)


In [None]:
# Interleave the two label lists: every `step`-th position takes the next static
# label, all other positions take the next shuffled label. As soon as the list
# that is due runs out, interleaving stops and the leftovers are appended
# (static first, then shuffled).
nace_order = []
count_static = 0
count_shuff = 0
for idx in range(len(static) + len(to_shuffle)):
    if idx % step == 0:
        if count_static >= len(static):
            break
        nace_order.append(static[count_static])
        count_static += 1
    else:
        if count_shuff >= len(to_shuffle):
            break
        nace_order.append(to_shuffle[count_shuff])
        count_shuff += 1

# Explicit bounds checks replace the former bare `except:` clauses, which also
# hid unrelated errors (e.g. an undefined `step`) and left the order incomplete.
nace_order += static[count_static:]
nace_order += to_shuffle[count_shuff:]

In [None]:
# Compare the first five row labels of the sorted table with the new order
print(df_nace_codes.index[:5].tolist())
print(nace_order[:5])

In [None]:
# Rearrange the rows to follow the semi-shuffled label order and show the result
df_nace_codes = df_nace_codes.loc[nace_order, :]
df_nace_codes

In [None]:
# Start one new company for each of the first `started_comp_ratio` rows of
# df_nace_codes, and give each a random founder who is not currently employed.

# Unemployed people aged 17-73, in random order. Drawing from the end of this
# list is a uniform pick without replacement, so nobody is selected twice.
eligible = (population['work_id'].isnull()) & (population['age'] > 16) & (population['age'] < 74)
employ_select = population.index[eligible].unique().tolist()
random.shuffle(employ_select)

new_company_rows = []
companies_without_founder = 0
for i, row in df_nace_codes[:started_comp_ratio].iterrows():
    # Random 9-digit, zero-padded id; duplicates are removed in a later step
    work_id = str(np.random.randint(0, 1000000000)).zfill(9)
    # Column order: work_id, nace, region_code, region, employee_points
    new_company_rows.append([work_id, row['nace'], row['region_code'], row['region'], 1])

    if employ_select:
        population.loc[employ_select.pop(), 'work_id'] = work_id
    else:
        companies_without_founder += 1

# Append all new companies in one go; row-by-row DataFrame.append is quadratic
# and no longer exists in pandas >= 2.0.
if new_company_rows:
    companies = pd.concat(
        [companies, pd.DataFrame(new_company_rows, columns = companies.columns)],
        ignore_index = True,
    )

if companies_without_founder:
    print(companies_without_founder, 'new companies got no employee, nobody eligible was left.')

In [None]:
#print(population[population['work_id'] == '044062808'])

In [None]:
#companies

#### Remove dupes on company ids

In [None]:
# Keep only the first company for every work_id (modifies `companies` in place)
companies.drop_duplicates(subset = ['work_id'], keep = 'first', inplace = True)

# Work

### Move workers from non-existing companies to existing companies

In [None]:
# People who have a work_id that does not belong to any existing company
selection = population[
    population['work_id'].notnull()
    & ~population['work_id'].isin(companies['work_id'].tolist())
]
selection_ids = selection.index.tolist()

In [None]:
# Give every worker whose company no longer exists a random existing company.
# The list of company ids does not change inside the loop, so it is built once
# rather than rebuilt and fully shuffled for every worker.
existing_work_ids = [str(work_id) for work_id in companies['work_id']]
for i in selection_ids:
    # Uniform pick among existing companies (IndexError if there are none)
    population.loc[i, 'work_id'] = random.choice(existing_work_ids)

In [None]:
# Sanity check: after the move, nobody should work at a non-existing company,
# so this frame is expected to be empty.
selection = population[
    population['work_id'].notnull()
    & ~population['work_id'].isin(companies['work_id'].tolist())
]
selection

### Firings (leaving job)

In [None]:
# Statbank table 12317: both content codes for sector "ALLE", as json-stat2
url = 'https://data.ssb.no/api/v0/no/table/12317/'
payload = {
    "query": [
        {
            "code": "Sektor",
            "selection": {"filter": "item", "values": ["ALLE"]},
        },
        {
            "code": "ContentsCode",
            "selection": {
                "filter": "item",
                "values": ["AvgVirkTotalt", "TilgVirkTotalt"],
            },
        },
    ],
    "response": {"format": "json-stat2"},
}

In [None]:
# Download the table selected by `url` and `payload` as a DataFrame
work_down = statbank_pandas(url = url, payload = payload)

In [None]:
# The table is quarterly: the mean over all rows times 4 gives job loss per year
lost_jobs = 4 * work_down.loc[
    work_down['statistikkvariabel'] == 'Jobbnedgang, virksomheter i alt', 'value'
].mean()
# Number of people who should now lose their jobs, scaled to this population
jobloss_corr = int(len(population) * (lost_jobs / total_pop))

In [None]:
# Everyone who currently has a job
workers = population[population['work_id'].notnull()]
print('Want to fire', round(jobloss_corr / len(workers) * 100), '% of workers in the population.')

# Shuffle the worker ids and keep the first `jobloss_corr` of them
select_workers = workers.index.tolist()
random.shuffle(select_workers)
del select_workers[jobloss_corr:]

# Fired workers lose both their employer and their work percentage
population.loc[select_workers, 'work_id'] = None
population.loc[select_workers, 'work_percent'] = np.nan

print('Fired', len(select_workers), 'of', len(workers), 'original workers.')

In [None]:
# Number of people who still have a job after the firings
int(population['work_id'].notnull().sum())

### Pensions

In [None]:
# Workers above 62 are pensioned with a chance (in percent) of
# 100 - (82 - age) * 5, i.e. 5 % at age 63, rising to 100 % at age 82.
potential_pensioners = population[(population['age'] > 62) & (~population['work_id'].isnull())]

pensioned_ids = []
for i, age in potential_pensioners['age'].items():
    chance = 100 - (82 - int(age)) * 5
    # Draw from 1..100 (100 outcomes) so `chance` is an exact percentage;
    # a 0..100 draw has 101 outcomes and makes every chance slightly too high.
    if random.randint(1, 100) <= chance:
        pensioned_ids.append(i)

# Pensioners lose their employer and, like fired workers, their work percentage,
# so they are not counted as having a position in the later steps.
population.loc[pensioned_ids, 'work_id'] = None
population.loc[pensioned_ids, 'work_percent'] = np.nan

count = len(pensioned_ids)
print('Pensioned', count, 'of', len(potential_pensioners), 'potential pensioners.')

In [None]:
# Nobody aged 82 or older should still have a job; report it if someone does
workers_82 = population[population['work_id'].notnull() & population['age'].ge(82)]
if not workers_82.empty:
    print(len(workers_82), 'workers above the age of 82, there should be 0.')

### Hirings (starting job)

In [None]:
# Hire according to ratio first: mean of the quarterly rows times 4 = per year
gained_jobs = 4 * work_down.loc[
    work_down['statistikkvariabel'] == 'Jobboppgang, virksomheter i alt', 'value'
].mean()
# Number of people we should hire according to the statistic, scaled to this population
jobgain_corr = int(len(population) * (gained_jobs / total_pop))
print(jobgain_corr)

In [None]:
# Difference between 15261 (the number of working people we should have
# according to other statistics) and the number currently employed
15261 - int(population['work_id'].notnull().sum())

In [None]:
# Hire `jobgain_corr` unemployed people. We prefer to hire people in their most
# productive years, and the industry is drawn in proportion to the employment
# figures in df_nace_codes.
count = 0
total_original_workers = df_nace_codes['value'].sum()

# Existing company ids per NACE code
companies_by_nace = {
    nace: work_ids.tolist()
    for nace, work_ids in companies.groupby('nace')['work_id']
}

# Weight of each NACE code = summed value over all its rows. Codes without any
# company are left out, since nobody can be hired there.
nace_weights = df_nace_codes.groupby('nace')['value'].sum()
nace_weights = nace_weights[nace_weights > 0]
nace_weights = nace_weights[nace_weights.index.isin(list(companies_by_nace))]
hire_naces = nace_weights.index.tolist()
hire_weights = nace_weights.tolist()

# Unemployed candidates per age (17-66), each pool in random order, so taking
# from the end of a pool is a uniform pick among people of that age.
unemployed = population[population['work_id'].isnull() & population['age'].between(17, 66)]
pool_by_age = {age: labels.tolist() for age, labels in unemployed.groupby('age').groups.items()}
for pool in pool_by_age.values():
    random.shuffle(pool)

# Integer bound: random.randint rejects floats such as 82 / 2 on Python 3.12+
half_range = 82 // 2

with pbar(total = jobgain_corr) as pbar1:
    # Stop at the target, or earlier when there is nobody left to hire or no
    # industry to hire into (otherwise the loop would never end).
    while count < jobgain_corr and pool_by_age and hire_naces:
        # The sum of two dice gives a triangular distribution that prefers the
        # mid-range (around 40 years of age)
        draw = random.randint(0, half_range) + random.randint(0, half_range)
        pool = pool_by_age.get(draw)
        if not pool:
            # Age outside 17-66, or nobody unemployed of this age: draw again
            continue

        # Pick a random nace, weighted by its number of employed
        nace = random.choices(hire_naces, weights = hire_weights)[0]
        # Pick a random company with this nace
        work_id = random.choice(companies_by_nace[nace])

        # Pick one random person of the drawn age and hire them at the company
        pop_id = pool.pop()
        if not pool:
            del pool_by_age[draw]
        population.loc[pop_id, 'work_id'] = work_id
        count += 1
        pbar1.update(1)

if count < jobgain_corr:
    print('Stopped early, only', count, 'of', jobgain_corr, 'hirings were possible.')
print('Hired', count, 'people.')

In [None]:
# Compare the number of workers before the firings with the number employed now
print(
    'Went from',
    len(workers),
    'workers before firings, to',
    int(population['work_id'].notnull().sum()),
    'after hirings and pensionings.',
)

### Re-establish work-percentage distribution

In [None]:
# Reshuffle existing work-percentages: everyone who has a work percentage gets a
# random score between -25 and 25, and only scores beyond +/-10 are applied.
select_percents = population[~population['work_percent'].isnull()]

# Integer bound: random.randint rejects floats such as 50 / 2 on Python 3.12+
half_range = 50 // 2

for i, current_percent in select_percents['work_percent'].items():
    # The sum of two dice, centred on 0, makes small scores the most likely
    score = random.randint(0, half_range) + random.randint(0, half_range) - 25
    if abs(score) > 10:
        work_percent = int(current_percent) + score
        if work_percent > 100:
            work_percent = 100
        elif work_percent < 0:
            # Negative results are lifted to 5 percent
            work_percent = 5
        population.loc[i, 'work_percent'] = work_percent

In [None]:
# Statbank table 12541, selected by age group, sex, HovedBiarbeid "T" and
# working-time bracket for 2019, as json-stat2
url = 'https://data.ssb.no/api/v0/no/table/12541/'
payload = {
    "query": [
        {
            "code": "Alder",
            "selection": {
                "filter": "item",
                "values": ["15-19", "20-24", "25-39", "40-54", "55-66", "67-74"],
            },
        },
        {
            "code": "Kjonn",
            "selection": {"filter": "item", "values": ["1", "2"]},
        },
        {
            "code": "HovedBiarbeid",
            "selection": {"filter": "item", "values": ["T"]},
        },
        {
            "code": "ArbeidsTidRen",
            "selection": {
                "filter": "vs:ArbTidS12",
                "values": [
                    "P000-009", "P010-019", "P020-029", "P030-039",
                    "P040-049", "P050-059", "P060-069", "P070-079",
                    "P080-089", "P090-099", "P100",
                ],
            },
        },
        {
            "code": "Tid",
            "selection": {"filter": "item", "values": ["2019"]},
        },
    ],
    "response": {"format": "json-stat2"},
}

In [None]:
# Download the table and split the text labels into lower/upper bound columns
df_work_percent = statbank_pandas(url, payload)

# Age label without the ' år' suffix, split on '-' into alder1 / alder2
df_work_percent[['alder1', 'alder2']] = (
    df_work_percent['alder']
    .str.replace(' år','')
    .str.split('-', expand = True)
)

# Working-time label without ' prosent'; ' eller mer' becomes an upper bound of 100
df_work_percent[['arbeidstid1', 'arbeidstid2']] = (
    df_work_percent['arbeidstid']
    .str.replace(' prosent','')
    .str.replace(' eller mer','-100')
    .str.split('-', expand = True)
)

# Keep only the bound columns, the sex label and the count
df_work_percent = df_work_percent[['alder1', 'alder2', 'kjønn', 'arbeidstid1', 'arbeidstid2', 'value']]

In [None]:
# This amount of rows should have work_percents
work_percent_target = round((df_work_percent['value'].sum() / total_pop) * len(population))
# Subtract the amount that already has work_percent
work_percent_target = work_percent_target - len(population[~population['work_percent'].isnull()])
# A negative target means nobody more should be assigned. Without the clamp,
# `selection[:negative]` would select almost the whole list instead of nothing.
assign_count = max(int(work_percent_target), 0)

# Pick the people without work percents, but that work in companies, make a list over ids in population-dataframe
selection = population[(population['work_percent'].isnull()) & (~population['work_id'].isnull())].index.tolist()
# Shuffle the id-list
random.shuffle(selection)

# The age bounds do not change inside the loop, so cast them once
age_low = df_work_percent['alder1'].astype(int)
age_high = df_work_percent['alder2'].astype(int)

# Loop through the shuffled list, until we hit the amount we are looking for
for i in selection[:assign_count]:

    # Get persons sex and age
    age = int(population.loc[i, 'age'])
    sex = population.loc[i, 'sex']

    # Rows of df_work_percent that match this persons sex and age
    appropriate = df_work_percent[(df_work_percent['kjønn'] == sex) &
                    (age_low <= age) &
                    (age_high >= age)]

    total = int(appropriate['value'].sum())
    if total <= 0:
        # No statistics for this age/sex, the person keeps an empty work_percent
        continue

    # Draw from 1..total, so each bracket is chosen exactly in proportion to its
    # value (a draw of 0 would always land in the first bracket)
    draw = random.randint(1, total)
    for index, r in appropriate.iterrows():
        # Place in corresponding category
        draw -= int(r['value'])
        if draw <= 0:
            # Set percent as a random number between the bracket edges
            percent = random.randint(int(r['arbeidstid1']), int(r['arbeidstid2']))

            # Apply the work_percent to the dataset
            population.loc[i, 'work_percent'] = percent
            break

In [None]:
# Target number of work_percents minus the number actually set;
# this should be 0 if we added the correct amount
round(len(population) * (df_work_percent['value'].sum() / total_pop)) - int(population['work_percent'].notnull().sum())

# Checks

### Companies without employees

In [None]:
# Companies whose work_id is not used by anyone in the population
companies[~companies['work_id'].isin(population['work_id'].unique().tolist())]

### Workers with non-existing companies

In [None]:
# People who have a work_id that is not found among the companies
population[
    population['work_id'].notnull()
    & ~population['work_id'].isin(companies['work_id'].tolist())
]

## Save files

In [None]:
# Output goes next to the input: same directory, year increased by one
start_path = pop_in_path.rpartition('/')[0]
next_year = start_year + 1
comp_outpath = f'{start_path}/companies_{next_year}_{start_pop_num}'
pop_outpath = f'{start_path}/population_{next_year}_{start_pop_num}'
del next_year

In [None]:
# Write both frames with dapla to the output paths built for the next year
dp.write_pandas(companies, comp_outpath, valuation='OPEN', state= 'OUTPUT')
dp.write_pandas(population, pop_outpath, valuation='OPEN', state= 'OUTPUT')

In [None]:
# Also save local CSV copies: semicolon separated, UTF-8 with BOM
companies.to_csv(
    path_or_buf = f'companies_{start_year + 1}_{start_pop_num}_v001.csv',
    sep = ';',
    encoding = 'utf-8-sig',
)
population.to_csv(
    path_or_buf = f'population_{start_year + 1}_{start_pop_num}_v001.csv',
    sep = ';',
    encoding = 'utf-8-sig',
)