In [None]:
import pandas as pd
import dapla as dp
import numpy as np
import requests, json
from pyjstat import pyjstat
import datetime
import random

In [None]:
# Reference year: passed as the "Tid" value in the Statbank queries below and
# used to derive birth years from ages.
curr_year = 2019
# Target number of synthetic persons to generate.
pop_num = 30000

### Statbank function

In [None]:
def statbank_pandas(url, payload):
    """Post a query to a Statbank table endpoint and return the result as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the table endpoint the query is posted to.
    payload : dict
        Query body; it is sent as JSON.

    Returns
    -------
    The object produced by ``pyjstat.Dataset.write('dataframe')`` for the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    resultat = requests.post(url, json = payload)
    # Stop on HTTP errors here, so a rejected query is reported as such and
    # not as a confusing parse error from pyjstat further down.
    resultat.raise_for_status()
    # Read the response as a pyjstat Dataset object
    dataset = pyjstat.Dataset.read(resultat.text)
    # Write the main result to a dataframe
    df = dataset.write('dataframe')
    return df

### Population start

In [None]:
# Sex and age are assigned so that they follow their distribution in the general population

In [None]:
url = 'https://data.ssb.no/api/v0/no/table/07459/'

# Query body for the table above: one region value, both sex codes,
# ten age-group codes and the reference year, returned as json-stat2.
payload = {
    "query": [
        {"code": "Region", "selection": {"filter": "vs:Landet", "values": ["0"]}},
        {"code": "Kjonn", "selection": {"filter": "item", "values": ["1", "2"]}},
        {
            "code": "Alder",
            "selection": {
                "filter": "agg:Funksjonell4",
                "values": [
                    "F311", "F312", "F313", "F314", "F315",
                    "F316", "F317", "F318", "F319", "F320",
                ],
            },
        },
        {"code": "Tid", "selection": {"filter": "item", "values": [curr_year]}},
    ],
    "response": {"format": "json-stat2"},
}

In [None]:
df_age_sex = statbank_pandas(url, payload)

# Rewrite every age label to the form "low-high" so it can be split into two bounds.
# The order of the replacements matters: the single-year label '0 år' is handled
# first, then the unit ' år' is stripped, and only then does the open-ended group
# read '90 eller eldre'.
# `regex` is given explicitly on every call because the default of str.replace
# differs between pandas versions; only the first pattern (with the '^' anchor)
# is meant as a regular expression.
df_age_sex['alder'] = df_age_sex['alder'].str.replace('^0 år', '0-0', regex = True)
df_age_sex['alder'] = df_age_sex['alder'].str.replace(' år', '', regex = False)
df_age_sex['alder'] = df_age_sex['alder'].str.replace('90 eller eldre', '90-100', regex = False)

# Lower and upper age bound (still strings) in separate columns
df_age_sex[['alder1', 'alder2']] = df_age_sex['alder'].str.split('-', expand = True)
df_age_sex = df_age_sex[['kjønn', 'value', 'alder1', 'alder2']]

In [None]:
# Fall back to the full population count from Statbank when no target size
# has been set earlier in the notebook
if 'pop_num' not in locals():
    pop_num = df_age_sex['value'].sum()

pop_num

In [None]:
# Empty person table with one (all-missing) row per synthetic person, indexed 0 .. pop_num - 1
person_columns = ['id', 'work_id', 'sex', 'age', 'year_birth', 'work_percent']
population = pd.DataFrame([], columns = person_columns).reindex(list(range(pop_num)))

In [None]:
# Share of the Statbank population total that the synthetic population represents;
# used to scale each sex/age group count down to the target size.
peep_ratio = pop_num / df_age_sex['value'].sum()

In [None]:
# Give every synthetic person a sex and an age, one sex/age group at a time,
# in proportion to the group's share of the real population.
#
# The drawn values are collected in lists and written to the frame with a single
# positional assignment per column. Assigning through `population['sex'].iloc[i]`
# is chained indexing, which pandas does not guarantee to write back to the frame.
sex_col = population.columns.get_loc('sex')
age_col = population.columns.get_loc('age')
# Never assign more people than there are rows to hold them
capacity = min(int(pop_num), len(population))

sexes = []
ages = []
for i, row in df_age_sex.iterrows():
    target = int(round(row['value'] * peep_ratio))

    print(target, '\t', row['kjønn'], ' mellom ', row['alder1'] , ' og ' , row['alder2'], ' år.')

    # Rounding each group separately can make the targets add up to more than
    # the number of rows; cap the group at whatever room is left.
    draw = max(0, min(target, capacity - len(sexes)))
    if draw == 0:
        continue
    sexes.extend([row['kjønn']] * draw)
    # Uniform random age within the group's bounds, upper bound included
    ages.extend(np.random.randint(int(row['alder1']), high = int(row['alder2']) + 1, size = draw).tolist())

count = len(sexes)
if count:
    population.iloc[:count, sex_col] = sexes
    population.iloc[:count, age_col] = ages
print('Totalt', count)

In [None]:
population['age']

In [None]:
# Year of birth = reference year minus age, for every row that has an age.
# Done as one label-based .loc assignment: writing through
# `population['year_birth'].loc[i]` is chained indexing, which pandas does not
# guarantee to write back to the frame.
has_age = population['age'].notna()
population.loc[has_age, 'year_birth'] = curr_year - population.loc[has_age, 'age'].astype(int)

In [None]:
# Build a synthetic 11-character person id for every row with a birth year:
# DDMM (random date in the birth year) + YY + five random digits.
for i, row in population.iterrows():
    # NaN is the only value that differs from itself, so this skips rows without a birth year
    if row['year_birth'] == row['year_birth']:
        birth_year = int(row['year_birth'])

        # First 4 digits are a random date of the birth year, so that only
        # dates that actually existed that year can be picked
        start_date = datetime.date(birth_year, 1, 1)
        end_date = datetime.date(birth_year, 12, 31)
        days_between_dates = (end_date - start_date).days
        # randrange excludes its upper bound; + 1 keeps 31 December reachable
        random_number_of_days = random.randrange(days_between_dates + 1)
        random_date = start_date + datetime.timedelta(days=random_number_of_days)
        date = f'{str(random_date.day).zfill(2)}{str(random_date.month).zfill(2)}'

        # Digits 5-6 are the last two digits of the birth year
        fnr = f'{date}{str(birth_year)[2:]}'

        # Last 5 digits are random; if sex is male, the number is made divisible by two.
        # The draw stops below 99999 so that the + 1 can never give a sixth digit.
        last5 = np.random.randint(0,99998)
        if last5 % 2 and row['sex'] == 'Menn':
            last5 += 1
        last5 = str(last5).zfill(5)

        fnr = fnr + last5

        # `i` is an index label, so write with the label-based .at accessor
        population.at[i, 'id'] = fnr

In [None]:
# Keep only the first row for each person id.
# NOTE(review): rows that never got an id presumably count as duplicates of
# each other here, so all but one of them are removed as well - confirm.
population.drop_duplicates(subset = ['id'], keep = 'first', inplace = True)

In [None]:
# Work_percent according to age and sex

In [None]:
url = 'https://data.ssb.no/api/v0/no/table/12541/'

# Query body for the table above: six age groups, both sex codes, one
# HovedBiarbeid value, eleven working-time bands and the reference year.
payload = {
    "query": [
        {
            "code": "Alder",
            "selection": {
                "filter": "item",
                "values": ["15-19", "20-24", "25-39", "40-54", "55-66", "67-74"],
            },
        },
        {"code": "Kjonn", "selection": {"filter": "item", "values": ["1", "2"]}},
        {"code": "HovedBiarbeid", "selection": {"filter": "item", "values": ["T"]}},
        {
            "code": "ArbeidsTidRen",
            "selection": {
                "filter": "vs:ArbTidS12",
                "values": [
                    "P000-009", "P010-019", "P020-029", "P030-039",
                    "P040-049", "P050-059", "P060-069", "P070-079",
                    "P080-089", "P090-099", "P100",
                ],
            },
        },
        {"code": "Tid", "selection": {"filter": "item", "values": [curr_year]}},
    ],
    "response": {"format": "json-stat2"},
}

In [None]:
df_work_percent = statbank_pandas(url, payload)

# Age label without its unit, then split on '-' into lower and upper bound
age_labels = df_work_percent['alder'].str.replace(' år','')
df_work_percent['alder'] = age_labels
df_work_percent[['alder1', 'alder2']] = age_labels.str.split('-', expand = True)

# Working-time label without its unit; the open-ended top band gets 100 as upper bound
worktime_labels = (
    df_work_percent['arbeidstid']
    .str.replace(' prosent','')
    .str.replace(' eller mer','-100')
)
df_work_percent['arbeidstid'] = worktime_labels
df_work_percent[['arbeidstid1', 'arbeidstid2']] = worktime_labels.str.split('-', expand = True)

# Keep only the columns the assignment loop needs
kept_columns = ['alder1', 'alder2', 'kjønn', 'arbeidstid1', 'arbeidstid2', 'value']
df_work_percent = df_work_percent[kept_columns]
df_work_percent

### Warning: The next cell only fills rows where `work_percent` is still empty, so every re-run fills more of the population until no empty rows are left and the cell fails

In [None]:
# Hand out working-time percentages: for each age group x sex x working-time band,
# pick a random set of matching persons that have no work_percent yet and give
# each a random percentage inside the band.
population_total = df_age_sex['value'].sum()

for i, row in df_work_percent.iterrows():
    # Number of rows to fill: the band's head count as a share of the whole
    # population, scaled to the synthetic population size
    target = int(round( (int(row['value']) / population_total) * pop_num ))

    # Persons inside the age range, of the matching sex, with work_percent not yet filled
    candidates = population[
        (population['age'] >= int(row['alder1']))
        & (population['age'] <= int(row['alder2']))
        & (population['sex'] == row['kjønn'])
        & population['work_percent'].isnull()
    ]

    indexes = candidates.index.tolist()
    random.shuffle(indexes)

    # Only as many rows can be filled as there are candidates; indexes[x] is
    # valid for x < len(indexes) only.
    fill_count = min(target, len(indexes))
    if fill_count < target:
        print('Ran out of indexes in selection at', fill_count, 'in', row['kjønn'], 'between ages of', row['alder1'], 'and', row['alder2'] )

    if fill_count > 0:
        chosen = indexes[:fill_count]
        # Random value between the band's extremes, upper bound included.
        # One label-based .loc assignment instead of chained
        # `population['work_percent'].loc[...]`, which pandas does not
        # guarantee to write back to the frame.
        population.loc[chosen, 'work_percent'] = np.random.randint(
            int(row['arbeidstid1']), int(row['arbeidstid2']) + 1, size = fill_count
        )

In [None]:
# Frequency of each assigned work percentage (rows without a value are left out)
population['work_percent'].dropna().value_counts().plot()

In [None]:
# Share of the population that is employed: head count in the working-time
# table divided by the head count in the population table
employed_headcount = df_work_percent['value'].sum()
population_headcount = df_age_sex['value'].sum()
working_ratio = employed_headcount / population_headcount
working_ratio

### Companies start

In [None]:
url = 'https://data.ssb.no/api/v0/no/table/11606/'

# Query body for the table above. Later cells read the region codes from
# payload['query'][0] and the NACE codes from payload['query'][2], so both the
# position of those entries and the order of their values must stay as they are.
# The NACE codes are laid out one two-digit prefix per group of lines.
payload = {
    "query": [
        {
            "code": "Region",
            "selection": {
                "filter": "vs:Fylker1972m22",
                "values": [
                    "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
                    "14", "15", "50", "16", "17", "18", "19", "20", "21", "22", "23",
                ],
            },
        },
        {"code": "Alder", "selection": {"filter": "item", "values": ["15-74"]}},
        {
            "code": "NACE2007",
            "selection": {
                "filter": "vs:NACE2007regsys5siff",
                "values": [
                    "01.110", "01.120", "01.130", "01.140", "01.150", "01.160", "01.190", "01.210", "01.220", "01.230",
                    "01.240", "01.250", "01.260", "01.270", "01.280", "01.290", "01.300", "01.410", "01.420", "01.430",
                    "01.440", "01.451", "01.452", "01.460", "01.471", "01.479", "01.490", "01.500", "01.610", "01.620",
                    "01.630", "01.640", "01.700", "01.000u",
                    "02.100", "02.200", "02.300", "02.400", "02.000u",
                    "03.111", "03.112", "03.120", "03.211", "03.212", "03.213", "03.221", "03.222", "03.223", "03.000u",
                    "05.100", "05.200",
                    "06.100", "06.200",
                    "07.100", "07.210", "07.290",
                    "08.111", "08.112", "08.113", "08.120", "08.910", "08.920", "08.930", "08.990",
                    "09.101", "09.109", "09.900",
                    "10.110", "10.120", "10.130", "10.201", "10.202", "10.203", "10.209", "10.310", "10.320", "10.390",
                    "10.411", "10.412", "10.413", "10.420", "10.510", "10.520", "10.610", "10.620", "10.710", "10.720",
                    "10.730", "10.810", "10.820", "10.830", "10.840", "10.850", "10.860", "10.890", "10.910", "10.920",
                    "11.010", "11.020", "11.030", "11.040", "11.050", "11.060", "11.070",
                    "12.000",
                    "13.100", "13.200", "13.300", "13.910", "13.921", "13.929", "13.930", "13.940", "13.950", "13.960",
                    "13.990",
                    "14.110", "14.120", "14.130", "14.140", "14.190", "14.200", "14.310", "14.390",
                    "15.110", "15.120", "15.200",
                    "16.100", "16.210", "16.220", "16.231", "16.232", "16.240", "16.290",
                    "17.110", "17.120", "17.210", "17.220", "17.230", "17.240", "17.290",
                    "18.110", "18.120", "18.130", "18.140", "18.200",
                    "19.100", "19.200",
                    "20.110", "20.120", "20.130", "20.140", "20.150", "20.160", "20.170", "20.200", "20.300", "20.410",
                    "20.420", "20.510", "20.520", "20.530", "20.590", "20.600",
                    "21.100", "21.200",
                    "22.110", "22.190", "22.210", "22.220", "22.230", "22.290",
                    "23.110", "23.120", "23.130", "23.140", "23.190", "23.200", "23.310", "23.320", "23.410", "23.420",
                    "23.430", "23.440", "23.490", "23.510", "23.520", "23.610", "23.620", "23.630", "23.640", "23.650",
                    "23.690", "23.700", "23.910", "23.990",
                    "24.101", "24.102", "24.200", "24.310", "24.320", "24.330", "24.340", "24.410", "24.421", "24.422",
                    "24.430", "24.440", "24.450", "24.460", "24.510", "24.520", "24.530", "24.540",
                    "25.110", "25.120", "25.210", "25.290", "25.300", "25.400", "25.500", "25.610", "25.620", "25.710",
                    "25.720", "25.730", "25.910", "25.920", "25.930", "25.940", "25.990",
                    "26.110", "26.120", "26.200", "26.300", "26.400", "26.510", "26.520", "26.600", "26.700", "26.800",
                    "27.110", "27.120", "27.200", "27.310", "27.320", "27.330", "27.400", "27.510", "27.520", "27.900",
                    "28.110", "28.120", "28.130", "28.140", "28.150", "28.210", "28.221", "28.229", "28.230", "28.240",
                    "28.250", "28.290", "28.300", "28.410", "28.490", "28.910", "28.920", "28.930", "28.940", "28.950",
                    "28.960", "28.990",
                    "29.100", "29.200", "29.310", "29.320",
                    "30.111", "30.112", "30.113", "30.114", "30.115", "30.116", "30.120", "30.200", "30.300", "30.400",
                    "30.910", "30.920", "30.990",
                    "31.010", "31.020", "31.030", "31.090",
                    "32.110", "32.120", "32.130", "32.200", "32.300", "32.400", "32.500", "32.910", "32.990",
                    "33.110", "33.120", "33.130", "33.140", "33.150", "33.160", "33.170", "33.190", "33.200",
                    "35.111", "35.112", "35.113", "35.114", "35.119", "35.120", "35.130", "35.140", "35.210", "35.220",
                    "35.230", "35.300",
                    "36.000",
                    "37.000",
                    "38.110", "38.120", "38.210", "38.220", "38.310", "38.320",
                    "39.000",
                    "41.101", "41.109", "41.200",
                    "42.110", "42.120", "42.130", "42.210", "42.220", "42.910", "42.990",
                    "43.110", "43.120", "43.130", "43.210", "43.220", "43.221", "43.222", "43.223", "43.290", "43.310",
                    "43.320", "43.330", "43.341", "43.342", "43.390", "43.911", "43.919", "43.990",
                    "45.111", "45.112", "45.191", "45.192", "45.200", "45.310", "45.320", "45.401", "45.402", "45.403",
                    "46.110", "46.120", "46.130", "46.140", "46.150", "46.160", "46.170", "46.180", "46.190", "46.210",
                    "46.220", "46.230", "46.240", "46.310", "46.320", "46.330", "46.341", "46.349", "46.350", "46.360",
                    "46.370", "46.381", "46.389", "46.390", "46.410", "46.421", "46.422", "46.431", "46.432", "46.433",
                    "46.434", "46.435", "46.441", "46.442", "46.450", "46.460", "46.471", "46.472", "46.473", "46.481",
                    "46.482", "46.491", "46.492", "46.493", "46.494", "46.495", "46.499", "46.510", "46.520", "46.610",
                    "46.620", "46.630", "46.640", "46.650", "46.660", "46.691", "46.692", "46.693", "46.694", "46.710",
                    "46.720", "46.731", "46.732", "46.733", "46.739", "46.740", "46.750", "46.761", "46.769", "46.770",
                    "46.900",
                    "47.111", "47.112", "47.190", "47.210", "47.220", "47.230", "47.241", "47.242", "47.251", "47.259",
                    "47.260", "47.291", "47.292", "47.299", "47.300", "47.410", "47.420", "47.430", "47.510", "47.521",
                    "47.522", "47.523", "47.524", "47.529", "47.531", "47.532", "47.533", "47.540", "47.591", "47.592",
                    "47.593", "47.594", "47.599", "47.610", "47.620", "47.630", "47.641", "47.642", "47.650", "47.710",
                    "47.721", "47.722", "47.730", "47.740", "47.750", "47.761", "47.762", "47.771", "47.772", "47.781",
                    "47.782", "47.789", "47.791", "47.792", "47.799", "47.810", "47.820", "47.890", "47.911", "47.912",
                    "47.913", "47.914", "47.915", "47.916", "47.917", "47.919", "47.990",
                    "49.100", "49.200", "49.311", "49.312", "49.320", "49.391", "49.392", "49.393", "49.410", "49.420",
                    "49.500",
                    "50.101", "50.102", "50.109", "50.201", "50.202", "50.203", "50.204", "50.300", "50.400",
                    "51.100", "51.210", "51.220",
                    "52.100", "52.211", "52.212", "52.213", "52.214", "52.215", "52.216", "52.219", "52.221", "52.222",
                    "52.223", "52.229", "52.230", "52.240", "52.291", "52.292", "52.293", "52.299",
                    "53.100", "53.200",
                    "55.101", "55.102", "55.201", "55.202", "55.300", "55.301", "55.302", "55.900",
                    "56.101", "56.102", "56.210", "56.290", "56.301", "56.309",
                    "58.110", "58.120", "58.130", "58.140", "58.190", "58.210", "58.290",
                    "59.110", "59.120", "59.130", "59.140", "59.200",
                    "60.100", "60.200",
                    "61.100", "61.200", "61.300", "61.900",
                    "62.010", "62.020", "62.030", "62.090",
                    "63.110", "63.120", "63.910", "63.990",
                    "64.110", "64.190", "64.201", "64.202", "64.301", "64.302", "64.303", "64.304", "64.305", "64.306",
                    "64.308", "64.309", "64.910", "64.920", "64.990",
                    "65.110", "65.120", "65.200", "65.300",
                    "66.110", "66.120", "66.190", "66.210", "66.220", "66.290", "66.300",
                    "68.100", "68.201", "68.209", "68.310", "68.320",
                    "69.100", "69.201", "69.202", "69.203",
                    "70.100", "70.210", "70.220",
                    "71.111", "71.112", "71.113", "71.121", "71.122", "71.123", "71.129", "71.200",
                    "72.110", "72.190", "72.200",
                    "73.110", "73.120", "73.200",
                    "74.101", "74.102", "74.103", "74.200", "74.300", "74.901", "74.902", "74.903", "74.909",
                    "75.000",
                    "77.110", "77.120", "77.210", "77.220", "77.290", "77.310", "77.320", "77.330", "77.340", "77.350",
                    "77.390", "77.400",
                    "78.100", "78.200", "78.300",
                    "79.110", "79.120", "79.901", "79.902", "79.903", "79.909",
                    "80.100", "80.200", "80.300",
                    "81.101", "81.109", "81.210", "81.220", "81.291", "81.299", "81.300",
                    "82.110", "82.190", "82.201", "82.202", "82.300", "82.910", "82.920", "82.990",
                    "84.110", "84.120", "84.130", "84.210", "84.220", "84.230", "84.240", "84.250", "84.300",
                    "85.100", "85.201", "85.202", "85.203", "85.310", "85.320", "85.410", "85.421", "85.422", "85.423",
                    "85.424", "85.429", "85.510", "85.521", "85.522", "85.529", "85.530", "85.591", "85.592", "85.593",
                    "85.594", "85.595", "85.596", "85.599", "85.601", "85.609",
                    "86.101", "86.102", "86.103", "86.104", "86.105", "86.106", "86.107", "86.211", "86.212", "86.221",
                    "86.222", "86.223", "86.224", "86.225", "86.230", "86.901", "86.902", "86.903", "86.904", "86.905",
                    "86.906", "86.907", "86.909",
                    "87.101", "87.102", "87.201", "87.202", "87.203", "87.301", "87.302", "87.303", "87.304", "87.305",
                    "87.901", "87.909",
                    "88.101", "88.102", "88.103", "88.911", "88.912", "88.913", "88.914", "88.991", "88.992", "88.993",
                    "88.994", "88.995", "88.996", "88.997", "88.998", "88.999",
                    "90.011", "90.012", "90.019", "90.020", "90.031", "90.032", "90.033", "90.034", "90.035", "90.039",
                    "90.040",
                    "91.011", "91.012", "91.013", "91.021", "91.022", "91.023", "91.029", "91.030", "91.040",
                    "92.000",
                    "93.110", "93.120", "93.130", "93.190", "93.210", "93.291", "93.292", "93.299",
                    "94.110", "94.120", "94.200", "94.910", "94.920", "94.991", "94.992",
                    "95.110", "95.120", "95.210", "95.220", "95.230", "95.240", "95.250", "95.290",
                    "96.010", "96.020", "96.030", "96.040", "96.090",
                    "97.000",
                    "99.000",
                    "00.000",
                ],
            },
        },
        {"code": "ContentsCode", "selection": {"filter": "item", "values": ["Sysselsatte"]}},
        {"code": "Tid", "selection": {"filter": "item", "values": [curr_year]}},
    ],
    "response": {"format": "json-stat2"},
}

In [None]:
# Run the query held in `url`/`payload` (at this point the 11606 query defined
# in the previous cell) and keep the result as a DataFrame.
df_nace_codes = statbank_pandas(url, payload)
df_nace_codes

In [None]:
df_nace_codes['nace'] = ''

In [None]:
# Apply nacecode
# `payload` must still be the 11606 query here; the codes are read back from it.
#
# The frame holds one row per region x NACE code, with all rows of a region in
# one consecutive run (the region_code cell and region_len below rely on the
# same layout). Inside each run the codes are taken to follow the order they
# were requested in, so the full code list is repeated once per region.
# NOTE(review): the previous version repeated each code over n_regions
# consecutive rows, which does not fit that layout - confirm the row order on
# a sample against the label columns.
nace_values = payload['query'][2]['selection']['values']
n_regions = len(pd.unique(df_nace_codes['region']))

# A mismatch means the response does not contain every region x code combination
assert len(nace_values) * n_regions == len(df_nace_codes), \
    'Number of rows does not match regions x nace codes'

# Whole-column assignment instead of chained `df_nace_codes['nace'].iloc[i] = ...`,
# which pandas does not guarantee to write back to the frame
df_nace_codes['nace'] = list(nace_values) * n_regions

In [None]:
# Number of consecutive rows that belong to one region: total rows divided by
# the number of distinct regions
distinct_regions = pd.unique(df_nace_codes['region'])
region_len = int(len(df_nace_codes) / len(distinct_regions))
region_len

In [None]:
payload['query'][0]['selection']['values']

In [None]:
df_nace_codes['region_code'] = ''

In [None]:
# Apply region code in increments: every requested region code covers a run of
# region_len consecutive rows, in the order the codes were requested.
# `payload` must still be the 11606 query here; the codes are read back from it.
region_values = payload['query'][0]['selection']['values']

# A mismatch means the response does not have region_len rows for every requested region
assert len(region_values) * region_len == len(df_nace_codes), \
    'Number of rows does not match region codes x region_len'

# Whole-column assignment instead of chained `df_nace_codes['region_code'].iloc[i] = ...`,
# which pandas does not guarantee to write back to the frame
df_nace_codes['region_code'] = [code for code in region_values for _ in range(region_len)]

In [None]:
df_nace_codes[820:840]

In [None]:
df_nace_codes['employee_fit'] = 0

In [None]:
# Starting scale factor from Statbank head counts to the synthetic population:
# target size divided by the population total. Later cells adjust it in small steps.
fit_ratio_nace = pop_num / df_age_sex['value'].sum()
fit_ratio_nace

In [None]:
df_nace_codes['value'].sum()

In [None]:
# Number of employed people in the selected population: the summed 'value'
# column of df_nace_codes, scaled by fit_ratio_nace and rounded to a whole number
total_employed = round(df_nace_codes['value'].sum() * fit_ratio_nace )
total_employed

In [None]:
# Scaled head count per row, rounded to whole employees
scaled_employees = df_nace_codes['value'] * fit_ratio_nace
df_nace_codes['employee_fit'] = scaled_employees.round(0).astype(int)

In [None]:
accuracy = 0.001

# Rounding every row separately makes the row total drift away from
# total_employed. Adjust the scale factor in relative steps of `accuracy`:
# first upwards while the rounded total is too low, then downwards while it is
# too high.
step_up = 1.0 + accuracy
step_down = 1.0 - accuracy

# Rounded total too small: scale up
while df_nace_codes['employee_fit'].round(0).sum() < total_employed:
    fit_ratio_nace = fit_ratio_nace * step_up
    df_nace_codes['employee_fit'] = df_nace_codes['value'].mul(fit_ratio_nace)

# Rounded total too large: scale down
while df_nace_codes['employee_fit'].round(0).sum() > total_employed:
    fit_ratio_nace = fit_ratio_nace * step_down
    df_nace_codes['employee_fit'] = df_nace_codes['value'].mul(fit_ratio_nace)

# Updated fit-ratio
print(fit_ratio_nace)

# Store employee_fit as whole numbers
rounded_fit = df_nace_codes['employee_fit'].round(0)
df_nace_codes['employee_fit'] = rounded_fit.astype(int)

In [None]:
# This should reflect the amount of employees "total_employed" now
df_nace_codes['employee_fit'].sum()

In [None]:
url = 'https://data.ssb.no/api/v0/no/table/07091/'

# Query body for the table above: region codes, two-digit NACE codes,
# six AntAnsatte codes and the reference year, returned as json-stat2.
payload = {
    "query": [
        {
            "code": "Region",
            "selection": {
                "filter": "vs:Fylker",
                "values": [
                    "30", "01", "02", "06", "03", "34", "04", "05", "38", "07",
                    "08", "42", "09", "10", "11", "46", "12", "13", "14", "15",
                    "50", "16", "17", "18", "54", "19", "20", "21", "22", "23",
                    "25", "26", "88", "99",
                ],
            },
        },
        {
            "code": "NACE2007",
            "selection": {
                "filter": "item",
                "values": [
                    "01", "02", "03", "05", "06", "07", "08", "09", "10", "11",
                    "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
                    "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
                    "32", "33", "35", "36", "37", "38", "39", "41", "42", "43",
                    "45", "46", "47", "49", "50", "51", "52", "53", "55", "56",
                    "58", "59", "60", "61", "62", "63", "64", "65", "66", "68",
                    "69", "70", "71", "72", "73", "74", "75", "77", "78", "79",
                    "80", "81", "82", "84", "85", "86", "87", "88", "90", "91",
                    "92", "93", "94", "95", "96", "97", "99", "00",
                ],
            },
        },
        {
            "code": "AntAnsatte",
            "selection": {
                "filter": "item",
                "values": ["01", "02", "03", "04", "05", "15"],
            },
        },
        {"code": "Tid", "selection": {"filter": "item", "values": [curr_year]}},
    ],
    "response": {"format": "json-stat2"},
}

In [None]:
resultat = requests.post(url, json = payload)
# Stop on HTTP errors here, so a rejected query is reported as such and not as
# a confusing parse error from pyjstat
resultat.raise_for_status()
# Read the response as a pyjstat Dataset object
dataset = pyjstat.Dataset.read(resultat.text)
# Write the main result to a dataframe
df_num_comps = dataset.write('dataframe')
# Second conversion with naming='id' to pick up the 'Region' column of that
# variant as region_code
df_num_comps['region_code'] = dataset.write('dataframe', naming='id')['Region']
df_num_comps

In [None]:
# Number of companies with employees: the summed 'value' column of df_num_comps.
# This is the Statbank total; a later cell rescales comp_num to the synthetic population.
comp_num = df_num_comps['value'].sum()
comp_num

In [None]:
company_columns = ['work_id', 'nace','region_code', 'region', 'employee_points']

# Companies per person in the Statbank figures, applied to the synthetic population size.
# NOTE: comp_num is overwritten with the scaled value, so this cell must run
# exactly once after the cell that reads comp_num from df_num_comps.
people_to_comps_ratio = comp_num / df_age_sex['value'].sum()
comp_num = round(pop_num * people_to_comps_ratio)
#comp_num = 300000

# Empty company table with one (all-missing) row per company, indexed 0 .. comp_num - 1
companies = pd.DataFrame([], columns = company_columns).reindex(list(range(comp_num)))

# Number of companies in relation to current population target
comp_num

In [None]:
# We would prefer there to be more than one company per nace-code

In [None]:
# Sanity check: print comp_num next to the number of df_nace_codes rows with value > 0
print(comp_num, 'should be larger than', len(df_nace_codes[df_nace_codes['value'] > 0]), 
      'if we want more than one company for each combination of valid nacekode x region.')

In [None]:
# work_id: random number below 1 000 000 000, zero-padded to 9 digits, one per company row.
# The whole column is assigned at once; the chained form
# companies['work_id'].iloc[i] = ... writes through an intermediate Series and
# is not guaranteed to reach the frame (it does not under pandas Copy-on-Write).
companies['work_id'] = [
    str(number).zfill(9)
    for number in np.random.randint(0, 1000000000, size=len(companies))
]

In [None]:
# Remove dupes on company-ids: keeps the first row for every work_id, in place.
# NOTE(review): any dropped row leaves companies shorter than comp_num, while
# later cells address rows by position up to comp_num - 1 — confirm this cannot bite.
companies.drop_duplicates('work_id', inplace = True)

In [None]:
# Display the companies frame (work_id filled, remaining columns still empty)
companies

In [None]:
# Display the nace-code table that the companies are built from
df_nace_codes

In [None]:
# If we just made groups from these, we would get n-number of work places, minimum
# (ngroups = number of distinct nace codes among rows with employee_fit > 0)
print ( df_nace_codes[df_nace_codes['employee_fit'] > 0 ].groupby('nace').ngroups )
# We want this number
comp_num

In [None]:
# Remove all rows that are set to zero employee_fit
fit_comp_employ = df_nace_codes[df_nace_codes['employee_fit'] > 0]
# Total employee_fit per nace code
fit_comp_sums = fit_comp_employ.groupby('nace')['employee_fit'].sum()
# Keep at most comp_num nace codes, the ones with the largest totals
wanted_naces = fit_comp_sums.sort_values(ascending = False).head(comp_num).index.tolist()
#print(wanted_naces)
# .copy() makes the selection an independent frame, so column assignments made
# on fit_comp_employ afterwards do not target a slice of df_nace_codes
# (SettingWithCopyWarning / ambiguous write).
fit_comp_employ = fit_comp_employ[fit_comp_employ['nace'].isin(wanted_naces)].copy()

In [None]:
# Total employee_fit across the selected nace rows
fit_comp_employ['employee_fit'].sum()

In [None]:
# Tune fit_ratio_nace in relative steps of `accuracy` until the rounded
# employee_fit total meets total_employed. On every step employee_fit is
# recomputed from the 'value' column times the current ratio.
# If smaller increment up
while fit_comp_employ['employee_fit'].round(0).sum() < total_employed:
    fit_ratio_nace *= 1.0 + accuracy
    fit_comp_employ['employee_fit'] = fit_comp_employ['value'] * fit_ratio_nace
    #print(fit_ratio_nace)

# If greater increment down
while fit_comp_employ['employee_fit'].round(0).sum() > total_employed:
    fit_ratio_nace *= 1.0 - accuracy
    fit_comp_employ['employee_fit'] = fit_comp_employ['value'] * fit_ratio_nace

In [None]:
# Freeze employee_fit as whole numbers (rounded, then cast to int)
fit_comp_employ['employee_fit'] = fit_comp_employ['employee_fit'].round(0).astype(int)

# Number of distinct nace codes (one company is seeded per nace further down)
print(fit_comp_employ.groupby('nace').ngroups)

# Number of employees in selection
print(fit_comp_employ['employee_fit'].sum())

In [None]:
# Add an employee_sum column, initialised to 0 on every row
fit_comp_employ['employee_sum'] = 0

In [None]:
# Group by nace, sort by employee_fit, combine values, keep only the first row
# NOTE(review): the code below only sorts (within each nace, then across the
# whole frame); nothing is combined or dropped here — confirm the intent.
fit_comp_employ = fit_comp_employ.groupby('nace').apply(lambda x: x.sort_values('employee_fit', ascending = False)).reset_index(drop = True)
fit_comp_employ = fit_comp_employ.sort_values('employee_fit', ascending = False)
# fit_comp_sums is rebound here to a single total of employee_fit
fit_comp_sums = fit_comp_employ['employee_fit'].sum()

In [None]:
# Display the sorted nace rows
fit_comp_employ

In [None]:
# Seed one company per nace code. Row `count` of companies receives the nace,
# the region / region_code of the first fit_comp_employ row for that nace, and
# the nace's summed employee_fit as employee_points.
# Each value is written with a single positional .iloc[row, column] call: the
# chained form companies.iloc[count]['nace'] = ... assigns into an intermediate
# row object and is not guaranteed to reach the frame.
nace_col = companies.columns.get_loc('nace')
region_code_col = companies.columns.get_loc('region_code')
region_col = companies.columns.get_loc('region')
points_col = companies.columns.get_loc('employee_points')

count = 0
for i in pd.unique(fit_comp_employ['nace']):
    nace = i
    # Filter once per nace instead of once per looked-up column
    nace_rows = fit_comp_employ[fit_comp_employ['nace'] == i]
    emp_num = nace_rows['employee_fit'].sum()
    fylk = nace_rows['region'].iloc[0]
    fylk_id = nace_rows['region_code'].iloc[0]
    #print(i, emp_num, fylk)

    companies.iloc[count, nace_col] = nace
    companies.iloc[count, region_code_col] = fylk_id
    companies.iloc[count, region_col] = fylk
    companies.iloc[count, points_col] = emp_num

    # count ends up pointing at the first row that has not been filled
    count += 1

In [None]:
# If there now are rows without naces, we will start looping through them to start "stealing" employees
# We will do this by getting the list of which nace codes there are most companies of
#
# Each pass over the nace codes adds one more company per nace: the nace's
# employee_points are divided evenly by `loop` and the share is written to all
# existing companies of that nace and to the new row.
loop = 2
init_count = count
# NOTE(review): if `count` already points at the first unfilled row, this
# increment leaves that row without a nace — confirm whether it is intended.
count += 1
pain_threshold = 5

# Column positions for single-step positional writes (chained
# companies.iloc[count]['col'] = ... is not guaranteed to reach the frame)
nace_col = companies.columns.get_loc('nace')
region_code_col = companies.columns.get_loc('region_code')
region_col = companies.columns.get_loc('region')
points_col = companies.columns.get_loc('employee_points')

while count < comp_num - 1:
    print('Loop:', loop)
    created_this_pass = 0
    for i in pd.unique(fit_comp_employ['nace']):
        same_nace = companies['nace'] == i

        # We dont want to start dividing below pain_threshold employees per firm, really...
        if companies.loc[same_nace, 'employee_points'].iloc[0] > pain_threshold:
            # Get the employees on the nacecode, divide among loops
            employee_points_div = companies.loc[same_nace, 'employee_points'].sum() / loop
            #print(employee_points_div)

            # Assign the now sum on all of the existing companies
            companies.loc[same_nace, 'employee_points'] = employee_points_div

            # Get the loopnumber - 1 index on the region;
            # if we run out of regions to pick once, just pick the first
            nace_rows = fit_comp_employ[fit_comp_employ['nace'] == i]
            region_pos = loop - 1 if loop - 1 < len(nace_rows) else 0
            fylk = nace_rows['region'].iloc[region_pos]
            fylk_id = nace_rows['region_code'].iloc[region_pos]

            # Make a new company on the nace code
            companies.iloc[count, nace_col] = i
            companies.iloc[count, region_code_col] = fylk_id
            companies.iloc[count, region_col] = fylk
            companies.iloc[count, points_col] = employee_points_div
            created_this_pass += 1

            #print(count)
            #print(i)

            if count >= comp_num - 1:
                break
            count += 1

            # But if there are no firms to steal from, because all firms have pain_threshold or lower amount of employees, raise error
            if not len(companies[companies['employee_points'] > pain_threshold]):
                raise ValueError('No rows with enough employees to keep stealing.')

    # A pass that created nothing can never make progress (the check above is
    # only reached after a creation), so stop instead of looping forever.
    if not created_this_pass:
        raise ValueError('No rows with enough employees to keep stealing.')

    loop += 1
    #if loop > 2:
    #    break

In [None]:
# Rescale employee_points until the rounded total meets total_employed:
# first upwards while the total is too small, then downwards while too large.
# Note that this rebinds the notebook-level `accuracy`.
ratio_temp = 1
accuracy = 0.0000000000005
# If smaller increment up
# NOTE(review): ratio_temp keeps growing and is multiplied into values that
# were already scaled on the previous iteration, so the effective step
# compounds from one iteration to the next — confirm this is intended.
while companies['employee_points'].astype(float).round(0).sum() < total_employed:
    ratio_temp *= 1.0 + accuracy
    companies['employee_points'] = companies['employee_points'] * ratio_temp
    # Prints the rounded total on every iteration
    print(companies['employee_points'].astype(float).round(0).sum())

ratio_temp = 1   
# If greater increment down
while companies['employee_points'].astype(float).round(0).sum() > total_employed:
    ratio_temp *= 1.0 - accuracy
    companies['employee_points'] = companies['employee_points'] * ratio_temp
    print(companies['employee_points'].astype(float).round(0).sum())

In [None]:
# Rows that never received employee_points count as 0; everything is then
# rounded to whole employees and stored as int.
# Done as one assignment back to the column: fillna(..., inplace=True) on a
# selected column acts on an intermediate Series and does not update the frame
# under pandas Copy-on-Write.
companies['employee_points'] = (
    companies['employee_points']
    .astype(float)
    .fillna(0)
    .round(0)
    .astype(int)
)

In [None]:
# Add one employee to each row from top until we match our target again
count = 0
points_col = companies.columns.get_loc('employee_points')
while companies['employee_points'].sum() < total_employed:
    # Positional write in one step (no chained companies[...].iloc[...] += 1);
    # wrap around to the top when the shortfall exceeds the number of rows
    # instead of indexing past the last company.
    companies.iloc[count % len(companies), points_col] += 1
    count += 1

In [None]:
# Target total that employee_points is matched against
total_employed

In [None]:
# Total employee_points after the top-up
# (15258 is presumably the value seen on an earlier run — TODO confirm)
companies['employee_points'].sum()

In [None]:
# Display the companies frame with final employee_points
companies

In [None]:
# Number of people we will employ
employees_needed = companies['employee_points'].sum()

# Number of people with work percent
with_work_percent = len(population[population['work_percent'] > 0])

# Always start from an empty pick list, so `ids` is defined (and not left over
# from an earlier run) when nobody extra has to be picked.
ids = []
if employees_needed > with_work_percent:
    pick_non_wrkprcnt = int(employees_needed - with_work_percent)
    # We only want to pick those within working age from these
    candidates = population[(population['age'] > 16) & (population['age'] < 74)]
    ids = candidates[candidates['work_percent'].isnull()]['id'].tolist()
    random.shuffle(ids)
    # Take exactly the shortfall; stopping the slice at pick_non_wrkprcnt - 1
    # would leave the pool one person short of employees_needed.
    ids = ids[:pick_non_wrkprcnt]

# Ids of the people we will employ: the extra picks plus everyone whose
# work_percent is not null
ids = ids + population[~population['work_percent'].isnull()]['id'].tolist()
random.shuffle(ids)

In [None]:
# Display the population frame before people are assigned to companies
population

In [None]:
# Employing people: every company takes int(employee_points) ids off the end
# of the shuffled `ids` list (ids.pop() raises IndexError if the list runs dry).
assigned_work_ids = {}
for i, row in companies.iterrows():
    for employ in range(int(row['employee_points'])):
        curr_id = ids.pop()
        assigned_work_ids[curr_id] = row['work_id']

# Write all assignments in one vectorised step instead of scanning the whole
# population frame once per employee. People without an assignment keep the
# work_id they already had (NaN when the column is new).
if assigned_work_ids:
    new_work_ids = population['id'].map(assigned_work_ids)
    if 'work_id' in population.columns:
        new_work_ids = new_work_ids.where(new_work_ids.notna(), population['work_id'])
    population['work_id'] = new_work_ids

In [None]:
# People who work (work_id set), but have no work_percent
population[population['work_id'].notna() & population['work_percent'].isna()]

In [None]:
# People who work (work_id set), and have work_percent
population[population['work_id'].notna() & population['work_percent'].notna()]

In [None]:
# nace random according to general distribution

In [None]:
# Display the finished companies frame
companies

In [None]:
# Companies without employees

### MVP no.1

In [None]:
# income_2019 random according to mean of nace

In [None]:
# number of regions random between 1-all

# choose regions according to person-distribution in country

In [None]:
# organization_type according to distribution among companies

In [None]:
# Assign company employeepoints according to income, number of regions, organization_form, nace-kode 

# Divide points on active employees. Points to employee-ratio

# Number of employees to pick for each company

# Check that the total number of employees across companies matches the number of active employees

### Population continuation

In [None]:
# Assign region based on company

# Assign region to "non-active-workers" to random region based on distribution

In [None]:
# Pick random municipality based on resident region

In [None]:
# lost_workdays according to sex, region, nace

### Companies conclusion

### Write to dapla storage

In [None]:
# Write both frames with dp.write_pandas; the target paths carry the year and population size
dp.write_pandas(companies, f'/felles/mock_sysselsatte/companies_{curr_year}_{pop_num}', valuation='OPEN', state= 'OUTPUT')
dp.write_pandas(population, f'/felles/mock_sysselsatte/population_{curr_year}_{pop_num}', valuation='OPEN', state= 'OUTPUT')

In [None]:
# Show the target folder, presumably to confirm the two writes — TODO confirm what dp.show returns
dp.show('/felles/mock_sysselsatte/')

### Write to csvs in folder

In [None]:
# Semicolon-separated CSV in the working directory; utf-8-sig writes a BOM.
# The 'v001' suffix is hard-coded, so a re-run overwrites the same file.
companies.to_csv(f'companies_{curr_year}_v001.csv', sep = ';', encoding = 'utf-8-sig')

In [None]:
# Semicolon-separated CSV in the working directory; utf-8-sig writes a BOM.
# The 'v001' suffix is hard-coded, so a re-run overwrites the same file.
population.to_csv(f'population_{curr_year}_v001.csv', sep = ';', encoding = 'utf-8-sig')

### Wishlist

In [None]:
# Organisasjonstype

In [None]:
# Selskapets inntekt

In [None]:
# More than one company per nace

In [None]:
# Try to pick profession randomly from list according to nace

In [None]:
# Age according to distribution in regions

##### Simulate 8 timeperiods
- Person-id must be consistent from year to year
- Persons will:
    - Increase age
- Persons might:
    - Change jobs
    - Change job-status, become pensioner, get hired
    - Increase pay, rarely decrease
    - Be born, die
    - Migrate in or out
- Companies might:
    - Go bankrupt; new companies are started
    - Companies tend to hire more people over time
    - Companies might change form in certain cases?
- Checks:
    - Adjust tolerances
    - Percentage of working people still matches distribution
    - Number of people in each region still matches distribution
    - Age distribution matches populace
    - Total earnings increase match general progress