In [None]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import json
import pickle
from collections import defaultdict
import dill

# Get co-occurrences from raw Adzuna data

In [None]:
def _parse_skills(raw_skills):
    """Normalise an advert's raw ``skills`` value into a list of skill names.

    Two encodings are accepted:

    * a bracketed, ``', '``-separated string such as ``'[python, sql]'``;
    * a list/tuple of strings (presumably the encoding used from 04/2022
      onward -- TODO confirm against the raw data).

    Returns ``None`` for anything else (missing field, ``None``, entries that
    are not strings) so the caller can skip the advert.
    """
    if isinstance(raw_skills, str):
        # Drop the surrounding brackets, then split on the ', ' separator.
        return raw_skills[1:-1].split(', ')
    if isinstance(raw_skills, (list, tuple)):
        if all(isinstance(skill, str) for skill in raw_skills):
            return list(raw_skills)
    return None


def find_cooccurrence(years, months, base_dir='/Volumes/Elements'):
    """Count pairwise skill co-occurrences over de-duplicated job adverts.

    Every file matching ``<base_dir>/year=<year>/month=<month>/*/*`` is read
    as JSON Lines (one JSON object per line). An advert is counted only the
    first time its ``'id'`` is seen within this call. For each counted advert,
    every unordered pair of entries in its skills list increments the
    symmetric count matrix in both directions.

    Parameters
    ----------
    years : iterable of str
        Year values used to build the ``year=<year>`` directory names.
    months : iterable of str
        Month values used to build the ``month=<month>`` directory names.
    base_dir : str, optional
        Root directory that contains the ``year=...`` folders.

    Returns
    -------
    defaultdict of defaultdict of int
        ``counts[skill_a][skill_b]`` is the number of counted adverts in which
        both skills appear; ``counts[a][b] == counts[b][a]``.

    Raises
    ------
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a record has no ``'id'`` field.

    Notes
    -----
    Adverts whose ``'skills'`` field is missing or unusable are skipped, but
    their id is still remembered for de-duplication.
    """
    # ids of adverts already counted, used for de-duplication
    seen_ids = set()
    # symmetric co-occurrence matrix stored as nested dictionaries
    co_occurrence_counts = defaultdict(lambda: defaultdict(int))

    for year in years:
        for month in months:
            month_dir = base_dir + '/year=' + year + '/month=' + month
            day_dirs = glob(month_dir + '/*')
            for day_dir in tqdm(day_dirs):
                for path in glob(day_dir + '/*'):
                    # `with` guarantees the handle is closed, even on error.
                    with open(path, 'r') as handle:
                        for line in handle:
                            record = json.loads(line)

                            # de-duplicate on the advert id
                            job_id = record['id']
                            if job_id in seen_ids:
                                continue
                            seen_ids.add(job_id)

                            # skip adverts without a usable skills list
                            skills = _parse_skills(record.get('skills'))
                            if skills is None:
                                continue

                            # record every unordered pair in both directions
                            for first in range(len(skills)):
                                for second in range(first + 1, len(skills)):
                                    skill1, skill2 = skills[first], skills[second]
                                    co_occurrence_counts[skill1][skill2] += 1
                                    co_occurrence_counts[skill2][skill1] += 1

    return co_occurrence_counts

In [None]:
# Co-occurrence counts for all twelve months of 2016, 2018 and 2020.
co_dict161820all = find_cooccurrence(
    years=['2016', '2018', '2020'],
    months=[str(month_number) for month_number in range(1, 13)],
)

In [None]:
# Co-occurrence counts for January to March 2022.
co_dict22_1to3 = find_cooccurrence(
    years=['2022'],
    months=[str(month_number) for month_number in range(1, 4)],
)

In [None]:
# Co-occurrence counts for April to December 2022.
# NOTE(review): adverts from 04/2022 onward reportedly store `skills` with a
# different dtype -- verify find_cooccurrence parses them before trusting
# these counts.
co_dict22_4to12 = find_cooccurrence(
    years=['2022'],
    months=[str(month_number) for month_number in range(4, 13)],
)

In [None]:
def combine_defaultdict(co_dict1, co_dict2):
    """Return a new nested count dictionary holding the sum of both inputs.

    Parameters
    ----------
    co_dict1, co_dict2 : mapping of mapping of int
        Nested count dictionaries of the form ``{outer: {inner: count}}``.

    Returns
    -------
    defaultdict of defaultdict of int
        ``result[outer][inner]`` is the sum of the counts found for that pair
        in ``co_dict1`` and ``co_dict2``. Neither input is modified. An outer
        key whose inner mapping is empty does not appear in the result.
    """
    merged = defaultdict(lambda: defaultdict(int))

    # Fold each source into the accumulator in turn; missing entries start at 0.
    for source in (co_dict1, co_dict2):
        for outer_key, inner_counts in source.items():
            for inner_key, value in inner_counts.items():
                merged[outer_key][inner_key] += value

    return merged

In [None]:
# Merge the two 2022 partial results into a single dictionary for that year.
co_dict2022 = combine_defaultdict(
    co_dict1=co_dict22_1to3,
    co_dict2=co_dict22_4to12,
)
# Add the 2016/2018/2020 counts to obtain the totals over all loaded periods.
co_dict_combined = combine_defaultdict(
    co_dict1=co_dict2022,
    co_dict2=co_dict161820all,
)

## Convert to a pandas DataFrame

In [None]:
# Collect every outer key of the combined co-occurrence dictionary.
unique_elements = {outer_key for outer_key in co_dict_combined}

In [None]:
# Fix a deterministic label order: the iteration order of a set of strings can
# differ between kernel sessions, which would shuffle rows/columns run to run.
unique_elements = sorted(unique_elements)

# Start from an all-zero integer matrix so that pairs which never co-occur are
# already 0 and the frame never goes through an object-dtype / NaN state.
df = pd.DataFrame(0, index=unique_elements, columns=unique_elements, dtype='int64')

# Copy every count into both [row, col] and [col, row] so the matrix is symmetric.
for element1, inner_dict in co_dict_combined.items():
    for element2, count in inner_dict.items():
        df.at[element1, element2] = count
        df.at[element2, element1] = count
        df.at[element2, element1] = count

In [None]:
# Pairs that never co-occur have no entry; treat them as a count of 0.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run, and the
# explicit cast pins an integer dtype rather than relying on fillna to
# downcast an object-dtype frame.
df = df.fillna(0).astype('int64')