-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
61 lines (45 loc) · 2.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
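"""
Script entry point.

When run directly, this module builds a Config and then calls, in order:
process_file, frequencies_wordcloud, summarize_data, all_strings_present
(row-wise), find_tools_in_duties and analyze_tool_discrepancies, and finally
writes grouped title counts to ../results/title_fields.csv.

Created on Thu Sep 14 20:02:05 2023
@author: abiga
"""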
from run_llm import process_file
from pull_historical import fetch_and_write_out_historical
from graphics_frequency_job_title import frequencies_wordcloud
from clustering import summarize_data
from test_ner import all_strings_present, analyze_tool_discrepancies
from ner_with_regex import find_tools_in_duties
from marvin import settings
class Config:
    """Settings bundle consumed by the script entry point of this file.

    All constructor arguments are keyword-only and optional; calling
    ``Config()`` with no arguments yields exactly the original hard-coded
    values, so existing callers are unaffected.

    Attributes:
        historical_file: Passed as the first argument to ``process_file``.
            Presumably a file name without extension -- TODO confirm against
            ``run_llm.process_file``.
        file_with_llm_markings: Passed as the second argument to
            ``process_file`` and as the first argument to
            ``frequencies_wordcloud`` and ``summarize_data``.
        wordcloud_name: Passed as the second argument to
            ``frequencies_wordcloud``.
        historical: A ``Config.HistoricalConfig`` instance.
        llm: A ``Config.LLMConfig`` instance.
    """

    def __init__(
        self,
        *,
        historical_file: str = "historical_joa",
        file_with_llm_markings: str = "final_aggregated_joa",
        wordcloud_name: str = "wordcloud",
        historical=None,
        llm=None,
    ):
        """Build the configuration, falling back to defaults for anything omitted.

        Args:
            historical_file: See the class attribute of the same name.
            file_with_llm_markings: See the class attribute of the same name.
            wordcloud_name: See the class attribute of the same name.
            historical: Optional pre-built ``HistoricalConfig``; a default
                one is created when this is ``None``.
            llm: Optional pre-built ``LLMConfig``; a default one is created
                when this is ``None``.
        """
        # General configuration
        self.historical_file = historical_file
        self.file_with_llm_markings = file_with_llm_markings
        self.wordcloud_name = wordcloud_name

        # Configuration for historical data.  A fresh sub-config is built per
        # Config instance so that instances never share mutable state.
        self.historical = self.HistoricalConfig() if historical is None else historical

        # Configuration for LLM processing
        self.llm = self.LLMConfig() if llm is None else llm

    class HistoricalConfig:
        """Date range for the historical data pull.

        Attributes:
            start_date: Start of the range, as a string (default "1/1/2022").
            end_date: End of the range, as a string (default "9/30/2023").
        """

        def __init__(self, *, start_date: str = "1/1/2022", end_date: str = "9/30/2023"):
            """Store the date range; both values default to the original constants."""
            self.start_date = start_date
            self.end_date = end_date

    class LLMConfig:
        """Options forwarded to ``process_file`` as same-named keyword arguments.

        The upper-case attribute names are kept because they mirror the
        keyword names ``process_file`` is called with.

        Attributes:
            BATCH_SIZE: Default 10.
            DIR_NAME: Default "batched_files".
            FILE_PREFIX: Default "batch_".
            sample_size: Default ``None``; forwarded unchanged as
                ``sample_size``.  Presumably ``None`` means "no sampling" --
                TODO confirm against ``run_llm.process_file``.
        """

        def __init__(
            self,
            *,
            BATCH_SIZE: int = 10,
            DIR_NAME: str = "batched_files",
            FILE_PREFIX: str = "batch_",
            sample_size=None,
        ):
            """Store the LLM options; all values default to the original constants."""
            self.BATCH_SIZE = BATCH_SIZE
            self.DIR_NAME = DIR_NAME
            self.FILE_PREFIX = FILE_PREFIX
            self.sample_size = sample_size
if __name__ == "__main__":
    # Local import: only the script entry point needs it (used to make sure
    # the output directory exists before the final CSV is written).
    import os

    # Print the imported marvin settings object at start-up.
    print(settings)
    config = Config()

    # The historical fetch step is currently disabled; un-comment to run it
    # with the configured date range and historical file name.
    #df_historical = fetch_and_write_out_historical(config.historical.start_date, config.historical.end_date, config.historical_file)

    # df_llm is presumably a pandas DataFrame (it is used below with
    # .apply(axis=1), .str, .groupby and .to_csv) -- TODO confirm in run_llm.
    df_llm = process_file(
        config.historical_file,
        config.file_with_llm_markings,
        sample_size=config.llm.sample_size,
        BATCH_SIZE=config.llm.BATCH_SIZE,
        DIR_NAME=config.llm.DIR_NAME,
        FILE_PREFIX=config.llm.FILE_PREFIX,
    )

    # this is now just doing occs unique to job_title, not any found in official ones
    frequencies_wordcloud(config.file_with_llm_markings, config.wordcloud_name)

    # The result is not used again in this script; the name is kept so it
    # stays available at module level after the run.
    top_titles_dict = summarize_data(config.file_with_llm_markings)

    # Row-wise check stored as a new column, followed by the two
    # DataFrame-in / DataFrame-out steps.
    df_llm['all_present'] = df_llm.apply(all_strings_present, axis=1)
    df_llm = find_tools_in_duties(df_llm)
    df_llm = analyze_tool_discrepancies(df_llm)

    # Title-case positionTitle before grouping on it.
    df_llm['positionTitle'] = df_llm['positionTitle'].str.title()

    # Create the results directory if it is missing: to_csv does not create
    # parent directories, and failing here would discard the whole run.
    output_csv = "../results/title_fields.csv"
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    df_llm.groupby(['job_title', 'positionTitle']).count().to_csv(output_csv)