## Step 0: importing libraries and functions


In [None]:
import pandas as pd
from os import path
from config import CSV_FOLDER
from data_parser import parse_main
from data_processor import get_tech_count


## Step 1: getting csv file parsed from djinni
    - by default, parse_main parses job vacancies from djinni.co for "python"
    - the base url as well as the language preset can be viewed and adjusted in the config file
    - the csv_file_name variable stores the output filename; for example, it can be the date of parsing, for archiving
    - after parsing is complete, the .csv file can be found in the folder "parsed_results"
    - the folder name can be adjusted by changing the value of CSV_FOLDER in the config file

In [None]:
# Execute data_parser.py inside the notebook session.
%run data_parser.py
# Let the top-level `await` below run on the asyncio event loop.
%autoawait asyncio

# File name handed to the parser; the loading cell later reads the same
# name from CSV_FOLDER.
csv_file_name = "test_output.csv"

# Parse the vacancies; the results are expected to land in a csv file
# with the name chosen above.
await parse_main(csv_file_name)


## Step 2: making dataframe from stored csv

    - the pandas dataframe is built according to the csv structure
    - the csv structure of the file depends on the dataclass and logic from the data_parser file
    - the folder where csv files are stored depends on the CSV_FOLDER variable from config
    - after getting the dataframe, it can be filtered by date or by the required years of experience (years_limit)
    - change the years_limit variable (numeric value) to set the years filter for vacancies
    


In [None]:
# Upper bound for the "experience" column; leave as None to keep every vacancy.
years_limit = None

# Read the csv from the configured folder and turn the "posted" column
# into real datetimes in the same step.
df = pd.read_csv(path.join(CSV_FOLDER, csv_file_name)).astype(
    {"posted": "datetime64[ns]"}
)

# Keep only the vacancies whose experience value does not exceed the limit.
if years_limit is not None:
    df = df.loc[df["experience"] <= years_limit]

# Print a summary of columns, dtypes and non-null counts.
df.info()


## Step 3: get most common tech names with count from dataframe

    - we get the count of techs in demand by combining descriptions from the dataframe and extracting the most used keywords
    - to configure which words shouldn't be in this list, we can edit the files in the custom_stopwords folder
    - since most of the jobs on djinni are posted in English or Ukrainian, EN and UA files are added by default
    - new stop words can be added either by editing existing files or by adding a new custom file
    - when a new stopwords file is added, its name should be added to the list in the config file
    - by editing the STOP_WORDS_FILES list in config, we can set which files are read when selecting the stopwords to use


In [None]:
%run data_processor.py

text_combined = " ".join(df["description"])
techs = get_tech_count(text_combined, 20)
techs


## Step 4: building bar with statistics from tech count results

    - plots will be built according to the year limit we set before, so this filter will be present in the title
    - we can also adjust the names of both columns we get from parsing techs


In [None]:
# Column names given to the two-column rows held in `techs`.
x_column = "Technology"
y_column = "Appearance"

tech_df = pd.DataFrame(techs, columns=[x_column, y_column])

# Draw one bar per row; the title echoes the years_limit filter in use.
tech_df.plot(
    kind="bar",
    x=x_column,
    y=y_column,
    title=(
        "Technologies in demand for python developer with filter "
        f"set to {years_limit}"
    ),
    figsize=(13, 10),
)
