In [111]:
import csv
import getpass
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
# Directory (relative to the working directory) that is scanned for input CSV files.
main_dir = Path(r'data/csv_data')

In [112]:
# pathlib usage example: list the entries of the data directory.
# iterdir() yields entries in arbitrary order, so sort for a reproducible listing.
for file_name in sorted(main_dir.iterdir()):
    print(file_name)

data/csv_data/part1.csv
data/csv_data/part2.csv


In [113]:
# process csv data
def process_csv_data(file_path: str) -> list[dict]:
    """
    Read and process a CSV file containing user data, returning a list of dictionaries.

    This function reads a CSV file with columns for full name, gender, age, skills,
    company, salary, location (latitude and longitude), and registered date.
    It processes the data and returns a list of dictionaries, where each dictionary
    represents a user with the following keys:

    - 'name': The user's full name.
    - 'gender': The user's gender.
    - 'age': The user's age.
    - 'skills': A list of the user's skills.
    - 'company': The user's company.
    - 'salary': The user's salary.
    - 'location': A dictionary containing the user's location with keys 'lat' and 'lon'.
    - 'registered': The user's registered date.

    Parameters
    ----------
    file_path : str
        The file path to the CSV file containing the user data.

    Returns
    -------
    list[dict]
        A list of dictionaries, where each dictionary represents a user with the
        specified keys.

    Examples
    --------
    >>> data = process_csv_data('path/to/users.csv')
    >>> print(data[0])
    {
        'name': 'John Doe',
        'gender': 'Male',
        'age': '30',
        'skills': ['Python', 'Data Analysis', 'Machine Learning'],
        'company': 'XYZ Corporation',
        'salary': '$80,000',
        'location': {'lat': 40.7128, 'lon': -74.0060},
        'registered': '2022-01-15'
    }
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)
        data = []
        for row in reader:
            fullname,gender,age,skills,company,salary,location,registered_date = row
            
            skills = skills.split(';')
            lat = float(location.split(';')[0])
            lon = float(location.split(';')[1])
            
            data_dict = {
                'name': fullname,
                'gender': gender,
                'age': age,
                'skills': skills,
                'company': company,
                'salary': salary,
                'location': {
                    'lat': lat,
                    'lon': lon
                },
                'registered': registered_date
            }
            data.append(data_dict)
        return data

In [114]:
def save_json(data: list[dict], file_path: str) -> None:
    """
    Save a list of dictionaries as JSON data to a file.

    This function serializes the input list of dictionaries into a JSON formatted
    string and writes it to the specified file path with an indentation of 4 spaces
    for better readability.

    Parameters
    ----------
    data : list[dict]
        A list of dictionaries to be saved as JSON data.
    file_path : str
        The file path where the JSON data will be saved.

    Returns
    -------
    None

    Examples
    --------
    >>> data = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}]
    >>> save_json(data, "example.json")
    # The JSON data will be saved to the "example.json" file with the following content:
    # [
    #     {"name": "John", "age": 30},
    #     {"name": "Jane", "age": 25}
    # ]
    """
    with open(file_path, "w") as file:
        json.dump(data, file, indent=4)

In [115]:
# MAIN BODY
# Collect the user records from every CSV file in the data directory into one
# flat list and save that list as a single JSON file.
main_dir = Path(r"data/csv_data")
data = []
# Only *.csv entries are processed (other entries in the directory would not be
# parseable), and they are sorted because glob() order is arbitrary.
for file in sorted(main_dir.glob("*.csv")):
    # extend, not append: process_csv_data returns a list of records, and
    # `data` must stay a flat list[dict] rather than a list of per-file lists.
    data.extend(process_csv_data(file))

save_json(data, "data/data.json")

## Elasticsearch

In [116]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

In [117]:
def read_json(path) -> dict | list[dict]:
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [118]:
def create_index(agent: Elasticsearch, index_name: str, settings):
    """
    Create an index through the given Elasticsearch client.

    Parameters
    ----------
    agent : Elasticsearch
        Client used to issue the create-index request.
    index_name : str
        Name of the index to create.
    settings
        Passed unchanged as the request ``body``; presumably the index
        settings/mappings definition (confirm against the caller).

    Returns
    -------
    None
        The client's response is not returned.
    """
    index_api = agent.indices
    index_api.create(body=settings, index=index_name)
    


In [119]:
def upload(index, data):
    """
    Lazily wrap each item of ``data`` in an action dictionary.

    Parameters
    ----------
    index
        Value stored under the '_index' key of every action.
    data
        Iterable of items; each one becomes the '_source' of one action.

    Yields
    ------
    dict
        A dictionary with the keys '_index' and '_source', one per item,
        in the order the items appear in ``data``.
    """
    for document in data:
        action = dict([('_index', index), ('_source', document)])
        yield action

In [122]:
# Connect to Elasticsearch and bulk-upload the records stored in data/data.json.
#
# Credentials are read from the ES_LOGIN / ES_PASSWORD environment variables,
# with an interactive prompt as a fallback, so that no secret is stored in the
# notebook itself.
# NOTE(review): a login and password used to be hardcoded in this cell in plain
# text; treat that password as compromised and rotate it.
login = os.environ.get('ES_LOGIN') or input('Elasticsearch login: ')
password = os.environ.get('ES_PASSWORD') or getpass.getpass('Elasticsearch password: ')
index = 'jobhunt_34'
url = 'https://pluton.mephi.ru/elk-e'
es = Elasticsearch(
    url,
    basic_auth=(login, password)
)
# Quick connectivity check: prints the cluster information returned by the server.
print(es.info())
# One-time setup, intentionally left disabled: uncomment the two lines below to
# create the index from data/mapping.json before the first upload.
#settings = read_json('data/mapping.json')
#create_index(es, index, settings)
data = read_json('data/data.json')
# NOTE(review): the actions produced by upload() carry no '_id', so re-running
# this cell presumably indexes the same records again - confirm before re-running.
bulk(es, upload(index=index, data=data))

{'name': '89260c7982e1', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'rM0DcZKtRWWya4Jr4Gw9TA', 'version': {'number': '7.17.7', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '78dcaaa8cee33438b91eca7f5c7f56a70fec9e80', 'build_date': '2022-10-17T15:29:54.167373105Z', 'build_snapshot': False, 'lucene_version': '8.11.1', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


(493, [])