In [173]:
import datatable as dt
import redis
import pandas as pd
import json
import random
from datetime import datetime
from redis.commands.json.path import Path
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redisearch import TextField, NumericField, TagField, Query

### Connect to Redis

In [2]:
r = redis.Redis(host='localhost', port=6379, decode_responses=True)

### Load the data into a DataTable object

In [175]:
data = dt.fread("data/my_data.csv")

### Convert DataTable to a list of dictionaries

In [171]:
df = data.to_pandas()

df = df.head(100)

df = df.drop(columns=['C0'])

df[:5]

Unnamed: 0,key,company_name,cvr_no,industry_designation_primary
0,1,Company 1,78349416,Industry 26
1,2,Company 2,28019879,Industry 9
2,3,Company 3,20305018,Industry 33
3,4,Company 4,36578982,Industry 33
4,5,Company 5,58284639,Industry 25


In [None]:
df.industry_designation_primary.tolist()

### Upload the data to Redis

In [12]:
for _, row in df.iterrows():
    row_id = row['key']  # Assuming 'key' column contains the unique identifier
    row_dict = row.drop('key').to_dict()
    
    # Convert Timestamp values to string
    for key, value in row_dict.items():
        if isinstance(value, pd.Timestamp):
            row_dict[key] = value.strftime('%Y-%m-%d %H:%M:%S')
    
    r.hset(row_id, mapping=row_dict)

print("Data uploaded to Redis successfully.")

Data uploaded to Redis successfully.


### Test the data retrieval from Redis

In [None]:
# Test the data retrieval from Redis
keys = r.keys("*")
for key in keys:
    values = r.hgetall(key)
    print(f"Key: {key}, Values: {values}")

In [153]:
# df.head()
df = df[["key","cvr_no", "company_name","industry_designation_primary"]]
df.head()

Unnamed: 0,key,cvr_no,company_name,industry_designation_primary
0,2,41201134,GRASS DK ApS,Engroshandel med kemiske produkter
1,3,40317430,Bomholt Malerfirma ApS,Malerforretninger
2,4,33047754,"BO ANDERSEN HOLDING, SVENSTRUP ApS",Ikke-finansielle holdingselskaber
3,5,41026162,Fyenbo Holding ApS,Ikke-finansielle holdingselskaber
4,7,37537535,Kjeld Johannesen Holding ApS,Ikke-finansielle holdingselskaber


In [154]:
trans_key = df['key'].astype('str')
df['key'] = trans_key
df.head()

Unnamed: 0,key,cvr_no,company_name,industry_designation_primary
0,2,41201134,GRASS DK ApS,Engroshandel med kemiske produkter
1,3,40317430,Bomholt Malerfirma ApS,Malerforretninger
2,4,33047754,"BO ANDERSEN HOLDING, SVENSTRUP ApS",Ikke-finansielle holdingselskaber
3,5,41026162,Fyenbo Holding ApS,Ikke-finansielle holdingselskaber
4,7,37537535,Kjeld Johannesen Holding ApS,Ikke-finansielle holdingselskaber


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   key                           100 non-null    object
 1   cvr_no                        100 non-null    int32 
 2   company_name                  100 non-null    object
 3   industry_designation_primary  100 non-null    object
dtypes: int32(1), object(3)
memory usage: 2.9+ KB


### Define a schema

In [128]:
schema = (
    TextField("$.key", as_name="key"),
    NumericField("$.cvr_no", as_name="cvr_no"),
    TextField("$.company_name", as_name="company_name"),
    TextField("$.industry_designation_primary", as_name="industry_designation_primary")
)

### Set index name

In [129]:
index_name = "idx:cleaned_data"

### Create an index

In [130]:
rs = r.ft(index_name)
rs.create_index(
    schema,
    definition=IndexDefinition(
        prefix=["cleaned_data:"], index_type=IndexType.JSON
    )
)

'OK'

### Upload data to Redis and index it

In [131]:
for _, row in df.iterrows():
    row_id = row['key']
    row_dict = row.drop('key').to_dict()

    r.json().set(f"cleaned_data:{row_id}", Path.root_path(), row_dict)
    
print("Data stored and indexed successfully.")

Data stored and indexed successfully.


### Perform a search query

In [143]:
result = r.json().get('cleaned_data:2')
print(result)

{'cvr_no': 41201134, 'company_name': 'GRASS DK ApS', 'industry_designation_primary': 'Engroshandel med kemiske produkter'}


### Generate dummy data

In [169]:
# Define the number of dummy rows to generate
num_rows = 100

# Create a list of random values for each column
keys = range(1, num_rows+1)
company_names = [f"Company {i}" for i in range(1, num_rows+1)]
cvr_nos = random.sample(range(10000000, 99999999), num_rows)
industry_designations = [f"Industry {random.randint(1, 36)}" for _ in range(num_rows)]

# Create a DataFrame from the generated data
df = pd.DataFrame({
    'key': keys,
    'company_name': company_names,
    'cvr_no': cvr_nos,
    'industry_designation_primary': industry_designations
})

# Display the generated dummy data
df

# Save the DataFrame as a CSV file
df.to_csv('data/my_data.csv')
