In [1]:
pip install modin[ray] ray dask[dataframe]



In [2]:
import time
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import ray

In [3]:
# Input CSV used by the pandas / Modin / Dask timing cells below.
# NOTE(review): this is an absolute path, while the config-driven read further
# down loads "./measures_v2.csv" relative to the working directory -- confirm
# both point at the same file.
file_path = "/content/measures_v2.csv"

In [4]:
# Time an eager pandas read of the whole CSV.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
df = pd.read_csv(file_path)
end = time.perf_counter()

print("Reading the file with pandas took", (end-start), "seconds")

Reading the file with pandas took 4.035475969314575 seconds


In [5]:
# Time a Modin read of the same CSV using the Ray engine.
# ignore_reinit_error=True keeps this cell re-runnable: a bare ray.init()
# raises once Ray has already been initialised in this kernel.
# Ray start-up is deliberately kept outside the timed region.
ray.init(ignore_reinit_error=True)
start = time.perf_counter()
df = mpd.read_csv(file_path)
end = time.perf_counter()

print("Reading the file with modin & ray took", (end-start), "seconds")

2024-10-11 10:36:33,463	INFO worker.py:1786 -- Started a local Ray instance.


Reading the file with modin & ray took 15.348858118057251 seconds


In [6]:
# Time dd.read_csv on the same CSV.
# NOTE: Dask is lazy -- read_csv only samples the file and builds a task
# graph; the rows are not loaded until something like .compute() runs. The
# figure printed here is therefore graph-construction time, not a full read,
# and is not directly comparable with the eager pandas timing.
start = time.perf_counter()
df = dd.read_csv(file_path)
end = time.perf_counter()

print("Reading the file with dask took", (end-start), "seconds")

Reading the file with dask took 0.016909122467041016 seconds


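**Reading the file with dask took the least amount of time.** Note, however, that `dd.read_csv` is lazy: it only samples the file and builds a task graph without loading the rows, so this figure is not directly comparable to the pandas timing, which reads the entire file into memory.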

In [7]:
cols = df.columns
cols

Index(['u_q', 'coolant', 'stator_winding', 'u_d', 'stator_tooth',
       'motor_speed', 'i_d', 'i_q', 'pm', 'stator_yoke', 'ambient', 'torque',
       'profile_id'],
      dtype='object')

In [8]:
%%writefile testutility.py

import os
import logging
import subprocess
import yaml
import pandas as pd
import datetime
import gc
import re

#Reading the File

def read_config_file(filepath):
  """
  Load the YAML configuration stored at `filepath`.

  Returns the object produced by yaml.safe_load. If the content is not
  valid YAML, the error is logged and None is returned.
  """
  with open(filepath, 'r') as stream:
    try:
      config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
      logging.error(exc)
      config = None
  return config

def replacer(string, char):
    """
    Collapse every run of two or more consecutive `char` into a single one.

    `char` is matched literally, so regex metacharacters such as '.' or '*'
    are safe to pass.

    Args:
        string: text to clean up.
        char: the character (or substring) whose repeats are collapsed.

    Returns:
        `string` with each repeated run of `char` replaced by one `char`.
    """
    if not char:
        # An empty pattern has nothing to collapse.
        return string
    # Escape so `char` is not interpreted as regex syntax; the group makes the
    # quantifier apply to the whole of `char`, not just its last character.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement inserts `char` verbatim (no backslash processing).
    return re.sub(pattern, lambda _match: char, string)

def col_name_val(df, table_config):
    """
    Standardize the dataframe's column names and validate them against the
    expected column names in the configuration.

    Each column name is lower-cased, every non-word character is replaced
    by '_', leading/trailing '_' are stripped and runs of '_' are collapsed.
    The standardized names are assigned back to `df.columns`, so the
    caller's dataframe is modified in place. The comparison with the
    expected names is case-insensitive and ignores column order.

    Args:
        df: dataframe whose columns are standardized and checked.
        table_config: mapping with a 'columns' entry listing the expected
            column names.

    Returns:
        1 if the standardized names match the expected names exactly,
        otherwise 0 (the differences are printed and both lists are logged).
    """
    # Raw-string patterns: '\w' in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    df.columns = (
        df.columns.str.lower()
        .str.replace(r'[^\w]', '_', regex=True)
        .str.strip('_')
        .str.replace(r'_{2,}', '_', regex=True)
    )

    # Compare sorted name lists instead of reindexing the dataframe: this
    # avoids copying the data and does not raise when two columns
    # standardize to the same name.
    expected_col = sorted(x.lower() for x in table_config['columns'])
    actual_col = sorted(df.columns)

    if actual_col == expected_col:
        print("Column name and length validation passed")
        return 1

    print("Column name and length validation failed")
    # Sorted so the reported differences are stable between runs.
    mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
    print("The following file columns are not in the YAML file", mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_col))
    print("The following YAML columns are not in the file uploaded", missing_YAML_file)
    logging.info(f'dataframe columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0


Overwriting testutility.py


In [9]:
%%writefile file.yaml
# Ingestion settings for the measures_v2 dataset.
# file_name + "." + file_type form the source path used by the read cell.
file_type: csv
# NOTE(review): dataset_name and table_name are not referenced by the visible notebook cells.
dataset_name: measures_v2
file_name: measures_v2
table_name: edsurv
# Delimiter passed to pd.read_csv when loading the source file.
inbound_delimiter: ","
# Delimiter for the exported file (the export cell writes with '|').
outbound_delimiter: "|"
# NOTE(review): skip_leading_rows is not referenced by the visible notebook cells.
skip_leading_rows: 1
# Expected column names; col_name_val compares them case-insensitively and
# regardless of order against the standardized dataframe columns.
columns:
    - u_q
    - coolant
    - stator_winding
    - u_d
    - stator_tooth
    - motor_speed
    - i_d
    - i_q
    - pm
    - stator_yoke
    - ambient
    - torque
    - profile_id


Overwriting file.yaml


In [10]:
#Reading the config file
import importlib
import testutility as util

# testutility.py is (re)written by a %%writefile cell; a plain import is cached
# after the first run, so reload to pick up the file's current content.
util = importlib.reload(util)

# file.yaml is written by %%writefile relative to the working directory, so read
# it from there rather than from a hard-coded absolute path.
config_data = util.read_config_file('file.yaml')

In [11]:
config_data

{'file_type': 'csv',
 'dataset_name': 'measures_v2',
 'file_name': 'measures_v2',
 'table_name': 'edsurv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['u_q',
  'coolant',
  'stator_winding',
  'u_d',
  'stator_tooth',
  'motor_speed',
  'i_d',
  'i_q',
  'pm',
  'stator_yoke',
  'ambient',
  'torque',
  'profile_id']}

In [12]:
# Load the dataset described by the config: "./<file_name>.<file_type>",
# parsed with the configured inbound delimiter.
file_type = config_data['file_type']
source_file = f"./{config_data['file_name']}.{file_type}"
df = pd.read_csv(source_file, delimiter=config_data['inbound_delimiter'])
df.head()

Unnamed: 0,u_q,coolant,stator_winding,u_d,stator_tooth,motor_speed,i_d,i_q,pm,stator_yoke,ambient,torque,profile_id
0,-0.450682,18.805172,19.08667,-0.350055,18.293219,0.002866,0.004419,0.000328,24.554214,18.316547,19.850691,0.187101,17
1,-0.325737,18.818571,19.09239,-0.305803,18.294807,0.000257,0.000606,-0.000785,24.538078,18.314955,19.850672,0.245417,17
2,-0.440864,18.82877,19.08938,-0.372503,18.294094,0.002355,0.00129,0.000386,24.544693,18.326307,19.850657,0.176615,17
3,-0.327026,18.835567,19.083031,-0.316199,18.292542,0.006105,2.6e-05,0.002046,24.554018,18.330833,19.850647,0.238303,17
4,-0.47115,18.857033,19.082525,-0.332272,18.291428,0.003133,-0.064317,0.037184,24.565397,18.326662,19.850639,0.208197,17


In [13]:
#File validation
util.col_name_val(df, config_data)

Column name and length validation passed


1

In [14]:
# Sample frame for exercising the column validation: several of the headers
# below use spaces and mixed case instead of the snake_case names in the config.
test_data = {
    'u_q': [-0.807, -0.9775, -0.80075, -0.0098],
    'coolant': [18.095, 18.9547, 18.0593, 18.0473],
    'stator winding': [19.067, 19.042, 19.054, 19.0647],
    'u d': [-0.007, -0.575, -0.00075, -0.0098],
    'stator tooth': [18.67, 18.42, 17.54, 18.647],
    'Motor Speed': [0.00032, 0.0000984, 0.032, 0.2044],
    'i d': [0.0002, 0.000044, 0.00032, 0.244],
    'i q': [0.003, 0.00045, 0.8656, 0.4422],
    'pm': [30.67, 30.42, 59.54, 19.647],
    'Stator yoke': [19.67, 19.42, 19.54, 19.647],
    'Ambient': [19.67, 19.42, 19.54, 19.647],
    'Torque': [0.45, 0.22, 0.24, 0.45],
    'Profile ID': [17, 17, 24, 77],
}

# The dict's insertion order already gives the intended column order.
test_df = pd.DataFrame(test_data)

In [15]:
test_df.head()

Unnamed: 0,u_q,coolant,stator winding,u d,stator tooth,Motor Speed,i d,i q,pm,Stator yoke,Ambient,Torque,Profile ID
0,-0.807,18.095,19.067,-0.007,18.67,0.00032,0.0002,0.003,30.67,19.67,19.67,0.45,17
1,-0.9775,18.9547,19.042,-0.575,18.42,9.8e-05,4.4e-05,0.00045,30.42,19.42,19.42,0.22,17
2,-0.80075,18.0593,19.054,-0.00075,17.54,0.032,0.00032,0.8656,59.54,19.54,19.54,0.24,24
3,-0.0098,18.0473,19.0647,-0.0098,18.647,0.2044,0.244,0.4422,19.647,19.647,19.647,0.45,77


In [16]:
#validating test_data.csv
util.col_name_val(test_df, config_data)

Column name and length validation passed


1

In [17]:
# Second sample frame: only six of the configured columns plus an extra 'time'
# column, i.e. a frame that does not match the configured column list.
new_test_data = {
    'u_q': [-0.807, -0.9775, -0.80075, -0.0098],
    'coolant': [18.095, 18.9547, 18.0593, 18.0473],
    'stator winding': [19.067, 19.042, 19.054, 19.0647],
    'u d': [-0.007, -0.575, -0.00075, -0.0098],
    'stator tooth': [18.67, 18.42, 17.54, 18.647],
    'Motor Speed': [0.00032, 0.0000984, 0.032, 0.2044],
    'time': [0.00032, 0.0000984, 0.032, 0.2044],
}

# The dict's insertion order already gives the intended column order.
new_test_df = pd.DataFrame(new_test_data)

In [18]:
util.col_name_val(new_test_df, config_data)

Column name and length validation failed
The following file columns are not in the YAML file ['time']
The following YAML columns are not in the file uploaded ['stator_yoke', 'pm', 'profile_id', 'ambient', 'i_d', 'i_q', 'torque']


0

**Writing the file as a pipe-separated text file in gzip format**

In [19]:
df.head()

Unnamed: 0,u_q,coolant,stator_winding,u_d,stator_tooth,motor_speed,i_d,i_q,pm,stator_yoke,ambient,torque,profile_id
0,-0.450682,18.805172,19.08667,-0.350055,18.293219,0.002866,0.004419,0.000328,24.554214,18.316547,19.850691,0.187101,17
1,-0.325737,18.818571,19.09239,-0.305803,18.294807,0.000257,0.000606,-0.000785,24.538078,18.314955,19.850672,0.245417,17
2,-0.440864,18.82877,19.08938,-0.372503,18.294094,0.002355,0.00129,0.000386,24.544693,18.326307,19.850657,0.176615,17
3,-0.327026,18.835567,19.083031,-0.316199,18.292542,0.006105,2.6e-05,0.002046,24.554018,18.330833,19.850647,0.238303,17
4,-0.47115,18.857033,19.082525,-0.332272,18.291428,0.003133,-0.064317,0.037184,24.565397,18.326662,19.850639,0.208197,17


In [20]:
# Export the frame as gzip-compressed, delimiter-separated text.
# Name and delimiter come from the config instead of being hard-coded, so the
# export stays in sync with file.yaml (currently 'measures_v2.gz' and '|').
output_file = config_data['file_name'] + '.gz'
df.to_csv(output_file, sep=config_data['outbound_delimiter'], index=False, compression='gzip')