In [6]:
import torch
import pandas as pd
import os
import numpy as np
from datetime import datetime,timedelta
from scipy import stats

In [7]:
# Directory that holds the input CSV; it is joined with file_name when the file is
# loaded in the main section. Left empty here -- fill it in before running.
folder_path=r''

In [8]:
# Name of the input CSV inside folder_path. Left empty here -- fill it in before running.
file_name=r''

# Custom Exception

In [9]:
class empty_df_exception(Exception):
    """Signals that a dataframe without any columns was passed to the numericalization step."""
class cat_to_num_exception(Exception):
    """Signals that converting a categorical column to numerical codes failed."""
class save_df_to_csv_exception(Exception):
    """Signals that writing the preprocessed dataframe to a CSV file failed."""

# Preprocess

### 1. import file

In [10]:
def import_files(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : path of the CSV file to read.

    Returns
    -------
    The parsed DataFrame. When reading fails for any reason the caught
    exception is printed and None is returned instead of propagating the
    error, so callers must be prepared for a None result.
    """
    try:
        loaded = pd.read_csv(file_path, low_memory=False)
    except Exception as read_error:
        # Best-effort contract: report the problem and fall through to None.
        print(read_error)
        return None
    return loaded

### 2. Return a list of Unnumerical Points

In [11]:
def get_index_positions(list_of_elems, element):
    """Return every index at which ``element`` occurs in ``list_of_elems``.

    The sequence is scanned with repeated ``.index(element, start)`` calls,
    restarting one position past the previous hit, until ``.index`` raises
    ValueError. The result is in ascending order and is empty when the
    element never occurs.
    """
    hits = []
    search_from = 0
    try:
        while True:
            found_at = list_of_elems.index(element, search_from)
            hits.append(found_at)
            # Resume right after the match so the same slot is not reported twice.
            search_from = found_at + 1
    except ValueError:
        # No further occurrence: the scan is complete.
        pass
    return hits

def nunmericalize_df_and_return_list_of_unumerical_cols(df,column_index_of_timestamp=0):
    """Cast every non-timestamp column to float in place and list the columns that cannot be cast.

    Parameters
    ----------
    df : dataframe to convert; successfully cast columns are replaced in ``df``.
    column_index_of_timestamp : position of the column that is skipped (default 0).

    Returns
    -------
    List of column names whose ``astype(float)`` conversion failed; those
    columns are left untouched.

    Raises
    ------
    empty_df_exception : if ``df`` has no columns.
    """
    names=df.columns.to_list()
    if not names:
        raise empty_df_exception('The input dataframe is empty!')
    unnumerical_cols=[]
    for position,name in enumerate(names):
        if position==column_index_of_timestamp:
            continue
        try:
            df[name]=df[name].astype(float)
        except Exception:
            # Any conversion failure marks the column as unnumerical. Catching
            # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
            # from being silently swallowed.
            unnumerical_cols.append(name)
    return unnumerical_cols

## After the first step (numericalizing every column of the dataframe) two kinds of missing
## values can remain: (1) standard missing values, i.e. NaN: these survive the float cast, so a
## column that contains NaN is NOT included in the unnumerical cols; (2) non-standard missing
## values: any non-numerical record that is not NaN is treated as this kind of missing value.
## Every column that contains such a record is included in the list of unnumerical_cols, and
## each row of those columns has to be inspected to find out which rows hold this kind of
## missing value.
def numericalize_df_and_return_list_of_unnumerical_points(df,unnumerical_cols,column_index_of_timestamp=0):
    """Locate every record that is not a usable number and return its position.

    Two passes are made:

    1. Non-standard missing points: in each column listed in
       ``unnumerical_cols`` every value is passed through ``float()``. Values
       that convert are stored back in ``df`` as floats; values that do not
       convert are left as they are and their row position is recorded.
    2. Standard missing points: every column of ``df`` (the timestamp column
       included) is scanned for NaN/None and those row positions are recorded.

    Parameters
    ----------
    df : dataframe to inspect; the columns in ``unnumerical_cols`` are rewritten in place.
    unnumerical_cols : names of the columns that could not be cast to float as a whole.
    column_index_of_timestamp : accepted for interface compatibility; not used here.

    Returns
    -------
    dict mapping column name -> sorted list of unique 0-based row positions
    (suitable for ``df.iloc``) of the unnumerical / missing records. Columns
    without such records do not appear.
    """
    unnumerical_dict={}

    ################## check for non-standard missing points #######################
    for name in unnumerical_cols:
        converted=[]
        for row_position,value in enumerate(df[name].to_list()):
            try:
                converted.append(float(value))
            except (TypeError,ValueError,OverflowError):
                # Not convertible: keep the raw value and remember where it sits.
                converted.append(value)
                unnumerical_dict.setdefault(name,[]).append(row_position)
        # Replace the whole column at once instead of chained per-cell assignment,
        # which may write to a temporary copy and never reach df.
        df[name]=pd.Series(converted,index=df.index,dtype=object)

    ################## check for standard missing points #######################
    for col_name in df.columns.to_list():
        nan_positions=np.flatnonzero(df[col_name].isnull().to_numpy()).tolist()
        if nan_positions:
            unnumerical_dict.setdefault(col_name,[]).extend(nan_positions)

    # A row can be reported by both passes (e.g. None); deduplicate and sort so
    # the result is deterministic.
    return {name:sorted(set(rows)) for name,rows in unnumerical_dict.items()}

def numericalize_df(df,column_index_of_timestamp=0):
    """Run both numericalization stages on ``df`` and return the unnumerical points.

    First the column-level stage is run; it reports the columns it could not
    cast to float. That list is then handed to the point-level stage, whose
    result (a dict keyed by column name) is returned unchanged.
    """
    failed_columns=nunmericalize_df_and_return_list_of_unumerical_cols(df,column_index_of_timestamp)
    return numericalize_df_and_return_list_of_unnumerical_points(df,failed_columns,column_index_of_timestamp)

### 2.0 get details about unnumerical data

In [12]:
class unnumerical_detail_pod:
    """Plain data holder describing the unnumerical records of one column.

    Attributes
    ----------
    unnumerical_rows_number : number of unnumerical rows in the column.
    unique_val_set : set of the distinct values found in those rows.
    """

    def __init__(self,row_number,unique_vals):
        # ``self`` was missing from the signature, so every instantiation failed.
        self.unnumerical_rows_number=row_number
        self.unique_val_set=unique_vals

    def __repr__(self):
        # Makes printing a dict of pods show their content rather than object addresses.
        return 'unnumerical_detail_pod(unnumerical_rows_number={!r}, unique_val_set={!r})'.format(
            self.unnumerical_rows_number,self.unique_val_set)

def get_details_about_unnumerical_data(df,unnumerical_dict):
    """Summarise the unnumerical records of each column.

    Parameters
    ----------
    df : dataframe the row positions refer to.
    unnumerical_dict : mapping of column name -> list of row positions.

    Returns
    -------
    dict mapping each column name in ``unnumerical_dict`` to an
    ``unnumerical_detail_pod`` holding the number of listed rows and the set
    of distinct values found at those rows.
    """
    column_order=df.columns.to_list()
    summary={}
    for col_name,row_positions in unnumerical_dict.items():
        col_position=column_order.index(col_name)
        offending_values=df.iloc[row_positions,col_position].to_list()
        summary[col_name]=unnumerical_detail_pod(len(row_positions),set(offending_values))
    return summary

### 2.1 Fill in missing points with a fixed value

### 2.2 Fill in missing points with artificial data (moving windows, etc.)

### 2.3. Categorical to Numerical

In [13]:
def convert_to_numerical(df,categorical_column_index):
    """Replace a categorical column of ``df`` with float codes, in place.

    Each distinct value gets the code 0.0, 1.0, 2.0, ... in order of first
    appearance in the column, so the encoding is reproducible between runs
    (iterating a set of strings, as before, is not). The column is replaced
    as a whole by a float column instead of being rewritten cell by cell.

    Parameters
    ----------
    df : dataframe to modify.
    categorical_column_index : integer position of the column to encode.

    Returns
    -------
    None; ``df`` is modified in place.

    Raises
    ------
    cat_to_num_exception : wraps any error raised during the conversion.
    """
    try:
        values=df.iloc[:,categorical_column_index].to_list()
        name_to_val={}
        for value in values:
            if value not in name_to_val:
                name_to_val[value]=float(len(name_to_val))
        encoded=[name_to_val[value] for value in values]
        # NOTE: the column is addressed by its label, so labels are assumed unique.
        df[df.columns[categorical_column_index]]=pd.Series(encoded,index=df.index,dtype=float)
    except Exception as e:
        raise cat_to_num_exception(str(e)) from e

### 3. Check if the timestamp column is continuous

In [14]:
def check_time_col_continuation(df,time_col_index,granularity_in_minutes,parse_time_format):
    """Find consecutive rows whose timestamps are not exactly one interval apart.

    Parameters
    ----------
    df : dataframe holding the timestamp column.
    time_col_index : integer position of the timestamp column.
    granularity_in_minutes : expected distance between consecutive rows, in minutes.
    parse_time_format : ``datetime.strptime`` format of the timestamp strings.

    Returns
    -------
    dict mapping ``(i-1, i)`` (0-based row positions) to the pair
    ``(previous, current)`` of parsed datetimes for every adjacent pair whose
    difference is not exactly the expected interval. Empty when the column is
    continuous or when ``df`` has no rows.
    """
    # Read the column as a plain list so rows are addressed by position; indexing
    # the Series with [i] is label-based and fails on a non-default index.
    raw_timestamps=df.iloc[:,time_col_index].to_list()
    gap_timestamp_pairs={}
    if not raw_timestamps:
        return gap_timestamp_pairs
    standard_time_interval=timedelta(minutes=granularity_in_minutes)
    previous=datetime.strptime(raw_timestamps[0],parse_time_format)
    for i in range(1,len(raw_timestamps)):
        current=datetime.strptime(raw_timestamps[i],parse_time_format)
        if current-previous!=standard_time_interval:
            gap_timestamp_pairs[(i-1,i)]=(previous,current)
        previous=current
    return gap_timestamp_pairs

### 4. Normalization

#### 4.1. z-score normalization

In [71]:
def z_score_normalization(df,time_col_index):
    """Z-score normalise every column of ``df`` except the timestamp column, in place.

    Each remaining column becomes ``(x - mean) / (std + 1e-7)``, with mean and
    std taken via ``np.mean`` / ``np.std`` over that column. The small constant
    keeps the division defined when a column has zero standard deviation.
    Nothing is returned.
    """
    features=df.drop(df.columns[time_col_index],axis=1)
    for label in features.columns:
        series=features[label]
        centre=np.mean(series)
        spread=np.std(series)
        df[label]=(series-centre)/(spread+1e-7) # in case spread==0

#### 4.2 min_max_normalization

In [79]:
def min_max_normalization(df,time_col_index):
    """Min-max normalise every column of ``df`` except the timestamp column, in place.

    Each remaining column becomes ``(x - min) / (max - min + 1e-7)``, with the
    extrema taken via ``np.min`` / ``np.max`` over that column. The small
    constant keeps the division defined when a column is constant. Nothing is
    returned.
    """
    features=df.drop(df.columns[time_col_index],axis=1)
    for label in features.columns:
        series=features[label]
        upper=np.max(series)
        lower=np.min(series)
        df[label]=(series-lower)/(upper-lower+1e-7) # in case upper==lower

### 5. Save preprocessed dataframe to a designated place.

In [16]:
def save_df_to_csv(df,save_path):
    """Write ``df`` to ``save_path`` as CSV without the index column.

    Raises
    ------
    save_df_to_csv_exception : carries the message of any error raised while writing.
    """
    try:
        df.to_csv(save_path,index=False)
    except Exception as write_error:
        message=str(write_error)
        raise save_df_to_csv_exception(message)

### main

Preprocessing has to be an interactive process: there is no general way of dealing with unnumerical records, so the user has to decide how to handle the unnumerical data, and that decision determines how the preprocessing proceeds. This script provides (1) the steps for preprocessing a CSV file and (2) some solutions to the different kinds of problems that can occur. Users are nevertheless encouraged to inject more steps/methods into this script to customize their own preprocessing.

In [62]:
## step one: get the df (read the CSV located at folder_path/file_name):
df=import_files(os.path.join(folder_path,file_name))

In [40]:
## step two: numericalize df and get the dict of unnumerical points
## (column name -> row indices); the default timestamp column index 0 is used:
unnumericalize_dict=numericalize_df(df)

In [41]:
## step three: get unnumerical details (per column: number of affected rows and their distinct values):
unnumerical_details=get_details_about_unnumerical_data(df,unnumericalize_dict)

In [42]:
## step four: check unnumerical details (an empty dict means no unnumerical or missing record was found):
print(unnumerical_details)

{}


In [74]:
## step five: deal with unnumerical details:

In [97]:
## step six: check continuation of timestamp column
## (column 0, 10-minute granularity, timestamps parsed with '%Y/%m/%d %H:%M');
## the result maps each pair of adjacent rows that breaks the interval to its two timestamps:
check_time_col_continuation(df,0,10,'%Y/%m/%d %H:%M')

{}

In [72]:
## step seven: normalization (z-score on every column except column 0; df is modified in place):
z_score_normalization(df,0)

In [77]:
## step eight: save file:
## NOTE(review): save_path is an absolute, machine-specific path -- adjust it before running elsewhere.
save_path=r'D:\past_repos\AIOT_AD\MSCRED\preprocessed_data\z_score_中山嘉明#6机汽机发电机.csv'
save_df_to_csv(df,save_path)