In [32]:
import pandas as pd
import math
import scipy
from sklearn import preprocessing
from scipy.stats import norm
from scipy.integrate import quad
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob

In [33]:
# Read input file
def readFile(directory, filename):
    """Load one headerless CSV time-series file into a DataFrame.

    Parameters
    ----------
    directory : str or None
        Folder expected to contain the file. When
        ``<directory>/<filename>.csv`` exists it is read from there;
        otherwise the lookup falls back to ``<filename>.csv`` relative to
        the current working directory (the only location this function
        previously consulted).
    filename : str or int
        File name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The file contents, read with ``header=None`` so the first line is
        treated as data and columns get default integer labels.
    """
    basename = '{0}.csv'.format(filename)
    filepath = basename
    if directory:
        candidate = os.path.join(directory, basename)
        # Prefer the explicit directory only when the file is really there,
        # so callers that already chdir'ed into the data folder (possibly
        # via a relative path) keep working unchanged.
        if os.path.isfile(candidate):
            filepath = candidate
    return pd.read_csv(filepath, header=None)

In [34]:
# Task 1a - Normalize each time series (row) to the range [-1, 1]
def normalizeDataFrame(df):
    """Min-max scale every row of ``df`` to [-1, 1] and return the transpose.

    Each row is scaled independently with its own minimum and maximum, so
    the row minimum maps to -1 and the row maximum to 1.

    A constant row (max == min) has no range to scale by; it is mapped to
    0.0, the midpoint of the target interval, instead of the NaN values a
    0/0 division would produce.

    Parameters
    ----------
    df : pandas.DataFrame
        One series per row.

    Returns
    -------
    pandas.DataFrame
        The scaled values, transposed: one series per *column*.
    """
    def _scale_row(row):
        low = row.min()
        span = row.max() - low
        if span == 0:
            # Constant series: avoid 0/0 and place it in the middle of [-1, 1].
            return pd.Series(0.0, index=row.index)
        return 2 * (row - low) / span - 1

    return df.apply(_scale_row, axis=1).T

In [35]:
# Task 1b - Quantize the normalized time series into 2*resolution Gaussian bands (default resolution 3)
def quantizeDataFrame(n_df, resolution=3):
    """Quantize values in [-1, 1] into ``2 * resolution`` Gaussian-sized bands.

    The interval [-1, 1] is first cut into ``2 * resolution`` equal slices.
    The length of band ``i`` is then made proportional to the area of a
    normal density (mean 0, standard deviation 0.25) over slice ``i``,
    rescaled so that all bands together span a length of 2. Bands are
    therefore narrow near -1 and 1 and wide around 0.

    Parameters
    ----------
    n_df : array-like
        Values to quantize (e.g. the output of the normalization step).
    resolution : int, default 3
        Half the number of bands; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Integer band indices produced by ``np.digitize`` with the same shape
        as ``n_df``. Inputs in [-1, 1] map to 1 .. ``2 * resolution``.
        Values below -1 map to 0.

    Raises
    ------
    ValueError
        If ``resolution`` is smaller than 1.
    """
    if resolution < 1:
        raise ValueError(
            'resolution must be a positive integer, got {0!r}'.format(resolution))

    x_min = -1.0
    x_max = 1.0
    mean = 0.0
    std = 0.25

    def normal_distribution_function(x):
        return scipy.stats.norm.pdf(x, mean, std)

    # Area under the density over the whole interval, used to rescale the
    # band lengths so they add up to (x_max - x_min) = 2.
    total_area, _ = quad(normal_distribution_function, x_min, x_max)

    # Running band boundaries, starting at the lower bound.
    band_edges = [x_min]
    for i in range(1, 2*resolution + 1):
        band_area, _ = quad(normal_distribution_function,
                            (i - resolution - 1)/resolution,
                            (i - resolution)/resolution)
        band_edges.append(2*band_area/total_area + band_edges[-1])

    # to account for boundary condition: with right=False the upper edge is
    # exclusive, so push it past 1.0 to keep the value 1.0 in the last band.
    band_edges[-1] = 1.01

    return np.digitize(n_df, bins=band_edges, right=False)

In [36]:
# Task 1c - Convert each time series into words (window length 3 and shift 2 by default) and write them to the <filename>.wrd file
def shiftAndWrite(digitize_df, filename, window=3, shift=2, out=None):
    """Slide a window over every column and write the resulting words.

    For each column ``i`` and each window start row ``j`` (0, shift,
    2*shift, ... while a full window still fits) one record is produced:

        <filename>,<i>,<j>,<word>

    where ``<word>`` is the concatenation of the ``window`` symbols
    ``digitize_df[j .. j+window-1, i]``. Records are joined with newlines
    (no trailing newline) and written in a single ``write`` call.

    Parameters
    ----------
    digitize_df : numpy.ndarray
        2-D array of symbols, indexed as ``[row, column]``.
    filename : str or int
        Identifier written as the first field of every record.
    window : int, default 3
        Number of consecutive symbols per word; must be at least 1.
    shift : int, default 2
        Step between consecutive window starts; must be at least 1.
    out : writable text stream, optional
        Destination for the records. When omitted, the module-level
        variable ``f`` is used, which is how this function has always been
        driven; passing the stream explicitly avoids that hidden dependency.

    Raises
    ------
    ValueError
        If ``window`` or ``shift`` is smaller than 1.
    NameError
        If ``out`` is omitted and no global ``f`` exists.
    """
    if window < 1 or shift < 1:
        raise ValueError(
            'window and shift must be positive integers, got '
            'window={0!r}, shift={1!r}'.format(window, shift))

    if out is None:
        # Backward compatibility: fall back to the global handle opened by
        # the driver cell.
        out = f

    m, n = digitize_df.shape
    word_list = []
    for i in range(n):
        for j in range(0, m - window + 1, shift):
            # NOTE: symbols are concatenated without a separator, so words
            # are only unambiguous while every symbol is a single digit.
            word = ''.join(str(digitize_df[j + k, i]) for k in range(window))
            word_list.append('{0},{1},{2},{3}'.format(filename, i, j, word))
    out.write('\n'.join(word_list))

In [37]:
# Task 1: Main
# Prompts for the data directory and the word parameters, then turns every
# <name>.csv file in that directory into a <name>.wrd file of words.

# Raw string: the example path contains backslashes that are not valid
# escape sequences; the printed text is unchanged.
print(r'Enter absolute directory path to data files  => C:\ Users\....... \ sample data\ Z')
directory = input()

# change the current directory to data files directory
# (the .wrd files are written next to the .csv files)
os.chdir(directory)
all_files = glob.glob('*.csv')
total_files = len(all_files)

print('enter window length')
window = int(input())  # e.g. 3
print('enter shift length')
shift = int(input())  # e.g. 2
print('enter resolution')
resolution = int(input())  # e.g. 3

# Iterate over the files actually found instead of assuming they are named
# 1.csv .. N.csv; for such numbered files the output is identical.
for csv_name in sorted(all_files):
    filename = os.path.splitext(csv_name)[0]
    # `f` is kept as the handle name because shiftAndWrite writes to it;
    # the with-block closes the file even if a step below raises.
    with open("{0}.wrd".format(filename), "w") as f:
        df = readFile(directory, filename)
        n_df = normalizeDataFrame(df)
        digitize_df = quantizeDataFrame(n_df, resolution)
        shiftAndWrite(digitize_df, filename, window, shift)

print('task1 completed')

Enter absolute directory path to data files  => C:\ Users\....... \ sample data\ Z
C:\Users\abhee\Desktop\asu\cse 515(mwdb)\project\phase1\sample_data\Z
enter window length
3
enter shift length
2
enter resolution
3
task1 completed
