In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import _pickle as cPickle
import numpy as np
import numpy



In [3]:
subject_collection = {} 
for i in range(1,33):
    if (i<10):
        with open('raw/s0'+str(i)+'.dat', 'rb') as f:
            x= cPickle.load(f, encoding='latin1')
            subject_collection[i-1]=x
    else:
        with open('raw/s'+str(i)+'.dat', 'rb') as f:
            x= cPickle.load(f, encoding='latin1')
            subject_collection[i-1]=x

In [4]:
xjk=subject_collection[0]
xjkX=xjk["data"]
np.shape(xjkX)

(40, 40, 8064)

In [5]:
fulldata={}
fulllabel={}
for i in range(32):
    x=subject_collection[i]
    X = x["data"]
    Y=x["labels"]
    fulldata[i]=X
    fulllabel[i]=Y
    

In [6]:
#here we are naming the columns for the eeg data that we will tanspose and we will name the columns 
sen_list = [("sen"+str(i)) for i in range(40)]

In [9]:
def dfa(X, Ave=None, L=None):
    """Compute Detrended Fluctuation Analysis from a time series X and length of
    boxes L.

    The first step to compute DFA is to integrate the signal. Let original
    series be X= [x(1), x(2), ..., x(N)].

    The integrated signal Y = [y(1), y(2), ..., y(N)] is obtained as follows
    y(k) = \sum_{i=1}^{k}{x(i)-Ave} where Ave is the mean of X.

    The second step is to partition/slice/segment the integrated sequence Y
    into boxes. At least two boxes are needed for computing DFA. Box sizes are
    specified by the L argument of this function. By default, it is from 1/5 of
    signal length to one (x-5)-th of the signal length, where x is the nearest
    power of 2 from the length of the signal, i.e., 1/16, 1/32, 1/64, 1/128,
    ...

    In each box, a linear least square fitting is employed on data in the box.
    Denote the series on fitted line as Yn. Its k-th elements, yn(k),
    corresponds to y(k).

    For fitting in each box, there is a residue, the sum of squares of all
    offsets, difference between actual points and points on fitted line.

    F(n) denotes the square root of average total residue in all boxes when box
    length is n, thus
    Total_Residue = \sum_{k=1}^{N}{(y(k)-yn(k))}
    F(n) = \sqrt(Total_Residue/N)

    The computing to F(n) is carried out for every box length n. Therefore, a
    relationship between n and F(n) can be obtained. In general, F(n) increases
    when n increases.

    Finally, the relationship between F(n) and n is analyzed. A least square
    fitting is performed between log(F(n)) and log(n). The slope of the fitting
    line is the DFA value, denoted as Alpha. To white noise, Alpha should be
    0.5. Higher level of signal complexity is related to higher Alpha.

    Parameters
    ----------

    X:
        1-D Python list or numpy array
        a time series

    Ave:
        integer, optional
        The average value of the time series

    L:
        1-D Python list of integers
        A list of box size, integers in ascending order

    Returns
    -------

    Alpha:
        integer
        the result of DFA analysis, thus the slope of fitting line of log(F(n))
        vs. log(n). where n is the

    Examples
    --------
    >>> import pyeeg
    >>> from numpy.random import randn
    >>> print(pyeeg.dfa(randn(4096)))
    0.490035110345

    Reference
    ---------
    Peng C-K, Havlin S, Stanley HE, Goldberger AL. Quantification of scaling
    exponents and crossover phenomena in nonstationary heartbeat time series.
    _Chaos_ 1995;5:82-87

    Notes
    -----

    This value depends on the box sizes very much. When the input is a white
    noise, this value should be 0.5. But, some choices on box sizes can lead to
    the value lower or higher than 0.5, e.g. 0.38 or 0.58.

    Based on many test, I set the box sizes from 1/5 of    signal length to one
    (x-5)-th of the signal length, where x is the nearest power of 2 from the
    length of the signal, i.e., 1/16, 1/32, 1/64, 1/128, ...

    You may generate a list of box sizes and pass in such a list as a
    parameter.

    """

    X = numpy.array(X)

    if Ave is None:
        Ave = numpy.mean(X)

    Y = numpy.cumsum(X)
    Y -= Ave

    if L is None:
        L = numpy.floor(len(X) * 1 / (
            2 ** numpy.array(list(range(4, int(numpy.log2(len(X))) - 4))))
        )

    F = numpy.zeros(len(L))  # F(n) of different given box length n

    for i in range(0, len(L)):
        n = int(L[i])                        # for each box length L[i]
        if n == 0:
            print("time series is too short while the box length is too big")
            print("abort")
            exit()
        for j in range(0, len(X), n):  # for each box
            if j + n < len(X):
                c = list(range(j, j + n))
                # coordinates of time in the box
                c = numpy.vstack([c, numpy.ones(n)]).T
                # the value of data in the box
                y = Y[j:j + n]
                # add residue in this box
                F[i] += numpy.linalg.lstsq(c, y)[1]
        F[i] /= ((len(X) / n) * n)
    F = numpy.sqrt(F)

    Alpha = numpy.linalg.lstsq(numpy.vstack(
        [numpy.log(L), numpy.ones(len(L))]
    ).T, numpy.log(F))[0][0]

    return Alpha

In [10]:
#getting the Pfd feature dataset 
for i in range(32):
    X=fulldata[i]
    dataframe_collection = {}
    #creating 40 dataframe for each subject
    for j in range(40):
        d=pd.DataFrame(X[j])
        dt=d.transpose()
        dt.columns=sen_list
        dataframe_collection[j] = dt
    #after running this loop dataset list will have 40 rows 
    dataset=[]
    for j in range(40):
        row=[]
        l=dataframe_collection[j]
        #row list will have 40 hurst feature for 40 sensors creating 1 row for the dataset
        for k in range(40):
            h=dfa(l[l.columns[k]])
            row.append(h)
        dataset.append(row)
    #now we will make a dataframe from the the dataset and assign it a column name 
    g=pd.DataFrame(dataset)
    Hurst_list = [("senDfa"+str(lg)) for lg in range(40)]
    g.columns=Hurst_list
    #as computing the features take a lot of time we save them to csv files :D 
    g.to_csv("features/s"+str(i)+"Dfa.csv", index=False)
    



In [6]:
from __future__ import print_function
import numpy


# ####################### Begin function definitions #######################

def hurst(X):
    """ Compute the Hurst exponent of X. If the output H=0.5,the behavior
    of the time-series is similar to random walk. If H<0.5, the time-series
    cover less "distance" than a random walk, vice verse.

    Parameters
    ----------

    X

        list

        a time series

    Returns
    -------
    H

        float

        Hurst exponent

    Notes
    --------
    Author of this function is Xin Liu

    Examples
    --------

    >>> import pyeeg
    >>> from numpy.random import randn
    >>> a = randn(4096)
    >>> pyeeg.hurst(a)
    0.5057444

    """
    X = numpy.array(X)
    N = X.size
    T = numpy.arange(1, N + 1)
    Y = numpy.cumsum(X)
    Ave_T = Y / T

    S_T = numpy.zeros(N)
    R_T = numpy.zeros(N)

    for i in range(N):
        S_T[i] = numpy.std(X[:i + 1])
        X_T = Y - T * Ave_T[i]
        R_T[i] = numpy.ptp(X_T[:i + 1])

    R_S = R_T / S_T
    R_S = numpy.log(R_S)[1:]
    n = numpy.log(T)[1:]
    A = numpy.column_stack((n, numpy.ones(n.size)))
    [m, c] = numpy.linalg.lstsq(A, R_S)[0]
    H = m
    return H

In [16]:
#getting the hurst data 
for i in range(32):
    X=fulldata[i]
    dataframe_collection = {}
    #creating 40 dataframe for each subject
    for j in range(40):
        d=pd.DataFrame(X[j])
        dt=d.transpose()
        dt.columns=sen_list
        dataframe_collection[j] = dt
    #after running this loop dataset list will have 40 rows 
    dataset=[]
    for j in range(40):
        row=[]
        l=dataframe_collection[j]
        #row list will have 40 hurst feature for 40 sensors creating 1 row for the dataset
        for k in range(40):
            h=hurst(l[l.columns[k]])
            row.append(h)
        dataset.append(row)
    #now we will make a dataframe from the the dataset and assign it a column name 
    g=pd.DataFrame(dataset)
    Hurst_list = [("senHurst"+str(lg)) for lg in range(40)]
    g.columns=Hurst_list
    #now to complete the dataset we also add the labels
    dfx=pd.DataFrame(Y)
    g.join(dfx)
    g.head()
    #as computing the features take a lot of time we save them to csv files :D 
    g.to_csv("features/s"+str(i)+"Hurst.csv", index=False)
    



In [7]:
yu=fulllabel[0]
gy=pd.DataFrame(yu)
Label_list = ["Valance","Arousal","Dominance","Liking"]
gy.columns=Label_list
gy

Unnamed: 0,Valance,Arousal,Dominance,Liking
0,7.71,7.6,6.9,7.83
1,8.1,7.31,7.28,8.47
2,8.58,7.54,9.0,7.08
3,4.94,6.01,6.12,8.06
4,6.96,3.92,7.19,6.05
5,8.27,3.92,7.0,8.03
6,7.44,3.73,7.08,7.04
7,7.32,2.55,6.32,5.87
8,4.04,3.29,3.62,5.99
9,1.99,4.86,2.04,7.09


In [8]:
gg=pd.read_csv("features/s"+str(0)+"Hurst.csv")
#yu=fulllabel[0]
#gy=pd.DataFrame(yu)
#Label_list = ["Valance","Arousal","Dominance","Liking"]
#gy.columns=Label_list
#gy
gxy=gg.join(gy)



In [9]:
for i in range(1,32):
    gg=pd.read_csv("features/s"+str(i)+"Hurst.csv")
    yu=fulllabel[i]
    gy=pd.DataFrame(yu)
    Label_list = ["Valance","Arousal","Dominance","Liking"]
    gy.columns=Label_list
    nxy=gg.join(gy)
    Dataxy=gxy.append(nxy)
    gxy=Dataxy
Dataxy    

Unnamed: 0,senHurst0,senHurst1,senHurst2,senHurst3,senHurst4,senHurst5,senHurst6,senHurst7,senHurst8,senHurst9,...,senHurst34,senHurst35,senHurst36,senHurst37,senHurst38,senHurst39,Valance,Arousal,Dominance,Liking
0,0.207357,0.206126,0.210456,0.221277,0.284442,0.214171,0.196319,0.228724,0.166148,0.140822,...,0.939189,1.033011,0.996901,0.809074,0.970004,,7.71,7.60,6.90,7.83
1,0.161172,0.231657,0.275299,0.204313,0.258757,0.294660,0.283453,0.333747,0.194770,0.221266,...,1.046685,1.085901,,0.936995,0.937595,,8.10,7.31,7.28,8.47
2,0.166113,0.264642,0.233001,0.237042,0.256414,0.227641,0.266156,0.222257,0.163673,0.120927,...,1.063166,1.088769,,0.543609,0.995391,,8.58,7.54,9.00,7.08
3,0.154886,0.192576,0.206195,0.242245,0.232567,0.227261,0.240730,0.245814,0.204450,0.145505,...,1.086076,1.157497,0.978142,0.735437,1.031969,,4.94,6.01,6.12,8.06
4,0.113650,0.115400,0.129648,0.136296,0.162400,0.172677,0.141366,0.128738,0.097081,0.112602,...,1.003874,1.174798,,0.831288,1.086271,,6.96,3.92,7.19,6.05
5,0.153600,0.152827,0.140391,0.127865,0.203257,0.182056,0.212403,0.130586,0.179774,0.170893,...,1.051278,1.119187,,0.578129,1.213105,,8.27,3.92,7.00,8.03
6,0.195351,0.228620,0.242343,0.257643,0.242692,0.244664,0.266271,0.269793,0.191108,0.159593,...,0.934231,,,1.090569,0.984723,,7.44,3.73,7.08,7.04
7,0.189605,0.204280,0.203644,0.222748,0.236824,0.202516,0.165262,0.248132,0.191961,0.181798,...,0.994804,1.027995,,0.847980,1.003463,,7.32,2.55,6.32,5.87
8,0.192040,0.284095,0.309022,0.265816,0.282325,0.289760,0.334905,0.331160,0.217364,0.182876,...,1.113718,1.204675,0.999363,0.762109,1.188869,,4.04,3.29,3.62,5.99
9,0.271089,0.318587,0.351095,0.289635,0.285755,0.335088,0.340423,0.410179,0.250654,0.224939,...,1.096598,1.130738,0.842595,0.881202,0.927015,,1.99,4.86,2.04,7.09


In [12]:
#Dataxy.head(50)


In [25]:
Dataxy=Dataxy.reset_index(drop=True)
Dataxy.to_csv("Dataxy.csv", index=False)

In [26]:
DataAr=Dataxy.iloc[:,:-3]

In [27]:
DataAr

Unnamed: 0,senHurst0,senHurst1,senHurst2,senHurst3,senHurst4,senHurst5,senHurst6,senHurst7,senHurst8,senHurst9,...,senHurst31,senHurst32,senHurst33,senHurst34,senHurst35,senHurst36,senHurst37,senHurst38,senHurst39,Valance
0,0.207357,0.206126,0.210456,0.221277,0.284442,0.214171,0.196319,0.228724,0.166148,0.140822,...,0.131089,1.110708,0.890081,0.939189,1.033011,0.996901,0.809074,0.970004,,8.13
1,0.161172,0.231657,0.275299,0.204313,0.258757,0.294660,0.283453,0.333747,0.194770,0.221266,...,0.152521,0.849700,1.098002,1.046685,1.085901,,0.936995,0.937595,,4.99
2,0.166113,0.264642,0.233001,0.237042,0.256414,0.227641,0.266156,0.222257,0.163673,0.120927,...,0.159770,1.063228,1.082313,1.063166,1.088769,,0.543609,0.995391,,8.05
3,0.154886,0.192576,0.206195,0.242245,0.232567,0.227261,0.240730,0.245814,0.204450,0.145505,...,0.168248,1.022074,1.117156,1.086076,1.157497,0.978142,0.735437,1.031969,,6.96
4,0.113650,0.115400,0.129648,0.136296,0.162400,0.172677,0.141366,0.128738,0.097081,0.112602,...,0.198912,0.996385,0.999356,1.003874,1.174798,,0.831288,1.086271,,7.15
5,0.153600,0.152827,0.140391,0.127865,0.203257,0.182056,0.212403,0.130586,0.179774,0.170893,...,0.156456,1.071641,1.124049,1.051278,1.119187,,0.578129,1.213105,,5.78
6,0.195351,0.228620,0.242343,0.257643,0.242692,0.244664,0.266271,0.269793,0.191108,0.159593,...,0.126527,1.065618,1.001052,0.934231,,,1.090569,0.984723,,4.94
7,0.189605,0.204280,0.203644,0.222748,0.236824,0.202516,0.165262,0.248132,0.191961,0.181798,...,0.172678,0.951652,0.926475,0.994804,1.027995,,0.847980,1.003463,,7.96
8,0.192040,0.284095,0.309022,0.265816,0.282325,0.289760,0.334905,0.331160,0.217364,0.182876,...,0.174748,1.014903,1.065954,1.113718,1.204675,0.999363,0.762109,1.188869,,7.86
9,0.271089,0.318587,0.351095,0.289635,0.285755,0.335088,0.340423,0.410179,0.250654,0.224939,...,0.163913,1.052275,1.023410,1.096598,1.130738,0.842595,0.881202,0.927015,,4.08


In [28]:
DataAr=DataAr.replace(np.nan,0)

In [29]:
DataAr

Unnamed: 0,senHurst0,senHurst1,senHurst2,senHurst3,senHurst4,senHurst5,senHurst6,senHurst7,senHurst8,senHurst9,...,senHurst31,senHurst32,senHurst33,senHurst34,senHurst35,senHurst36,senHurst37,senHurst38,senHurst39,Valance
0,0.207357,0.206126,0.210456,0.221277,0.284442,0.214171,0.196319,0.228724,0.166148,0.140822,...,0.131089,1.110708,0.890081,0.939189,1.033011,0.996901,0.809074,0.970004,0.000000,8.13
1,0.161172,0.231657,0.275299,0.204313,0.258757,0.294660,0.283453,0.333747,0.194770,0.221266,...,0.152521,0.849700,1.098002,1.046685,1.085901,0.000000,0.936995,0.937595,0.000000,4.99
2,0.166113,0.264642,0.233001,0.237042,0.256414,0.227641,0.266156,0.222257,0.163673,0.120927,...,0.159770,1.063228,1.082313,1.063166,1.088769,0.000000,0.543609,0.995391,0.000000,8.05
3,0.154886,0.192576,0.206195,0.242245,0.232567,0.227261,0.240730,0.245814,0.204450,0.145505,...,0.168248,1.022074,1.117156,1.086076,1.157497,0.978142,0.735437,1.031969,0.000000,6.96
4,0.113650,0.115400,0.129648,0.136296,0.162400,0.172677,0.141366,0.128738,0.097081,0.112602,...,0.198912,0.996385,0.999356,1.003874,1.174798,0.000000,0.831288,1.086271,0.000000,7.15
5,0.153600,0.152827,0.140391,0.127865,0.203257,0.182056,0.212403,0.130586,0.179774,0.170893,...,0.156456,1.071641,1.124049,1.051278,1.119187,0.000000,0.578129,1.213105,0.000000,5.78
6,0.195351,0.228620,0.242343,0.257643,0.242692,0.244664,0.266271,0.269793,0.191108,0.159593,...,0.126527,1.065618,1.001052,0.934231,0.000000,0.000000,1.090569,0.984723,0.000000,4.94
7,0.189605,0.204280,0.203644,0.222748,0.236824,0.202516,0.165262,0.248132,0.191961,0.181798,...,0.172678,0.951652,0.926475,0.994804,1.027995,0.000000,0.847980,1.003463,0.000000,7.96
8,0.192040,0.284095,0.309022,0.265816,0.282325,0.289760,0.334905,0.331160,0.217364,0.182876,...,0.174748,1.014903,1.065954,1.113718,1.204675,0.999363,0.762109,1.188869,0.000000,7.86
9,0.271089,0.318587,0.351095,0.289635,0.285755,0.335088,0.340423,0.410179,0.250654,0.224939,...,0.163913,1.052275,1.023410,1.096598,1.130738,0.842595,0.881202,0.927015,0.000000,4.08


In [51]:
Xmama = DataAr.iloc[:,:-1]
#print(Xmama)
Ymama = DataAr.iloc[:,40].values
print(Ymama)
#np.shape(Xmama)
#np.shape(Ymama)
from sklearn.model_selection import train_test_split 
from sklearn import tree 
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()

x_train,x_test,y_train,y_test = train_test_split(Xmama,Ymama,test_size=0.25,random_state=0)

regressor.fit(x_train,x_test)

y_pred=regressor.predict(x_test)








import numpy as np
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))






[8.13 4.99 8.05 ... 3.05 3.99 7.15]


ValueError: Found input variables with inconsistent numbers of samples: [960, 320]

In [15]:
gg=pd.read_csv("features/s"+str(0)+"Dfa.csv")
#yu=fulllabel[0]
#gy=pd.DataFrame(yu)
#Label_list = ["Valance","Arousal","Dominance","Liking"]
#gy.columns=Label_list
#gy
#gxy=gg.join(gy)
gxy=gg


In [16]:
for i in range(1,32):
    gg=pd.read_csv("features/s"+str(i)+"Dfa.csv")
#     yu=fulllabel[i]
#     gy=pd.DataFrame(yu)
#     Label_list = ["Valance","Arousal","Dominance","Liking"]
#     gy.columns=Label_list
#     nxy=gg.join(gy)
    nxy=gg
    Dataxy=gxy.append(nxy)
    gxy=Dataxy
Dataxy
Dataxy=Dataxy.reset_index(drop=True)
Dataxy.to_csv("DataDfaxy.csv", index=False)

In [2]:
import pandas as pd 
mama=pd.read_csv("DataDfaxy.csv")
mama2=pd.read_csv("DataHJxy.csv")
mama3=pd.read_csv("DataPfdxy.csv")
mamalast=pd.read_csv("Dataxy.csv")

In [29]:
mama3.head()


Unnamed: 0,senPfd0,senPfd1,senPfd2,senPfd3,senPfd4,senPfd5,senPfd6,senPfd7,senPfd8,senPfd9,...,senPfd30,senPfd31,senPfd32,senPfd33,senPfd34,senPfd35,senPfd36,senPfd37,senPfd38,senPfd39
0,0.559845,0.560418,0.560082,0.559143,0.557953,0.558753,0.55793,0.552918,0.559097,0.560696,...,0.561948,0.562873,0.543607,0.54042,0.541963,0.550067,1.0,0.545384,0.650867,0.63913
1,0.558684,0.561112,0.561235,0.552965,0.558053,0.560082,0.558741,0.553155,0.558947,0.560916,...,0.562075,0.562152,0.546172,0.541338,0.542371,0.549163,1.0,0.545909,0.660526,0.638904
2,0.559492,0.56083,0.560345,0.553925,0.557478,0.560094,0.559074,0.552441,0.558833,0.560575,...,0.562769,0.561747,0.545122,0.540457,0.542186,0.550491,1.0,0.545275,0.657582,0.640864
3,0.559916,0.560757,0.560261,0.558176,0.557908,0.558947,0.558423,0.551544,0.559656,0.560611,...,0.561911,0.563187,0.544739,0.541498,0.543968,0.54943,1.0,0.545627,0.646566,0.635666
4,0.559539,0.560178,0.559621,0.557753,0.556497,0.558333,0.557886,0.550326,0.558798,0.55994,...,0.561911,0.56273,0.543223,0.541433,0.544178,0.549296,1.0,0.544532,0.643551,0.635666


In [32]:
mamaNew=mama.join(mama3)

In [34]:
mamaNew=mamaNew.join(mamalast)

In [35]:
mamaNew.head()

Unnamed: 0,senDfa0,senDfa1,senDfa2,senDfa3,senDfa4,senDfa5,senDfa6,senDfa7,senDfa8,senDfa9,...,senHurst34,senHurst35,senHurst36,senHurst37,senHurst38,senHurst39,Valance,Arousal,Dominance,Liking
0,-0.002133,-0.000325,-0.000156,0.001114,0.000636,-0.001559,0.001631,0.00138,-0.006688,-0.007642,...,0.939189,1.033011,0.996901,0.809074,0.970004,,7.71,7.6,6.9,7.83
1,-0.006262,-0.006898,-0.009834,-0.008837,-0.005583,-0.005103,-0.005215,-0.007236,-0.009539,-0.003813,...,1.046685,1.085901,,0.936995,0.937595,,8.1,7.31,7.28,8.47
2,-0.007681,-0.01106,-0.00845,-0.003313,-0.008348,-0.00451,-0.00834,-0.003636,-0.008242,-0.010575,...,1.063166,1.088769,,0.543609,0.995391,,8.58,7.54,9.0,7.08
3,-0.017675,-0.012013,-0.012038,-0.021174,-0.018692,-0.003882,-0.013477,-0.007505,-0.014245,-0.013097,...,1.086076,1.157497,0.978142,0.735437,1.031969,,4.94,6.01,6.12,8.06
4,-0.009453,-0.006495,-0.006437,-0.00889,-0.005594,-0.008346,-0.00488,-0.004362,-0.003013,-0.008143,...,1.003874,1.174798,,0.831288,1.086271,,6.96,3.92,7.19,6.05


In [38]:
mamaNew.to_csv("FinalDataset.csv", index=False)