In [5]:
import numpy as np
import csv
import os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import sklearn


def _PCA_(A, k):
    """Fit a ``k``-component PCA on ``A`` and return its main outputs.

    Returns:
        A 3-tuple ``(U, V, S)`` holding, in order, the result of
        ``fit_transform(A)``, the fitted ``components_`` and the fitted
        ``singular_values_``.
    """
    model = PCA(n_components=k)
    projected = model.fit_transform(A)
    return projected, model.components_, model.singular_values_


def readAudioCSV(path):
    """Read one audio feature CSV and average its rows into a single vector.

    The file is parsed with the default (comma) ``csv.reader``; the first
    field of every row is then split on ';', the first two pieces are
    discarded and the remaining pieces are parsed as floats.

    Args:
        path: Path of the CSV file. The first line is treated as a header.

    Returns:
        A 1-D ``numpy`` array with the column-wise mean over all data rows,
        or ``None`` when the file holds no data rows (including a completely
        empty file).

    Raises:
        ValueError: If a feature field cannot be parsed as a float.
    """
    # `with` guarantees the handle is closed even when parsing raises.
    with open(path, newline='') as f:
        f_csv = csv.reader(f)
        next(f_csv, None)  # skip the header; tolerate a completely empty file
        # Blank lines come back as [] from csv.reader and carry no features.
        rows = [[float(v) for v in row[0].split(';')[2:]] for row in f_csv if row]

    if not rows:
        return None
    return np.mean(np.array(rows), axis=0)


def readTextCSV(path):
    """Read the text feature CSV into a list of float rows.

    Args:
        path: Path of the CSV file. The first line is treated as a header.

    Returns:
        One list of floats per data row, built from every column after the
        first two. A row with two or fewer columns (including a blank line)
        yields an empty list, so positions stay aligned with the input rows.
        An empty file yields ``[]``.

    Raises:
        ValueError: If a feature field cannot be parsed as a float.
    """
    # `with` guarantees the handle is closed even when parsing raises.
    with open(path, newline='') as f:
        f_csv = csv.reader(f)
        next(f_csv, None)  # skip the header; tolerate a completely empty file
        return [[float(v) for v in row[2:]] for row in f_csv]


def readVideoCSV(path):
    """Read one video feature CSV and average its rows into a single vector.

    Args:
        path: Path of the CSV file. The first line is treated as a header.

    Returns:
        A 1-D ``numpy`` array with the column-wise mean of every column after
        the first four, taken over all data rows, or ``None`` when the file
        holds no data rows (including a completely empty file).

    Raises:
        ValueError: If a feature field cannot be parsed as a float.
    """
    # `with` guarantees the handle is closed even when parsing raises.
    with open(path, newline='') as f:
        f_csv = csv.reader(f)
        next(f_csv, None)  # skip the header; tolerate a completely empty file
        # Blank lines come back as [] from csv.reader and carry no features.
        rows = [[float(v) for v in row[4:]] for row in f_csv if row]

    if not rows:
        return None
    return np.mean(rows, axis=0)


def readLabel(path):
    """Read the integer label stored in the last column of each CSV row.

    Args:
        path: Path of the annotation CSV. The first line is treated as a
            header.

    Returns:
        A list with one ``int`` per non-blank data row, in file order.
        An empty file yields ``[]``.

    Raises:
        ValueError: If the last field of a row is not an integer literal.
    """
    # `with` guarantees the handle is closed even when parsing raises.
    with open(path, newline='') as f:
        f_csv = csv.reader(f)
        next(f_csv, None)  # skip the header; tolerate a completely empty file
        # Blank lines come back as [] from csv.reader and have no last column.
        return [int(row[-1]) for row in f_csv if row]


def getAudioData(datapath, labelpath):
    """Load one averaged audio feature vector per file plus the matching labels.

    Files in ``datapath`` are ordered by the two numbers embedded in their
    names and paired with the labels by position. Files for which
    ``readAudioCSV`` returns ``None`` are dropped together with the label at
    the same position.

    Args:
        datapath: Directory holding the per-segment audio CSV files.
        labelpath: Path of the annotation CSV read by ``readLabel``.

    Returns:
        ``(data, label)``: a list of feature vectors and a list of int labels.
    """
    def order_key(name):
        # The slicing ([5:] on the part before '-', [:-4] on the part after)
        # mirrors the original key; presumably names look like
        # "<5 chars><video>-<segment>.csv" - TODO confirm.
        # A tuple keeps segments numbered 100 or more inside their own video,
        # which the former `video + segment / 100` key did not.
        head, tail = name.split('-')[:2]
        return int(head[5:]), float(tail[:-4])

    file_list = sorted(os.listdir(datapath), key=order_key)
    # os.path.join works whether or not `datapath` ends with a separator.
    features = [readAudioCSV(os.path.join(datapath, name)) for name in file_list]
    labels = readLabel(labelpath)

    # Filter into new lists: deleting from a list while enumerating it skips
    # the element that follows every deletion.
    missing = {i for i, d in enumerate(features) if d is None}
    data = [d for d in features if d is not None]
    label = [l for i, l in enumerate(labels) if i not in missing]

    return data, label


def getTextData(datapath, labelpath):
    """Load the text feature matrix plus the matching labels.

    Rows returned empty by ``readTextCSV`` are dropped together with the
    label at the same position.

    Args:
        datapath: Path of the text feature CSV read by ``readTextCSV``.
        labelpath: Path of the annotation CSV read by ``readLabel``.

    Returns:
        ``(p_data, label)``: a ``numpy`` array built from the non-empty
        feature rows and a list of int labels.
    """
    rows = readTextCSV(datapath)
    labels = readLabel(labelpath)

    # Record the positions of empty rows first and filter afterwards.
    # Deleting `label[i]` inside the loop shifted every later label, so after
    # the first deletion the wrong labels were removed.
    empty = {i for i, r in enumerate(rows) if len(r) == 0}
    p_data = np.array([[float(v) for v in r] for r in rows if len(r) != 0])
    label = [l for i, l in enumerate(labels) if i not in empty]

    return p_data, label


def getVideoData(datapath, labelpath):
    """Load one averaged video feature vector per file plus the matching labels.

    Files in ``datapath`` are ordered by the two numbers embedded in their
    names and paired with the labels by position. Files for which
    ``readVideoCSV`` returns ``None`` are dropped together with the label at
    the same position, the same policy ``getAudioData`` applies.

    Args:
        datapath: Directory holding the per-segment video CSV files.
        labelpath: Path of the annotation CSV read by ``readLabel``.

    Returns:
        ``(data, label)``: a list of feature vectors and a list of int labels.
    """
    def order_key(name):
        # The slicing ([5:] on the part before '-', [:-4] on the part after)
        # mirrors the original key; presumably names look like
        # "<5 chars><video>-<segment>.csv" - TODO confirm.
        # A tuple keeps segments numbered 100 or more inside their own video,
        # which the former `video + segment / 100` key did not.
        head, tail = name.split('-')[:2]
        return int(head[5:]), float(tail[:-4])

    file_list = sorted(os.listdir(datapath), key=order_key)
    # os.path.join works whether or not `datapath` ends with a separator.
    features = [readVideoCSV(os.path.join(datapath, name)) for name in file_list]
    labels = readLabel(labelpath)

    # A None entry (file without data rows) cannot be stacked into a feature
    # matrix, so drop it and its label instead of failing later in scaling.
    missing = {i for i, d in enumerate(features) if d is None}
    data = [d for d in features if d is not None]
    label = [l for i, l in enumerate(labels) if i not in missing]

    return data, label


def audio(n_repeats=10, n_splits=10):
    """Evaluate an SVM and a random forest on the audio features.

    The features are standardised, then each classifier is scored with
    shuffled K-fold cross-validation repeated ``n_repeats`` times. The SVM
    runs on a 13-component PCA projection, the random forest on the
    standardised features. The mean micro-averaged F1 over all folds is
    printed for each classifier.

    Args:
        n_repeats: Number of times the K-fold split is repeated (>= 1).
        n_splits: Number of folds per repetition.
    """

    def svm_c(x_train, x_test, y_train, y_test):
        """Fit the RBF SVM; return (training accuracy, micro-F1 on test)."""
        clf = SVC(kernel='rbf', gamma=0.09, C=1.8)
        clf.fit(x_train, y_train)
        train_acc = float(np.mean(clf.predict(x_train) == y_train))
        f1 = sklearn.metrics.f1_score(y_test, clf.predict(x_test), average='micro')
        return train_acc, f1

    def randomForest(x_train, x_test, y_train, y_test):
        """Fit the random forest; return (training accuracy, micro-F1 on test)."""
        # 'sqrt' is what 'auto' meant for classifiers; current scikit-learn
        # releases reject 'auto'.
        clf = RandomForestClassifier(n_estimators=30, n_jobs=-1,
                                     max_features='sqrt', min_samples_split=4, min_samples_leaf=1)
        clf.fit(x_train, y_train)
        train_acc = float(np.mean(clf.predict(x_train) == y_train))
        f1 = sklearn.metrics.f1_score(y_test, clf.predict(x_test), average='micro')
        return train_acc, f1

    def mean_f1(classify, x):
        """Average the test micro-F1 of ``classify`` over every repeat and fold."""
        total = 0.0
        for _ in range(n_repeats):
            for train_index, test_index in kf.split(x):
                _, f1 = classify(x[train_index], x[test_index],
                                 label[train_index], label[test_index])
                total += f1
        # Divide by the real number of evaluated folds, not a literal 100.
        return total / (n_repeats * n_splits)

    data, label = getAudioData('/home/yhc/data/Audio_eGeMAPS/',
                               '/home/yhc/data/Sentiment_Annotation.csv')
    label = np.array(label)

    # NOTE(review): the scaler and the PCA are fitted on all samples before
    # the folds are split, so held-out folds influence the preprocessing.
    # Left unchanged to keep the reported numbers comparable.
    x_std = StandardScaler().fit_transform(data)
    x_std_svm, _, _ = _PCA_(x_std, 13)  # PCA projection used by the SVM only

    # No random_state is set, so the shuffled folds differ between runs.
    kf = KFold(n_splits=n_splits, shuffle=True)

    print('audio svm', mean_f1(svm_c, x_std_svm))
    print('audio randomforest', mean_f1(randomForest, x_std))

def text(n_repeats=10, n_splits=10):
    """Evaluate an SVM and a random forest on the text features.

    The features are standardised, then each classifier is scored with
    shuffled K-fold cross-validation repeated ``n_repeats`` times. The SVM
    runs on a 77-component PCA projection, the random forest on the
    standardised features. The mean micro-averaged F1 over all folds is
    printed for each classifier.

    Args:
        n_repeats: Number of times the K-fold split is repeated (>= 1).
        n_splits: Number of folds per repetition.
    """

    def randomForest(x_train, x_test, y_train, y_test):
        """Fit the random forest; return (training accuracy, micro-F1 on test)."""
        # 'sqrt' is what 'auto' meant for classifiers; current scikit-learn
        # releases reject 'auto'.
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                     max_features='sqrt', min_samples_split=4, min_samples_leaf=1)
        clf.fit(x_train, y_train)
        train_acc = float(np.mean(clf.predict(x_train) == y_train))
        f1 = sklearn.metrics.f1_score(y_test, clf.predict(x_test), average='micro')
        return train_acc, f1

    def svm(x_train, x_test, y_train, y_test):
        """Fit the RBF SVM; return (training accuracy, micro-F1 on test)."""
        clf = SVC(kernel='rbf', gamma='auto', C=5)
        clf.fit(x_train, y_train)
        train_acc = float(np.mean(clf.predict(x_train) == y_train))
        f1 = sklearn.metrics.f1_score(y_test, clf.predict(x_test), average='micro')
        return train_acc, f1

    def mean_f1(classify, x):
        """Average the test micro-F1 of ``classify`` over every repeat and fold."""
        total = 0.0
        for _ in range(n_repeats):
            for train_index, test_index in kf.split(x):
                _, f1 = classify(x[train_index], x[test_index],
                                 label[train_index], label[test_index])
                total += f1
        # Divide by the real number of evaluated folds, not a literal 100.
        return total / (n_repeats * n_splits)

    data, label = getTextData('/home/yhc/data/Text_LIWC/transcripts_LIWC.csv',
                              '/home/yhc/data/Sentiment_Annotation.csv')
    label = np.array(label)

    # NOTE(review): the scaler and the PCA are fitted on all samples before
    # the folds are split, so held-out folds influence the preprocessing.
    # Left unchanged to keep the reported numbers comparable.
    x_std = StandardScaler().fit_transform(data)  # standardise the features
    x_std_svm, _, _ = _PCA_(x_std, 77)  # PCA projection used by the SVM only

    # No random_state is set, so the shuffled folds differ between runs.
    kf = KFold(n_splits=n_splits, shuffle=True)

    print('text svm', mean_f1(svm, x_std_svm))
    print('text randomforest', mean_f1(randomForest, x_std))

def video(n_repeats=10, n_splits=10):
    """Evaluate an SVM and a random forest on the video features.

    The features are standardised, then each classifier is scored with
    shuffled K-fold cross-validation repeated ``n_repeats`` times. The SVM
    runs on a 74-component PCA projection, the random forest on the
    standardised features. The mean micro-averaged F1 over all folds is
    printed for each classifier.

    Args:
        n_repeats: Number of times the K-fold split is repeated (>= 1).
        n_splits: Number of folds per repetition.
    """

    def svm(x_train, x_test, y_train, y_test):
        """Fit the RBF SVM; return (training accuracy, micro-F1 on test)."""
        clf = SVC(kernel='rbf', gamma='auto', C=3)
        clf.fit(x_train, y_train)
        train_acc = float(np.mean(clf.predict(x_train) == y_train))
        f1 = sklearn.metrics.f1_score(y_test, clf.predict(x_test), average='micro')
        return train_acc, f1

    def randomForest(x_train, x_test, y_train, y_test):
        """Fit the random forest; return (training accuracy, micro-F1 on test)."""
        # 'sqrt' is what 'auto' meant for classifiers; current scikit-learn
        # releases reject 'auto'.
        clf = RandomForestClassifier(n_estimators=197, n_jobs=-1,
                                     max_features='sqrt', min_samples_split=2, min_samples_leaf=1)
        clf.fit(x_train, y_train)
        train_acc = float(np.mean(clf.predict(x_train) == y_train))
        f1 = sklearn.metrics.f1_score(y_test, clf.predict(x_test), average='micro')
        return train_acc, f1

    def mean_f1(classify, x):
        """Average the test micro-F1 of ``classify`` over every repeat and fold."""
        total = 0.0
        for _ in range(n_repeats):
            for train_index, test_index in kf.split(x):
                _, f1 = classify(x[train_index], x[test_index],
                                 label[train_index], label[test_index])
                total += f1
        # Divide by the real number of evaluated folds, not a literal 100.
        return total / (n_repeats * n_splits)

    data, label = getVideoData('/home/yhc/data/Video_OpenFace/',
                               '/home/yhc/data/Sentiment_Annotation.csv')
    label = np.array(label)

    # NOTE(review): the scaler and the PCA are fitted on all samples before
    # the folds are split, so held-out folds influence the preprocessing.
    # Left unchanged to keep the reported numbers comparable.
    x_std = StandardScaler().fit_transform(data)
    x_std_svm, _, _ = _PCA_(x_std, 74)  # PCA projection used by the SVM only

    # No random_state is set, so the shuffled folds differ between runs.
    kf = KFold(n_splits=n_splits, shuffle=True)

    print('video SVM:', mean_f1(svm, x_std_svm))
    print('video randomforest', mean_f1(randomForest, x_std))

In [6]:
# Audio modality: prints the mean micro-F1 of the SVM and random-forest runs.
audio()

audio svm 0.46935185185185174
audio randomforest 0.4475132275132274


In [7]:
# Text modality: prints the mean micro-F1 of the SVM and random-forest runs.
text()

text svm 0.4789285714285715
text randomforest 0.5014285714285712


In [8]:
# Video modality: prints the mean micro-F1 of the SVM and random-forest runs.
video()

video SVM: 0.5185714285714284
video randomforest 0.5150000000000001
