In [1]:
import cv2
import os
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed table (later cells read its `image` and `emotion`
# columns) from the data_csv folder under the current working directory.
cwd = os.getcwd()
preprocessing_csv = cwd + "/data_csv/preprocessing_data.csv"
df = pd.read_csv(preprocessing_csv)

FileNotFoundError: ignored

In [None]:
from sklearn.datasets import make_classification
from skimage.feature import hog

# Compute one HOG descriptor per image listed in `df` and write a flat table
# (image name, emotion label, feature_0 .. feature_N) to hog_feature_data.csv.
column = ["image", "emotion"]
all_data = []
total_rows = df.shape[0]
for position, (_, row) in enumerate(df.iterrows(), start=1):
    print(str(position) + '/' + str(total_rows))
    imagePath = cwd + "/cleaned_images/" + row.image
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; fail here with the offending path rather than with an
        # opaque error inside cv2.resize.
        raise FileNotFoundError("Cannot read image: " + imagePath)
    image = cv2.resize(image, (112, 112), interpolation=cv2.INTER_AREA)

    # channel_axis=-1 replaces the deprecated multichannel=True (the colour
    # channels are the last axis of the array returned by cv2.imread).
    # visualize is left off: the rendered HOG image was never used.
    feature = hog(
        image,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        channel_axis=-1,
    ).tolist()

    # Name the feature columns once, from the first descriptor actually
    # computed, instead of relying on the frame's index label being 0.
    if len(column) == 2:
        column.extend("feature_" + str(i) for i in range(len(feature)))

    all_data.append([row.image, row.emotion] + feature)

df = pd.DataFrame(all_data, columns=column)
df.to_csv(cwd + "/data_csv/hog_feature_data.csv", index=False)

: 

In [None]:
# Reload the HOG feature table that the extraction cell wrote to data_csv.
cwd = os.getcwd()
hog_feature_csv = cwd + "/data_csv/hog_feature_data.csv"
df = pd.read_csv(hog_feature_csv)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Filter method: drop every feature that is strongly positively correlated
# (Pearson r >= 0.9) with a feature that appears earlier in column order.
X = df[df.columns.difference(['Unnamed: 0', 'emotion', 'image'])]
corr = X.corr()
plt.figure(figsize = (10, 10))
# Per-cell annotations are only drawn for small matrices; with thousands of
# features they would mean millions of text objects and an unusable figure.
sns.heatmap(corr, annot=corr.shape[0] <= 30)
print(len(X.columns))

# Entry [i, j] of the strict upper triangle is True when column j correlates
# with an earlier column i; a column is kept only if it has no such partner.
# NaN correlations compare as False, so those columns are kept.
too_correlated = np.triu(corr.to_numpy() >= 0.9, k=1)
columns = ~too_correlated.any(axis=0)
print("Filter Method : ", len(corr.columns[columns]))

selected_columns = list(corr.columns[columns]) + ['emotion', 'image']
print(selected_columns)

df_filter = df[selected_columns].copy()
print(df_filter)

In [None]:
# Critical To Model (Feature important)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=250, random_state=7)

X = df[df.columns.difference(['emotion', 'image'])]
y = df[['emotion']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

model.fit(X_train, y_train)  

fs = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=True)
fs.plot(kind='barh')

: 

In [None]:
# Wrapper (Backward Elimination)
import statsmodels.api as sm
from sklearn.feature_selection import RFE

X_1 = sm.add_constant(X)
model = sm.OLS(y, X_1).fit()
print(model.pvalues)

cols = list(X.columns)
pmax = 1

while len(cols) > 0:
    p = []
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)

    model = sm.OLS(y,X_1).fit()

    p = pd.Series(model.pvalues.values[1:], index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if pmax > 0.05 :
        cols.remove(feature_with_p_max)
    else:
        break

selected_features_BE = list(cols)
print("Number of Feature Selected BE : ", len(selected_features_BE))
print(selected_features_BE)

from sklearn.linear_model import LinearRegression

#no of features
nof_list = np.arange(1, len(cols))            
high_score = 0

#Variable to store the optimum features
nof = 0           
score_list = []

for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state = 0)

    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=nof_list[n])

    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)

    model.fit(X_train_rfe,y_train)

    score = model.score(X_test_rfe,y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = nof_list[n]

print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

model = LinearRegression()
#Initializing RFE model
rfe = RFE(model, n_features_to_select=14)
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y) 
model.fit(X_rfe,y)
print(rfe.support_)
print(rfe.ranking_)
cols = list(X.columns)
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = list(temp[temp==True].index)
print("Number of feature rfe : ", len(selected_features_rfe))

# selected_features_rfe = selected_features_rfe.extend(['emotion', 'image'])
selected_features_rfe = ['image', 'emotion'] + selected_features_rfe
df_wrapper = df.copy()
df_wrapper = df_wrapper[selected_features_rfe]
df_wrapper.to_csv(cwd + "/data_csv/hog_feature_data_wrapper.csv", index=False)

: 

In [None]:
# Embedded
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

reg = LassoCV()
reg.fit(X, y)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X,y))
coef = pd.Series(reg.coef_, index = X.columns)

print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model")

feature_selected_embed = [
    'image',
    'emotion',
]
df_embedded = df.copy()
df_embedded = df_embedded[feature_selected_embed]
print(len(df_embedded.columns))
df_embedded.to_csv(cwd + "/data_csv/hog_feature_data_embedded.csv", index=False)

: 

In [None]:
# Embedded Methods?
from sklearn.ensemble import GradientBoostingClassifier
estimator = GradientBoostingClassifier(n_estimators = 100)
selector = RFE(estimator, step= 1)
selector = selector.fit(X_train, y_train)

sel_cols = np.array(X.columns)[selector.support_]
print(sel_cols)

df_embedded_2 = df.copy()
df_embedded_2 = df_embedded_2[['image', 'emotion'] + list(sel_cols)]
df_embedded_2.to_csv(cwd + "/data_csv/hog_feature_data_embedded_2.csv", index=False)

: 

In [None]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report

dictionary = ['ANGER', 'CONTEMPT', 'DISGUST', 'FEAR', 'HAPPINESS',  'NEUTRAL', 'SADNESS', 'SURPRISE']

def plot_roc_curve(clf, name, df):
    """Plot 5-fold cross-validated one-vs-rest ROC curves for every emotion.

    For each position ``emo`` in ``dictionary`` the rows of ``df`` are
    relabelled as 1 (``df['emotion'] == emo``) or 0, 10% of the rows are set
    aside, and ``clf`` is refit on each of 5 stratified folds of the remaining
    90%. Per-fold accuracy and log-loss are printed, and a figure with the
    per-fold ROC curves, their mean and a +/- 1 std band is saved to
    ``graph/<EMOTION>/<name>.jpg`` and shown.

    Parameters
    ----------
    clf : estimator
        Must provide ``fit``, ``predict`` and ``predict_proba``. It is refit
        in place for every fold of every emotion.
    name : str
        File stem of the saved figure.
    df : pandas.DataFrame
        Needs an ``emotion`` column; every column from the third one onwards
        is used as a feature.

    Returns
    -------
    None
    """
    # The feature matrix and raw labels are the same for every emotion, so
    # they are extracted once instead of cell by cell inside the emotion loop.
    features = df.iloc[:, 2:].to_numpy()
    labels = df['emotion'].to_numpy()

    for emo, emo_name in enumerate(dictionary):
        print(emo_name)

        # One-vs-rest target for the current emotion.
        X = features
        y = (labels == emo).astype(int)

        # Fresh seeded generator per emotion: drives both the hold-out split
        # and the fold shuffling, so results are reproducible.
        random_state = np.random.RandomState(0)

        # The 10% hold-out rows are excluded from cross-validation below.
        X_train, _X_test, y_train, _y_test = train_test_split(
            X, y, test_size=0.1, random_state=random_state
        )

        tprs = []
        aucs = []
        mean_fpr = np.linspace(0, 1, 100)
        fig, ax = plt.subplots()

        # Run classifier with cross-validation and plot ROC curves
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            # The fold indices refer to X_train / y_train. Indexing the full
            # X / y with them (as before) picked unrelated rows and pulled
            # hold-out samples into the folds.
            X_fit, X_val = X_train[fit_idx], X_train[val_idx]
            y_fit, y_val = y_train[fit_idx], y_train[val_idx]

            clf.fit(X_fit, y_fit)

            predict = clf.predict(X_val)
            clf_probs = clf.predict_proba(X_val)

            acc = accuracy_score(y_val, predict)
            loss = log_loss(y_val, clf_probs)
            print(f'====================Fold {i}====================', '\n')
            print(f"accuracy_score : {acc}")
            print(f"log_loss : {loss}\n")

            # Per-fold ROC curve, resampled onto a common FPR grid so the
            # folds can be averaged.
            viz = RocCurveDisplay.from_estimator(clf, X_val, y_val, name="ROC fold {}".format(i), alpha=0.3, lw=1, ax=ax)
            interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)
            aucs.append(viz.roc_auc)

        # Diagonal reference line.
        ax.plot([0, 1], [0, 1], 'k--')

        # Mean ROC across folds.
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        ax.plot(
            mean_fpr,
            mean_tpr,
            color="b",
            label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
            lw=2,
            alpha=0.8,
        )

        # +/- 1 standard deviation band, clipped to [0, 1].
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        ax.fill_between(
            mean_fpr,
            tprs_lower,
            tprs_upper,
            color="grey",
            alpha=0.2,
            label=r"$\pm$ 1 std. dev.",
        )

        ax.set(xlim=[-0.05, 1.05],
               ylim=[-0.05, 1.05],
               title="Receiver operating characteristic")
        ax.legend(loc="lower right")
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")

        # savefig does not create missing directories.
        fig_path = 'graph/' + emo_name + '/' + name + '.jpg'
        os.makedirs(os.path.dirname(fig_path), exist_ok=True)
        fig.savefig(fig_path)
        plt.show()
        # One figure is created per emotion; close it so they do not pile up.
        plt.close(fig)

: 

In [None]:
# Evaluate every classifier on the wrapper-selected HOG features.
df = pd.read_csv(cwd + "/data_csv/hog_feature_data_wrapper.csv")

# (figure-name prefix, estimator) pairs, run in order. This replaces six
# copy-pasted cells that differed only in the estimator and the figure name.
for clf_name, clf in [
    ('svm_linear', svm.SVC(kernel='linear', C=1, probability=True)),  # svm linear kernel
    ('svm_poly', svm.SVC(kernel='poly', C=1, probability=True)),      # svm poly kernel
    ('svm_rbf', svm.SVC(kernel='rbf', C=1, probability=True)),        # svm rbf kernel
    ('logreg', LogisticRegression(solver='lbfgs', max_iter=10000)),   # logistic regression
    ('rf', RandomForestClassifier(n_estimators=40)),                  # random forest
    ('nb', GaussianNB()),                                             # naive bayes
]:
    plot_roc_curve(clf, clf_name + '_wrp_hog', df)

: 

In [None]:
# Evaluate every classifier on the Lasso-selected (embedded) HOG features.
df = pd.read_csv(cwd + "/data_csv/hog_feature_data_embedded.csv")

# (figure-name prefix, estimator) pairs, run in order. This replaces six
# copy-pasted cells that differed only in the estimator and the figure name.
for clf_name, clf in [
    ('svm_linear', svm.SVC(kernel='linear', C=1, probability=True)),  # svm linear kernel
    ('svm_poly', svm.SVC(kernel='poly', C=1, probability=True)),      # svm poly kernel
    ('svm_rbf', svm.SVC(kernel='rbf', C=1, probability=True)),        # svm rbf kernel
    ('logreg', LogisticRegression(solver='lbfgs', max_iter=10000)),   # logistic regression
    ('rf', RandomForestClassifier(n_estimators=40)),                  # random forest
    ('nb', GaussianNB()),                                             # naive bayes
]:
    plot_roc_curve(clf, clf_name + '_emb_hog', df)

: 

In [None]:
# Evaluate every classifier on the gradient-boosting-RFE-selected HOG features.
df = pd.read_csv(cwd + "/data_csv/hog_feature_data_embedded_2.csv")

# (figure-name prefix, estimator) pairs, run in order. This replaces six
# copy-pasted cells that differed only in the estimator and the figure name.
for clf_name, clf in [
    ('svm_linear', svm.SVC(kernel='linear', C=1, probability=True)),  # svm linear kernel
    ('svm_poly', svm.SVC(kernel='poly', C=1, probability=True)),      # svm poly kernel
    ('svm_rbf', svm.SVC(kernel='rbf', C=1, probability=True)),        # svm rbf kernel
    ('logreg', LogisticRegression(solver='lbfgs', max_iter=10000)),   # logistic regression
    ('rf', RandomForestClassifier(n_estimators=40)),                  # random forest
    ('nb', GaussianNB()),                                             # naive bayes
]:
    plot_roc_curve(clf, clf_name + '_emb_2_hog', df)

: 