In [143]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import explained_variance_score
from sklearn import cross_validation
from scipy.stats import spearmanr
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [54]:
## How we generate the geometric_all dataframe. 

def gen_geometric_all():
    """Merge attractiveness ratings into the geometric feature table and save it.

    Reads ``../clean_data/geometric_attract.csv`` (columns ``Filename`` and
    ``attractive``) and ``../clean_data/geometric_features.csv`` (column
    ``imgName`` plus the feature columns), looks up the rating of every image
    by file name, and writes the combined table to
    ``../clean_data/geometric_all.csv``.

    The first 34 characters of every ``imgName`` are dropped before matching
    (presumably a fixed directory prefix -- confirm against the raw data), and
    the shortened names are what end up in the saved file.

    Images without a matching ``Filename`` have their name printed and keep a
    rating of 0.0. When a ``Filename`` occurs more than once, the first
    occurrence is used.

    Returns:
        None. The result is only written to disk; the row index is written as
        well, so it reappears as an ``Unnamed: 0`` column when the file is
        read back.
    """
    attract = pd.read_csv('../clean_data/geometric_attract.csv')
    geometric_feature = pd.read_csv('../clean_data/geometric_features.csv')

    # Vectorised slice instead of assigning element by element into a Series
    # taken from the frame: that form is chained assignment, and whether it
    # reaches the frame depends on the pandas version / copy-on-write mode.
    short_names = geometric_feature['imgName'].str[34:]

    # Build the name -> rating lookup once. setdefault keeps the first
    # occurrence of a duplicated file name, matching list.index semantics.
    rating_by_name = {}
    for name, rating in zip(attract['Filename'], attract['attractive']):
        rating_by_name.setdefault(name, rating)

    # Unmatched images keep the 0.0 they are initialised with.
    ratings = np.zeros(len(short_names))
    for idx, file_name in enumerate(short_names):
        if file_name in rating_by_name:
            ratings[idx] = rating_by_name[file_name]
        else:
            print(file_name)

    geometric_all = geometric_feature.copy()
    geometric_all['imgName'] = short_names  # persist the stripped names explicitly
    geometric_all['attractive'] = ratings
    geometric_all.to_csv('../clean_data/geometric_all.csv')
    return


In [98]:
# Read the merged table back from disk and split it into the standardised
# feature matrix (feature_x) and the regression target (attract_y).
geometric_all = pd.read_csv('../clean_data/geometric_all.csv')

# Everything except the image name, the saved row index and the target
# itself is treated as a feature.
non_feature_cols = ['imgName', 'Unnamed: 0', 'attractive']
x = geometric_all.drop(non_feature_cols, axis=1)
x_fields = x.columns.values.tolist()

# Standardise each feature column.
# NOTE(review): this is applied to all rows, before the train/test splits made
# further down, so the held-out half contributes to the scaling statistics --
# confirm that is acceptable for this analysis.
feature_x = preprocessing.scale(x.values)

attract_y = geometric_all['attractive'].values

In [None]:
# Repeated random 50/50 hold-out evaluation of a ridge regression whose
# regularisation strength is picked by RidgeCV on each training half.
# For every repetition we keep the fitted coefficients, intercept, chosen
# alpha, and the Spearman correlation between predicted and true ratings on
# the held-out half.
#
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20;
# on newer versions `train_test_split` must be taken from
# `sklearn.model_selection` instead (the import cell needs the same change).
itr_num = 50

# Fixed master seed so the split seeds -- and therefore every number this
# cell reports -- are identical after a kernel restart. The seeds are drawn
# without replacement so that no train/test split is evaluated twice.
seed_rng = np.random.RandomState(0)
random_seed = seed_rng.choice(np.arange(1, 1000), size=itr_num, replace=False)

test_corr_list = np.zeros((itr_num, 1))
coef_list = np.zeros((len(x_fields), itr_num))  # one column per repetition
intercept_list = np.zeros((itr_num, 1))
alpha_list = np.zeros((itr_num, 1))

for cur_itr in range(itr_num):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        feature_x, attract_y, test_size=0.5, random_state=random_seed[cur_itr])

    clf = linear_model.RidgeCV(alphas=np.logspace(-3, 1, num=20), fit_intercept=True)
    clf.fit(x_train, y_train)

    coef_list[:, cur_itr] = clf.coef_
    intercept_list[cur_itr] = clf.intercept_
    alpha_list[cur_itr] = clf.alpha_

    y_test_pred = clf.predict(x_test)
    corr = spearmanr(y_test, y_test_pred)
    test_corr_list[cur_itr] = corr[0]  # element 0 is the correlation, element 1 the p-value

# Mean held-out Spearman correlation over all repetitions (cell output).
test_corr_list.mean()

In [142]:
# Plot the mean coefficient of every geometric feature across the repeated
# fits, with error bars showing the standard deviation across repetitions.
coef_mean = coef_list.mean(axis=1)
coef_std = coef_list.std(axis=1)

n = len(coef_mean)  # number of features / bars
ind = np.arange(n)  # x position of each bar
width = 0.35        # bar width

fig, ax = plt.subplots()

# align='center' is passed explicitly so each bar is centred on ind[i]
# regardless of the matplotlib version's default alignment.
rects1 = ax.bar(ind, coef_mean,
                width,
                align='center',
                color='MediumSlateBlue',
                yerr=coef_std,                  # std across repetitions
                error_kw={'ecolor': 'Tomato',
                          'linewidth': 2})

axes = ax  # same Axes object plt.gca() returned; alias kept for any later cell
# Fixed y-range; bars or error bars beyond +/-1 are clipped by this limit.
ax.set_ylim([-1, 1])

ax.set_ylabel('coefficient')
ax.set_title('coefficients of every geometric feature')

# Ticks sit at the bar centres so each label lines up with its own bar
# (the previous `ind + width` offset pushed every label to the right).
ax.set_xticks(ind)
ax.set_xticklabels(x_fields, rotation='vertical')
ax.tick_params(labelsize=10)
plt.show()