In [1]:
# import plotly components for offline display of diagrams
from IPython.display import display, HTML
import plotly.offline as offline
from plotly.graph_objs import *


# Switch plotly to notebook (offline) mode so figures render inline;
# this is also what makes LaTeX math symbols available in plotly figures.
offline.init_notebook_mode(connected=True)

# MathJax is configured from the browser side. The snippet polls every 250 ms
# until `window.Plotly` exists, i.e. until plotly.js has been loaded, and only
# then sets the display alignment and SVG renderer. Configuring earlier would
# race against the plotly.js load.
mathjax_setup_js = (
    'var waitForPlotly = setInterval( function() {'
        'if( typeof(window.Plotly) !== "undefined" ){'
            'MathJax.Hub.Config({ SVG: { font: "STIX-Web" }, displayAlign: "center" });'
            'MathJax.Hub.Queue(["setRenderer", MathJax.Hub, "SVG"]);'
            'clearInterval(waitForPlotly);'
        '}}, 250 );'
)
display(HTML('<script>' + mathjax_setup_js + '</script>'))

import pandas as pd
from selected_features_boosting import selected_features_boosting
from sklearn.externals import joblib

# Short codes of the five personality traits that are predicted. The same codes
# are used as column names of the prediction table and as keys into
# `selected_features_boosting`.
TRAITS = [
    'agr',  # presumably agreeableness
    'con',  # presumably conscientiousness
    'ext',  # presumably extraversion
    'neu',  # presumably neuroticism
    'ope',  # presumably openness
]

# Predicting brand personalities

Using the crawled brand posts, we calculated LIWC features. Now we can predict the brands' personalities (Big Five traits) using our best predictor.

## Key figures of the crawled data

In [9]:
# Load the preprocessed data set; the first CSV column becomes the row index.
df = pd.read_csv('preprocessed_big5_new.csv', encoding="UTF-8", index_col=0)

# Keep only the feature columns (the LIWC output described in the text above).
keyfigures = df[[
    'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
    'function.', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron',
    'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj',
    'compare', 'interrog', 'number', 'quant',
    'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
    'social', 'family', 'friend', 'female', 'male',
    'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ',
    'percept', 'see', 'hear', 'feel',
    'bio', 'body', 'health', 'sexual', 'ingest',
    'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk',
    'focuspast', 'focuspresent', 'focusfuture',
    'relativ', 'motion', 'space', 'time',
    'work', 'leisure', 'home', 'money', 'relig', 'death',
    'informal', 'swear', 'netspeak', 'assent', 'nonflu', 'filler',
    'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash',
    'Quote', 'Apostro', 'Parenth', 'OtherP',
]]

# Rows with a positive 'Segment' value are the ones to predict on. reset_index()
# turns the former row index into an ordinary column and renumbers the rows 0..n-1.
prediction_data = df.loc[df['Segment'] > 0].reset_index()

Using the selected features from [Validation of selected features](https://hub.benedikt1992.de:8000/notebooks/Validation%20of%20Selected%20Features.ipynb), we can now predict the brands' personalities using different models:

- SVM (SVR)
- Decision Tree (Gradient Boosting)
- Neural Network

## SVM:

In [8]:
from cv_svr import RAND_SEED

# Predict every trait in TRAITS with its stored SVR model and write the result
# table to ./five-factor-values.csv.
#
# NOTE: this cell rebinds `prediction_data` to a copy without the id columns.
# Re-running it without first re-running the cell that loads `prediction_data`
# therefore fails with a KeyError on 'AccountID'.

# Columns that identify a row; they are copied into the result table and are
# not model features.
id_columns = ['Index', 'AccountID', 'PartyID', 'start', 'end']

# Result table: the id columns followed by one column per trait.
svr_prediction = pd.DataFrame(columns=[*id_columns, *TRAITS], index=range(0, len(prediction_data)))
for id_column in id_columns:
    svr_prediction[id_column] = prediction_data[id_column]

prediction_data = prediction_data.drop(['AccountID', 'PartyID', 'start', 'end', 'Segment'], axis=1)

# scale features
# joblib.load unpickles the file, so only load model files from a trusted source.
scaler = joblib.load("data/cv/scaler.model")
feature_frame = prediction_data.drop(['Index'], axis=1)
# Label the scaled matrix with the columns of the frame that was actually
# scaled, instead of assuming that 'Index' is the first column of prediction_data.
X_predict = pd.DataFrame(scaler.transform(feature_frame), columns=feature_frame.columns)

for trait in TRAITS:
    # One model file per trait; the trait code appears twice in the file name.
    name = trait + "_" + trait + "_" + str(RAND_SEED)
    model_name = "data/cv/boosted/univariate_svr_rbf_" + name + ".model"
    clf = joblib.load(model_name)

    # perform prediction based on selected features
    trait_features = list(selected_features_boosting[trait])
    print(*trait_features)
    svr_prediction[trait] = clf.predict(X_predict[trait_features])

print(svr_prediction)
# The CSV is written without a header row and without the row index.
file_name = './five-factor-values.csv'
svr_prediction.to_csv(file_name, sep=',', encoding='utf-8', index=False, header=False)


Trying to unpickle estimator StandardScaler from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.


Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Analytic Tone WPS Sixltr Dic function. they ipron article negate number negemo anger family female male bio sexual ingest affiliation power focuspast relativ motion time home money relig death swear Period Colon QMark Exclam Apostro



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Tone WPS Sixltr Dic function. i we they ipron article prep adverb conj quant negemo anger family tentat hear sexual drives relativ motion time work leisure money death informal swear assent AllPunc Period Comma Colon QMark Exclam OtherP



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Clout Tone WPS Sixltr function. we they adverb conj negemo sad social family friend female discrep tentat certain bio sexual drives affiliation focuspast focusfuture motion space work leisure home death informal netspeak nonflu AllPunc Period Comma Colon Exclam Apostro Parenth OtherP



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Clout Tone WPS Sixltr you shehe ipron article verb compare interrog number negemo anx sad family female male discrep see health ingest drives reward focusfuture motion space leisure home death informal swear assent Period QMark Exclam Apostro OtherP



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Clout Tone WPS Sixltr Dic i you article conj negate number negemo sad social family insight certain sexual drives power reward focuspast focuspresent focusfuture space time home relig death informal netspeak assent AllPunc Period Comma QMark Exclam Apostro OtherP
     Index  PartyID       start         end       agr       con       ext  \
0        0        1  2014-04-01  2014-06-30  3.554271  3.618112  3.524346   
1        1        1  2013-07-01  2013-09-30  3.502590  3.582245  3.491311   
2        2        1  2011-01-01  2011-03-31  3.418823  3.478592  3.416419   
3        3        1  2010-07-01  2010-09-30  3.454552  3.508563  3.449513   
4        4        1  2017-01-01  2017-03-31  3.476942  3.558103  3.573707   
5        5        1  2009-07-01  2009-09-30  3.448107  3.533374  3.434257   
6        6        1  2015-07-01  2015-09-30  3.504932  3.654864  3.564306   
7        7        1  2011-04-01  2011-06-30  3.443496  3.420468  3.377166   
8        8        1  2012-01-01  2012-03-31

In [None]:
# Bar chart of the SVR predictions: one bar trace per trait, plotted against
# the 'Index' column. Only the first trait is drawn initially; the others start
# as 'legendonly' and can be switched on by clicking their legend entry.
traces = [
    Bar(
        x=svr_prediction['Index'],
        y=svr_prediction[trait],
        name=trait,
        visible=True if position == 0 else 'legendonly',
    )
    for position, trait in enumerate(TRAITS)
]

# 'linear' is the plotly name for a linear axis; 'lin' is not a valid axis
# type and is rejected by plotly versions that validate layout properties.
layout = Layout(
    title='SVR Prediction results',
    xaxis=dict(type='linear', title='Index'),
    yaxis=dict(type='linear', title='Five Factor'),
)
fig = dict(data=traces, layout=layout)
offline.iplot(fig)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from math import pi

# Radar ("spider") chart comparing two trait profiles for Home Depot, saved as
# home_depot.svg.

# Set data: one column per trait, one row per profile. The row label doubles as
# the legend entry. The table gets its own name so that the data set held in
# `df` by the loading cell is not overwritten.
home_depot_scores = pd.DataFrame(
    {
        'OPE': [51, 54],
        'CON': [67, 69],
        'EXT': [36, 39],
        'AGR': [51, 63],
        'NEU': [32, 32],
    },
    index=["ApplyMagicSauce", "our SVR model prediction"],
)
# Fill colour of the shaded area, one per profile (same order as the rows).
fill_colors = ['b', 'r']

# ------- PART 1: Create background

# One axis (spoke) per trait
categories = list(home_depot_scores.columns)
n_axes = len(categories)

# Spread the spokes evenly over the full circle. The first angle is repeated at
# the end so that each outline closes on itself.
angles = [position / float(n_axes) * 2 * pi for position in range(n_axes)]
angles += angles[:1]

# Initialise the spider plot on an explicit figure/axes pair
fig, ax = plt.subplots(subplot_kw={'polar': True})

# Put the first axis on top and lay the others out clockwise
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

# Draw one axis per trait and label it with the trait name
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)

# Draw the radial tick labels along the first axis
ax.set_rlabel_position(0)
radial_ticks = [10, 20, 30, 40, 50, 60, 70]
ax.set_yticks(radial_ticks)
ax.set_yticklabels([str(tick) for tick in radial_ticks], color="grey", size=7)
ax.set_ylim(0, 70)

# ------- PART 2: Add plots

# One outline plus a lightly shaded area per profile (= per row of the table)
for (label, row), fill_color in zip(home_depot_scores.iterrows(), fill_colors):
    values = row.tolist()
    values += values[:1]  # close the outline
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=label)
    ax.fill(angles, values, fill_color, alpha=0.1)

# Add legend
ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1), title='Home Depot')
fig.savefig("home_depot.svg", bbox_inches="tight")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from math import pi

# Radar ("spider") chart comparing two trait profiles for CVS, saved as cvs.svg.

# Set data: one column per trait, one row per profile. The row label doubles as
# the legend entry. The table gets its own name so that the data set held in
# `df` by the loading cell is not overwritten.
cvs_scores = pd.DataFrame(
    {
        'OPE': [51, 54],
        'CON': [59, 68],
        'EXT': [45, 50],
        'AGR': [63, 53],
        'NEU': [42, 32],
    },
    index=["ApplyMagicSauce", "our SVR model prediction"],
)
# Fill colour of the shaded area, one per profile (same order as the rows).
fill_colors = ['b', 'r']

# ------- PART 1: Create background

# One axis (spoke) per trait
categories = list(cvs_scores.columns)
n_axes = len(categories)

# Spread the spokes evenly over the full circle. The first angle is repeated at
# the end so that each outline closes on itself.
angles = [position / float(n_axes) * 2 * pi for position in range(n_axes)]
angles += angles[:1]

# Initialise the spider plot on an explicit figure/axes pair
fig, ax = plt.subplots(subplot_kw={'polar': True})

# Put the first axis on top and lay the others out clockwise
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

# Draw one axis per trait and label it with the trait name
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)

# Draw the radial tick labels along the first axis
ax.set_rlabel_position(0)
radial_ticks = [10, 20, 30, 40, 50, 60, 70]
ax.set_yticks(radial_ticks)
ax.set_yticklabels([str(tick) for tick in radial_ticks], color="grey", size=7)
ax.set_ylim(0, 70)

# ------- PART 2: Add plots

# One outline plus a lightly shaded area per profile (= per row of the table)
for (label, row), fill_color in zip(cvs_scores.iterrows(), fill_colors):
    values = row.tolist()
    values += values[:1]  # close the outline
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=label)
    ax.fill(angles, values, fill_color, alpha=0.1)

# Add legend
ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1), title='CVS')
fig.savefig("cvs.svg", bbox_inches="tight")
