In [1]:
# import plotly components for offline display of diagrams
from IPython.display import display, HTML
import plotly.offline as offline
from plotly.graph_objs import *


# Initialise plotly's offline notebook mode.
offline.init_notebook_mode(connected=True)

# Enable LaTeX math symbols in plotly figures by switching MathJax to its SVG
# renderer with centred display alignment.
#
# The polling here is to ensure that plotly.js has already been loaded before
# setting display alignment in order to avoid a race condition.
#
# The callback also waits for `window.MathJax.Hub`: without that guard a
# front-end that has no global MathJax (or one without the `Hub` API) makes the
# callback throw before `clearInterval` is reached, so the error would repeat
# every 250 ms for the lifetime of the page. Polling gives up after 120 tries
# (about 30 seconds) instead of running forever.
display(HTML(
    '<script>'
        'var waitForPlotlyTries = 0;'
        'var waitForPlotly = setInterval( function() {'
            'if( typeof(window.Plotly) !== "undefined" && window.MathJax && window.MathJax.Hub ){'
                'MathJax.Hub.Config({ SVG: { font: "STIX-Web" }, displayAlign: "center" });'
                'MathJax.Hub.Queue(["setRenderer", MathJax.Hub, "SVG"]);'
                'clearInterval(waitForPlotly);'
            '} else if( ++waitForPlotlyTries >= 120 ){'
                'clearInterval(waitForPlotly);'
            '}}, 250 );'
    '</script>'
))

import pandas as pd
from selected_features_boosting import selected_features_boosting
from sklearn.externals import joblib

# Short codes of the five personality traits predicted in this notebook; they
# are used both as result column names and as keys into the per-trait feature
# selection.
TRAITS = [
    'agr',
    'con',
    'ext',
    'neu',
    'ope',
]

# Predicting brand personalities

Using the crawled brand posts, we calculated LIWC features. Now we can perform personality prediction (Big Five traits) using our best predictor.

## Key figures of the crawled data

In [2]:
# Load the preprocessed data; the first CSV column becomes the frame's index.
df = pd.read_csv(
    'preprocessed_big5.csv',
    encoding="UTF-8",
    index_col=0,
)

# Column subset kept for the key-figure overview of the crawled data.
keyfigures = df[[
    'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
    'function.', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
    'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
    'verb', 'adj', 'compare', 'interrog', 'number', 'quant',
    'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
    'social', 'family', 'friend', 'female', 'male',
    'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ',
    'percept', 'see', 'hear', 'feel',
    'bio', 'body', 'health', 'sexual', 'ingest',
    'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk',
    'focuspast', 'focuspresent', 'focusfuture',
    'relativ', 'motion', 'space', 'time',
    'work', 'leisure', 'home', 'money', 'relig', 'death',
    'informal', 'swear', 'netspeak', 'assent', 'nonflu', 'filler',
    'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam',
    'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP',
]]

# Keep only rows with a positive 'Segment' value; reset_index() moves the old
# index into a regular column and renumbers the rows from 0.
prediction_data = df.loc[df['Segment'] > 0].reset_index()

Using the selected features from [Validation of selected features](https://hub.benedikt1992.de:8000/notebooks/Validation%20of%20Selected%20Features.ipynb), we can now predict the brands' personalities using different models:

- SVM (SVR)
- Decision Tree (Gradient Boosting)
- Neural Network

## SVM:

In [6]:
from cv_svr import RAND_SEED

# Identifier columns that are copied unchanged into the result frame.
ID_COLUMNS = ['Index', 'AccountID', 'PartyID', 'year', 'quarter']

# NOTE: this cell is not idempotent. It removes columns from `prediction_data`
# further down, so the cell that builds `prediction_data` has to be re-run
# before this one can be executed a second time.

# Start the result from the identifier columns; one prediction column per
# trait is appended in the loop below, giving the column order
# ID_COLUMNS + TRAITS.
svr_prediction = prediction_data[ID_COLUMNS].copy()

# Strip everything that is not a model input (except 'Index', which is removed
# just before scaling). The reassignment is kept so that `prediction_data`
# has the same content after this cell as before this rewrite.
prediction_data = prediction_data.drop(['AccountID', 'PartyID', 'year', 'quarter', 'Segment'], axis=1)

# scale features
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load scaler/model files that come from a trusted source.
scaler = joblib.load("cv/scaler.model")
feature_frame = prediction_data.drop(['Index'], axis=1)
# Cast to float64 up front: the scaler otherwise converts the mixed
# int64/float64 input itself and emits a conversion warning for it.
# The column labels are taken from the frame that was actually scaled rather
# than from a positional slice of `prediction_data.columns`.
X_predict = pd.DataFrame(
    scaler.transform(feature_frame.astype('float64')),
    columns=feature_frame.columns,
)

for trait in TRAITS:
    name = trait + "_" + trait + "_" + str(RAND_SEED)
    model_name = "cv/boosted/univariate_svr_rbf_" + name + ".model"
    clf = joblib.load(model_name)

    # perform prediction based on selected features
    selected = list(selected_features_boosting[trait])
    print(*selected)
    svr_prediction[trait] = clf.predict(X_predict[selected])

print(svr_prediction)

# Write the same header-less CSV to the notebook directory and to the setup
# directory.
for file_name in ('./five-factor-values.csv', '../../../setup/querys/five-factor-values.csv'):
    svr_prediction.to_csv(file_name, sep=',', encoding='utf-8', index=False, header=False)


Trying to unpickle estimator StandardScaler from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.


Data with input dtype int64, float64 were all converted to float64 by StandardScaler.


Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Analytic Tone WPS Sixltr Dic function. they ipron article negate number negemo anger family female male bio sexual ingest affiliation power focuspast relativ motion time home money relig death swear Period Colon QMark Exclam Apostro



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Tone WPS Sixltr Dic function. i we they ipron article prep adverb conj quant negemo anger family tentat hear sexual drives relativ motion time work leisure money death informal swear assent AllPunc Period Comma Colon QMark Exclam OtherP



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Clout Tone WPS Sixltr function. we they adverb conj negemo sad social family friend female discrep tentat certain bio sexual drives affiliation focuspast focusfuture motion space work leisure home death informal netspeak nonflu AllPunc Period Comma Colon Exclam Apostro Parenth OtherP



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Clout Tone WPS Sixltr you shehe ipron article verb compare interrog number negemo anx sad family female male discrep see health ingest drives reward focusfuture motion space leisure home death informal swear assent Period QMark Exclam Apostro OtherP



Trying to unpickle estimator SVR from version 0.19.1 when using version 0.20.3. This might lead to breaking code or invalid results. Use at your own risk.



Clout Tone WPS Sixltr Dic i you article conj negate number negemo sad social family insight certain sexual drives power reward focuspast focuspresent focusfuture space time home relig death informal netspeak assent AllPunc Period Comma QMark Exclam Apostro OtherP
      Index  AccountID  PartyID  year  quarter       agr       con       ext  \
0         0         10        3  2009        1  3.495984  3.778752  3.326412   
1         1         10        3  2009        2  3.460864  3.550169  3.423918   
2         2         10        3  2009        3  3.485042  3.653572  3.444237   
3         3         10        3  2009        4  3.464628  3.577998  3.359501   
4         4         10        3  2010        1  3.505169  3.699309  3.437948   
5         5         10        3  2010        2  3.504753  3.493131  3.323795   
6         6         10        3  2010        3  3.401656  3.398983  3.047418   
7         7         10        3  2011        1  3.386236  3.459259  3.327633   
8         8     