## Libraries

In [18]:
import os
import re
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import sklearn
import pickle
np.random.seed(50)

## Dataset Processing Functions
- [Docs: Handling XML](https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=xml#xml)

In [2]:
# Directory that holds truth.txt (author labels) and the per-author timeline
# files; later cells join file names onto this path.
DATASET_PATH = './dataset/english/'
#os.listdir(DATASET_PATH)

In [3]:
# Field order of each ':::'-separated record in truth.txt; also used as the
# column names of the author-label DataFrame.
LABELS = [
    'userid',
    'gender',
    'age_group',
    'extroverted',
    'stable',
    'agreeable',
    'conscientious',
    'open',
]

# Placeholder tokens substituted into tweets for URLs and reply mentions.
URL_TAG = 'URL'
REPLY_TAG = 'REP'

# Patterns that locate the spans replaced by the tags above.
URL_REGEX = re.compile(r'(https?|ftp)://[^\s]*')
REPLY_REGEX = re.compile(r'@username')

In [4]:
# extract content from XML format
def ExtractTweets(xml):
    """
    Return the text of every <document> element in a Twitter timeline.

    xml: XML markup (string or open file handle) passed straight to
         BeautifulSoup with the 'xml' parser.
    Returns a list with one whitespace-stripped string per <document> element,
    in document order. No further text cleaning is applied here.
    """
    soup = BeautifulSoup(xml, 'xml')
    tweets = []
    for document in soup.find_all("document"):
        tweets.append(document.text.strip())
    return tweets

# Stop-word sets keyed by language. ProcessTweet is called once per tweet, so
# caching avoids rebuilding the same set from the NLTK corpus on every call.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the stop-word set for `language`, building it on first use only."""
    stops = _STOPWORD_SETS.get(language)
    if stops is None:
        stops = _STOPWORD_SETS[language] = set(stopwords.words(language))
    return stops


def ProcessTweet(text, language='english'):
    """
    Text processing for a single tweet.

    Steps, in order:
      1. replace URLs with URL_TAG and '@username' mentions with REPLY_TAG;
      2. shorten any run of 4+ identical characters to exactly 3;
      3. replace every character outside a-z / A-Z with a space;
      4. lower-case, split on whitespace and drop stop words.

    text: raw tweet text.
    language: name of the NLTK stop-word list to use.
    Returns the remaining words joined by single spaces.
    """
    text = URL_REGEX.sub(URL_TAG, text) # set URL tags
    text = REPLY_REGEX.sub(REPLY_TAG, text) # set REPLY tags
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text) # trim repeated chars
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()   # lower case
    stops = _stopword_set(language)        # remove stopwords
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)

# ====== FOR ADDITIONAL USER FEATURES ======

# number of emoticons
def CountEmoticons(string):
    """Return how many emoticons (e.g. ':)', ';-P', '<3', 'xD') occur in `string`."""
    emoticon_pattern = r'(?::|;|:\'|=)(?:-)?(?:\)|\(|D|P|d|p|3)|<3|</3|xd|xD|XD'
    return sum(1 for _ in re.finditer(emoticon_pattern, string))

# number of character repetitions
def CountRepetitions(string):
    """Return the number of runs of four or more identical word characters in `string`."""
    repeated_runs = re.finditer(r'(\w)\1{3,}', string)
    return sum(1 for _ in repeated_runs)

# number of replies
def CountReplies(string):
    """Return how many times the literal mention token '@username' occurs in `string`."""
    mentions = re.findall(r'@username', string)
    total = len(mentions)
    return total

# number of hashtags
def CountHashtags(string):
    """Return the number of hashtags ('#' followed by one or more word characters) in `string`."""
    return sum(1 for _ in re.finditer(r'#(\w+)', string))

# number of exclamation marks
def CountExclamations(string):
    """Return the number of exclamation-mark runs in `string` ('!!!' counts once)."""
    exclamation_runs = re.findall(r'!+', string)
    return len(exclamation_runs)

def GenerateAdditionalFeatures(lst):
    """
    Compute per-timeline averages of several stylistic tweet features.

    lst: iterable of timelines, each timeline being a collection of raw tweet
         strings.
    Returns a list of six lists, each holding one value per timeline, in this
    order:
    (1) average emoticon count
    (2) average hashtag count
    (3) average exclamation-run count
    (4) average reply ('@username') count
    (5) average character-repetition count
    (6) average tweet length in characters
    """
    # One measure per output list; the tuple order fixes the return order.
    measures = (
        CountEmoticons,
        CountHashtags,
        CountExclamations,
        CountReplies,
        CountRepetitions,
        len,
    )
    averages = [[] for _ in measures]

    for timeline in lst:
        for column, measure in zip(averages, measures):
            column.append(np.mean([measure(tweet) for tweet in timeline]))

    return averages

In [5]:
# Author Labels

# truth.txt holds one author per line with ':::'-separated fields in LABELS
# order. The context manager closes the file handle once it has been read
# (the bare open() used previously left it open).
user_details = []
with open(os.path.join(DATASET_PATH, 'truth.txt'), 'r') as truth_file:
    for labels in truth_file:
        user_details.append(list(map(str.strip, labels.split(':::'))))

user_df = pd.DataFrame(data=user_details, columns=LABELS)

# Identifier / categorical columns stay strings; trait scores become floats.
for column in ('userid', 'gender', 'age_group'):
    user_df[column] = user_df[column].apply(str)
for column in ('extroverted', 'stable', 'agreeable', 'conscientious', 'open'):
    user_df[column] = user_df[column].apply(float)
user_df

Unnamed: 0,userid,gender,age_group,extroverted,stable,agreeable,conscientious,open
0,user552,M,25-34,0.3,0.5,0.1,0.2,0.2
1,user865,M,25-34,0.2,0.4,0.2,0.0,0.2
2,user103,M,18-24,0.1,0.2,0.1,0.1,0.1
3,user179,F,18-24,0.1,0.1,-0.1,0.5,0.1
4,user321,F,18-24,0.0,-0.1,0.1,0.3,0.4
...,...,...,...,...,...,...,...,...
147,user709,M,25-34,0.2,0.2,0.1,0.1,0.5
148,user931,F,18-24,0.5,0.3,0.0,0.1,0.3
149,user96,F,25-34,0.4,-0.3,0.0,0.1,0.3
150,user551,M,25-34,0.3,0.5,0.1,0.2,0.2


In [6]:
# Author Tweets
# Every entry in DATASET_PATH except the labels file is read as one author's
# XML timeline; the file name without its extension is used as the userid.
user_xmls = [file for file in os.listdir(DATASET_PATH) if file != 'truth.txt']

data = []
for xml_filename in user_xmls:
    userid, _extension = os.path.splitext(xml_filename)
    path = os.path.join(DATASET_PATH, xml_filename)
    with open(path, 'r', encoding='utf-8') as timeline:
        tweets = ExtractTweets(timeline)
    data.append([userid, tweets])

data = pd.DataFrame(data=data, columns=['userid', 'tweets'])
data

Unnamed: 0,userid,tweets
0,user1,"[""Fun is the enjoyment of pleasure"", @username..."
1,user1007,[I just became the mayor of Porta Romana on @u...
2,user103,[@username @username @username @username #IfM...
3,user104,[What You're Saying With Your Facial Hair http...
4,user112,"[@username sactan ne haber?, Aurora borealis i..."
...,...,...
147,user96,[Nasa to turn ISS into perfect Earth-observing...
148,user97,[A tour of the International Space Station - h...
149,user973,"[@username @username ay friend, q te fumasteSS..."
150,user989,[-2 to help save tha interwebs: http://t.co/Kf...


In [7]:
# GenerateAdditionalFeatures returns one list per feature, so the array is
# transposed to get one row per author and one column per feature.
# This reads the raw tweet lists in data['tweets'], so it has to run before
# that column is overwritten with cleaned text.
feature_lists = GenerateAdditionalFeatures(data['tweets'])
additional_features = pd.DataFrame(
    np.transpose(np.array(feature_lists)),
    columns=[
        'avg_emoticon',
        'avg_hashtag',
        'avg_exclamation',
        'avg_reply',
        'avg_repetition',
        'avg_tweet_len',
    ],
)
additional_features

Unnamed: 0,avg_emoticon,avg_hashtag,avg_exclamation,avg_reply,avg_repetition,avg_tweet_len
0,0.010526,0.178947,0.284211,0.821053,0.000000,90.936842
1,0.040000,0.230000,0.410000,0.760000,0.020000,79.670000
2,0.130000,0.500000,0.150000,1.330000,0.000000,97.490000
3,0.020000,0.490000,0.110000,0.580000,0.010000,86.180000
4,0.050000,0.350000,0.200000,0.330000,0.040000,78.000000
...,...,...,...,...,...,...
147,0.030000,0.010000,0.000000,0.010000,0.000000,111.970000
148,0.010309,0.020619,0.020619,0.041237,0.000000,108.886598
149,0.053571,0.339286,0.589286,1.339286,0.125000,78.821429
150,0.060000,0.350000,0.130000,0.630000,0.000000,79.210000


In [8]:
# Clean every tweet and join each author's timeline into a single document.
# The isinstance guard keeps the cell idempotent: on a re-run the column
# already holds joined strings, and mapping ProcessTweet over a string would
# iterate its characters and corrupt the text.
data['tweets'] = data['tweets'].apply(
    lambda x: " ".join(map(ProcessTweet, x)) if isinstance(x, list) else x
)
data

Unnamed: 0,userid,tweets
0,user1,fun enjoyment pleasure rep det fanns ett utvik...
1,user1007,became mayor porta romana rep url became mayor...
2,user103,rep rep rep rep ifmynamewas yashchopra name le...
3,user104,saying facial hair url via rep rep south india...
4,user112,rep sactan ne haber aurora borealis icin yasiy...
...,...,...
147,user96,nasa turn iss perfect earth observing platform...
148,user97,tour international space station url via url s...
149,user973,rep rep ay friend q te fumastesss invita rep y...
150,user989,help save tha interwebs url rep savethekitten ...


In [9]:
# Attach the stylistic features column-wise (rows are aligned by position).
# Selecting the two base columns explicitly keeps the cell idempotent:
# re-running it no longer appends a second copy of the feature columns.
data = pd.concat([data[['userid', 'tweets']], additional_features], axis=1)
data

Unnamed: 0,userid,tweets,avg_emoticon,avg_hashtag,avg_exclamation,avg_reply,avg_repetition,avg_tweet_len
0,user1,fun enjoyment pleasure rep det fanns ett utvik...,0.010526,0.178947,0.284211,0.821053,0.000000,90.936842
1,user1007,became mayor porta romana rep url became mayor...,0.040000,0.230000,0.410000,0.760000,0.020000,79.670000
2,user103,rep rep rep rep ifmynamewas yashchopra name le...,0.130000,0.500000,0.150000,1.330000,0.000000,97.490000
3,user104,saying facial hair url via rep rep south india...,0.020000,0.490000,0.110000,0.580000,0.010000,86.180000
4,user112,rep sactan ne haber aurora borealis icin yasiy...,0.050000,0.350000,0.200000,0.330000,0.040000,78.000000
...,...,...,...,...,...,...,...,...
147,user96,nasa turn iss perfect earth observing platform...,0.030000,0.010000,0.000000,0.010000,0.000000,111.970000
148,user97,tour international space station url via url s...,0.010309,0.020619,0.020619,0.041237,0.000000,108.886598
149,user973,rep rep ay friend q te fumastesss invita rep y...,0.053571,0.339286,0.589286,1.339286,0.125000,78.821429
150,user989,help save tha interwebs url rep savethekitten ...,0.060000,0.350000,0.130000,0.630000,0.000000,79.210000


In [10]:
# Join tweets/features with the author labels. 'userid' is the only column the
# two frames share, so it is the join key; authors missing from either side
# are dropped by the inner join.
dataset = pd.merge(data, user_df, on='userid', how='inner')

## Dataset

In [11]:
dataset

Unnamed: 0,userid,tweets,avg_emoticon,avg_hashtag,avg_exclamation,avg_reply,avg_repetition,avg_tweet_len,gender,age_group,extroverted,stable,agreeable,conscientious,open
0,user1,fun enjoyment pleasure rep det fanns ett utvik...,0.010526,0.178947,0.284211,0.821053,0.000000,90.936842,M,50-XX,0.4,0.3,0.5,0.2,0.3
1,user1007,became mayor porta romana rep url became mayor...,0.040000,0.230000,0.410000,0.760000,0.020000,79.670000,F,25-34,0.5,0.1,0.1,0.1,0.1
2,user103,rep rep rep rep ifmynamewas yashchopra name le...,0.130000,0.500000,0.150000,1.330000,0.000000,97.490000,M,18-24,0.1,0.2,0.1,0.1,0.1
3,user104,saying facial hair url via rep rep south india...,0.020000,0.490000,0.110000,0.580000,0.010000,86.180000,M,18-24,0.1,0.2,0.1,0.1,0.1
4,user112,rep sactan ne haber aurora borealis icin yasiy...,0.050000,0.350000,0.200000,0.330000,0.040000,78.000000,F,18-24,-0.1,0.3,0.2,0.0,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,user96,nasa turn iss perfect earth observing platform...,0.030000,0.010000,0.000000,0.010000,0.000000,111.970000,F,25-34,0.4,-0.3,0.0,0.1,0.3
148,user97,tour international space station url via url s...,0.010309,0.020619,0.020619,0.041237,0.000000,108.886598,F,25-34,0.4,-0.3,0.0,0.1,0.3
149,user973,rep rep ay friend q te fumastesss invita rep y...,0.053571,0.339286,0.589286,1.339286,0.125000,78.821429,F,35-49,0.0,0.2,0.2,0.3,0.2
150,user989,help save tha interwebs url rep savethekitten ...,0.060000,0.350000,0.130000,0.630000,0.000000,79.210000,M,35-49,0.2,0.1,0.3,0.3,0.1


In [12]:
# Model inputs: the cleaned text document plus the six stylistic averages.
X = dataset.loc[:, [
    'tweets',
    'avg_emoticon',
    'avg_hashtag',
    'avg_exclamation',
    'avg_reply',
    'avg_repetition',
    'avg_tweet_len',
]]
# Regression targets: one column per personality trait.
y = dataset.loc[:, ['extroverted', 'stable', 'agreeable', 'conscientious', 'open']]

## Machine Learning Models

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [14]:
def ModelEvaluator(X_train, X_test, y_train, y_test, model, parameters, scoring=None, nb_folds=5, test_size=0.3):
    """
    Grid-search `model` on the training data and report its test-set error.

    X_train, y_train: data used for the cross-validated grid search.
    X_test, y_test:   held-out data used only for the printed score and RMSE.
    model:            estimator to tune.
    parameters:       parameter grid handed to GridSearchCV ({} fits the
                      estimator once with its current settings).
    scoring:          GridSearchCV scoring argument.
    nb_folds:         number of cross-validation folds.
    test_size:        unused; kept so existing callers remain valid.

    Prints the best estimator, its parameters, its `score` on the test data
    and its RMSE.
    Returns (best_estimator, rmse, best_params).
    """
    grid_cv = GridSearchCV(model, parameters, scoring=scoring, n_jobs=-1, verbose=1, cv=nb_folds)
    grid_cv.fit(X_train, y_train)

    estimator = grid_cv.best_estimator_

    print(f'model: {estimator}')
    print(f'best_params: {grid_cv.best_params_}')
    print(f'model score : {estimator.score(X_test, y_test)}')

    y_pred = estimator.predict(X_test)

    # mean_squared_error is documented as (y_true, y_pred); the value is the
    # same either way, but the documented order avoids confusion.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f'RMSE: {rmse}')

    return estimator, rmse, grid_cv.best_params_

In [19]:
def Evaluate_Models(trait_name, X, y, models, parameters, nb_folds=5, scoring='neg_mean_squared_error'):
    """
    Train every candidate model for one trait and keep the one with the lowest RMSE.

    trait_name: column of `y` to predict; also the prefix of the pickle files.
    X:          DataFrame with a 'tweets' text column and the six avg_* columns.
    y:          DataFrame of trait scores.
    models:     estimators to try.
    parameters: parameter grids, paired with `models` by position.
    nb_folds:   cross-validation folds used in the grid search.
    scoring:    GridSearchCV scoring argument.

    Side effects: writes '<trait_name>_vectorizer.pkl' (the fitted TF-IDF
    vectorizer) and '<trait_name>_model.pkl' (the selected estimator) to the
    current working directory, and prints progress information.
    Returns the selected estimator (None if no model produced a comparable RMSE).

    NOTE(review): the winner is chosen by RMSE on the same 20% split that is
    reported, so the printed RMSE is an optimistic estimate.
    """
    feature_columns = ['avg_emoticon', 'avg_hashtag', 'avg_exclamation', 'avg_reply', 'avg_repetition', 'avg_tweet_len']

    X_train, X_test, y_train, y_test = train_test_split(X, y[trait_name], test_size=0.2)

    # Character tri-gram TF-IDF, fitted on the training text only.
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
    X_train_text_vec = vectorizer.fit_transform(X_train['tweets']).toarray()
    X_test_text_vec = vectorizer.transform(X_test['tweets']).toarray()
    print(f'x_train shape: {X_train_text_vec.shape}')
    print(f'x_test shape: {X_test_text_vec.shape}')

    # The context manager flushes and closes the file; a bare open() inside
    # pickle.dump left the handle open.
    with open(f'{trait_name}_vectorizer.pkl', 'wb') as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    X_train_features_vec = np.array(X_train[feature_columns])
    X_test_features_vec = np.array(X_test[feature_columns])

    # Final input: TF-IDF columns followed by the six stylistic features.
    X_train_vec = np.column_stack([X_train_text_vec, X_train_features_vec])
    X_test_vec = np.column_stack([X_test_text_vec, X_test_features_vec])
    print(f'x_train_input shape: {X_train_vec.shape}')
    print(f'x_test_input shape: {X_test_vec.shape}')

    y_train_vec, y_test_vec = np.array(y_train), np.array(y_test)

    best_model = None
    # Start from +inf so the first evaluated model is always accepted; the
    # previous sentinel of 1 left best_model as None whenever every RMSE was >= 1.
    min_RMSE = float('inf')
    best_params = {}

    for model, parameter in zip(models, parameters):

        print(f"====== Evaluating <{str(model).split('(')[0]}> ======")
        estimator, rmse, param = ModelEvaluator(X_train_vec, X_test_vec, y_train_vec, y_test_vec, model, parameter, nb_folds=nb_folds, scoring=scoring)

        if rmse < min_RMSE:
            best_model = estimator
            min_RMSE = rmse
            best_params = param
        print()

    print('\n')
    print(f"Best Model: {best_model}")
    print(f"RMSE: {min_RMSE}")
    print(f"Best Parameters: {best_params}")
    # serialize
    with open(f'{trait_name}_model.pkl', 'wb') as model_file:
        pickle.dump(best_model, model_file)

    return best_model

In [20]:
# Candidate regressors, tried in this order.
MODELS = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    SVR(),
]
# parameters for each of the models (paired with MODELS by position;
# an empty grid means the model is fitted with its default settings)
MODEL_PARAMS = [
    {},
    {},
    {},
    {'kernel':('linear', 'rbf','poly','sigmoid'), 'C':np.linspace(1, 100,10)},
]

In [21]:
# Select the best regressor for the 'extroverted' trait. As a side effect this
# writes extroverted_vectorizer.pkl and extroverted_model.pkl.
extroverted_model = Evaluate_Models('extroverted', X, y, MODELS, MODEL_PARAMS)

x_train shape: (121, 7996)
x_test shape: (31, 7996)
x_train_input shape: (121, 8002)
x_test_input shape: (31, 8002)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


model: LinearRegression()
best_params: {}
model score : 0.2716115097916242
RMSE: 0.14541926495121907

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


model: DecisionTreeRegressor()
best_params: {}
model score : -0.9888888888888894
RMSE: 0.24029551698662832

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.0s finished


model: RandomForestRegressor()
best_params: {}
model score : 0.15336999999999978
RMSE: 0.15677876337425709

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.9s


model: SVR(C=100.0)
best_params: {'C': 100.0, 'kernel': 'rbf'}
model score : -0.1750514886158614
RMSE: 0.18470083393582587



Best Model: LinearRegression()
RMSE: 0.14541926495121907
Best Parameters: {}


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    8.3s finished


In [22]:
# Select the best regressor for the 'stable' trait. As a side effect this
# writes stable_vectorizer.pkl and stable_model.pkl.
stable_model = Evaluate_Models('stable', X, y, MODELS, MODEL_PARAMS)

x_train shape: (121, 8068)
x_test shape: (31, 8068)
x_train_input shape: (121, 8074)
x_test_input shape: (31, 8074)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


model: LinearRegression()
best_params: {}
model score : 0.2429091871865643
RMSE: 0.17077744092587838

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


model: DecisionTreeRegressor()
best_params: {}
model score : -1.8889789303079412
RMSE: 0.33360204223092693

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.7s finished


model: RandomForestRegressor()
best_params: {}
model score : -0.0724576445164773
RMSE: 0.2032575039284548

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   10.1s finished


model: SVR(C=12.0, kernel='linear')
best_params: {'C': 12.0, 'kernel': 'linear'}
model score : 0.15466969311089795
RMSE: 0.18045533794186114



Best Model: LinearRegression()
RMSE: 0.17077744092587838
Best Parameters: {}


In [23]:
# Select the best regressor for the 'agreeable' trait. As a side effect this
# writes agreeable_vectorizer.pkl and agreeable_model.pkl.
agreeable_model = Evaluate_Models('agreeable', X, y, MODELS, MODEL_PARAMS)

x_train shape: (121, 7965)
x_test shape: (31, 7965)
x_train_input shape: (121, 7971)
x_test_input shape: (31, 7971)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


model: LinearRegression()
best_params: {}
model score : 0.06029760852168364
RMSE: 0.11429749210507276

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


model: DecisionTreeRegressor()
best_params: {}
model score : -3.2462574850299406
RMSE: 0.24296554913097484

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.3s finished


model: RandomForestRegressor()
best_params: {}
model score : -0.049039071856286975
RMSE: 0.12076396599926223

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.6s


model: SVR(C=89.0)
best_params: {'C': 89.0, 'kernel': 'rbf'}
model score : -0.0719939086573873
RMSE: 0.12207808106642343



Best Model: LinearRegression()
RMSE: 0.11429749210507276
Best Parameters: {}


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    7.2s finished


In [24]:
# Select the best regressor for the 'conscientious' trait. As a side effect
# this writes conscientious_vectorizer.pkl and conscientious_model.pkl.
conscientious_model = Evaluate_Models('conscientious', X, y, MODELS, MODEL_PARAMS)

x_train shape: (121, 8052)
x_test shape: (31, 8052)
x_train_input shape: (121, 8058)
x_test_input shape: (31, 8058)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
model: LinearRegression()
best_params: {}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


model score : -0.1903055148750885
RMSE: 0.1655236636714224

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


model: DecisionTreeRegressor()
best_params: {}
model score : -1.1722423146473782
RMSE: 0.223606797749979

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.9s finished


model: RandomForestRegressor()
best_params: {}
model score : -0.013569665461121039
RMSE: 0.15274171747778834

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    7.6s finished


model: SVR(kernel='linear')
best_params: {'C': 1.0, 'kernel': 'linear'}
model score : 0.01325904952404422
RMSE: 0.1507066594943641



Best Model: SVR(kernel='linear')
RMSE: 0.1507066594943641
Best Parameters: {'C': 1.0, 'kernel': 'linear'}


In [25]:
# Select the best regressor for the 'open' trait. As a side effect this
# writes open_vectorizer.pkl and open_model.pkl.
open_model = Evaluate_Models('open', X, y, MODELS, MODEL_PARAMS)

x_train shape: (121, 8000)
x_test shape: (31, 8000)
x_train_input shape: (121, 8006)
x_test_input shape: (31, 8006)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


model: LinearRegression()
best_params: {}
model score : 0.418773629934887
RMSE: 0.1161350717625401

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


model: DecisionTreeRegressor()
best_params: {}
model score : -0.5708520179372207
RMSE: 0.19092305492848635

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.1s finished


model: RandomForestRegressor()
best_params: {}
model score : 0.17119901345291433
RMSE: 0.1386806496258406

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    7.2s finished


model: SVR(kernel='linear')
best_params: {'C': 1.0, 'kernel': 'linear'}
model score : 0.29371767491488554
RMSE: 0.12802061427314798



Best Model: LinearRegression()
RMSE: 0.1161350717625401
Best Parameters: {}


## Testing

In [26]:
class Dataset(object):
    """
    One user's timeline prepared for personality prediction.

    Holds the raw tweet texts, the six averaged stylistic features and the
    cleaned, concatenated text document. Typical use:

        ds = Dataset(posts)          # posts: [{"text": "..."}, ...]
        documents, features = ds.process()
    """

    # Stop-word sets shared by all instances, keyed by language, so the set is
    # built once per language instead of once per tweet.
    _stopword_sets = {}

    def __init__(self, timeline_list):
        """
        timeline_list: iterable of dicts that each provide the tweet under the
                       "text" key, or None for an empty timeline.
        """
        self.timeline = [] # a list of unprocessed tweets (str)
        self.avg_emoticon_counts = None # additional features
        self.avg_hashtag_counts = None
        self.avg_exclamation_counts = None
        self.avg_reply_counts = None
        self.avg_repetition_counts = None
        self.avg_tweet_len = None
        self.processed_timeline = None

        if timeline_list is not None:
            for tweet_dict in timeline_list:
                self.timeline.append(tweet_dict["text"])

    def get_attributes(self):
        """
        Return the average emoticon count.

        NOTE(review): despite the plural name only this single feature is
        returned; the remaining features were commented out of the original
        return statement. Confirm whether callers expect the full list.
        """
        return self.avg_emoticon_counts

    def _feature_vector(self):
        """Return the six stored features in the order the models were trained on."""
        return [
            self.avg_emoticon_counts,
            self.avg_hashtag_counts,
            self.avg_exclamation_counts,
            self.avg_reply_counts,
            self.avg_repetition_counts,
            self.avg_tweet_len,
        ]

    def process(self):
        """
        Make the data ready for prediction.

        Returns a tuple ([document], features): `document` is every cleaned
        tweet joined by spaces, `features` is the list of six averages.
        The features are computed here if generate_features() has not been
        called yet, so the result never contains unset (None) placeholders
        for a regular timeline.
        """
        self.processed_timeline = " ".join(list(map(self.process_tweet, self.timeline)))

        if any(value is None for value in self._feature_vector()):
            self.generate_features()

        return [self.processed_timeline], self._feature_vector()

    def process_tweet(self, text, language='english'):
        """
        Text processing for a single tweet: tag URLs and replies, shorten runs
        of 4+ identical characters to 3, keep ASCII letters only, lower-case
        and remove stop words. Returns the remaining words joined by spaces.
        """
        text = URL_REGEX.sub(URL_TAG, text) # set URL tags
        text = REPLY_REGEX.sub(REPLY_TAG, text) # set REPLY tags
        text = re.sub(r'(.)\1{3,}', r'\1\1\1', text) # trim repeated chars
        letters_only = re.sub("[^a-zA-Z]", " ", text)
        words = letters_only.lower().split()   # lower case
        stops = self._stopword_sets.get(language)
        if stops is None:
            stops = self._stopword_sets[language] = set(stopwords.words(language))
        meaningful_words = [w for w in words if w not in stops]  # remove stopwords
        return " ".join(meaningful_words)

    def _average_per_tweet(self, measure):
        """
        Average `measure(tweet)` over the timeline.

        Returns None when the timeline attribute is None (as before) and 0.0
        for an empty timeline, which previously raised ZeroDivisionError.
        """
        if self.timeline is None:
            return None
        if not self.timeline:
            return 0.0
        return sum(measure(tweet) for tweet in self.timeline) / len(self.timeline)

    # number of emoticons
    def count_avg_emoticons(self):
        """Average number of emoticons per tweet."""
        return self._average_per_tweet(
            lambda tweet: len(re.findall(r'(?::|;|:\'|=)(?:-)?(?:\)|\(|D|P|d|p|3)|<3|</3|xd|xD|XD', tweet))
        )

    # number of character repetitions
    def count_avg_repetitions(self):
        """Average number of runs of 4+ identical word characters per tweet."""
        return self._average_per_tweet(lambda tweet: len(re.findall(r'(\w)\1{3,}', tweet)))

    # number of replies
    def count_avg_replies(self):
        """Average number of '@username' tokens per tweet."""
        return self._average_per_tweet(lambda tweet: len(re.findall(r'@username', tweet)))

    # number of hashtags
    def count_avg_hashtags(self):
        """Average number of hashtags per tweet."""
        return self._average_per_tweet(lambda tweet: len(re.findall(r'#(\w+)', tweet)))

    # number of exclamation marks
    def count_avg_exclamations(self):
        """Average number of exclamation-mark runs per tweet."""
        return self._average_per_tweet(lambda tweet: len(re.findall(r'!+', tweet)))

    # tweet len
    def count_avg_tweet_len(self):
        """Average tweet length in characters."""
        return self._average_per_tweet(len)

    def generate_features(self):
        """
        Given timeline, generate additional features:
        (1) emoticons
        (2) character repetitions
        (3) hashtags
        (4) exclamation mark
        (5) replies
        (6) len of tweet
        The results are stored on the instance; nothing is returned.
        """
        self.avg_emoticon_counts = self.count_avg_emoticons()
        self.avg_hashtag_counts = self.count_avg_hashtags()
        self.avg_exclamation_counts = self.count_avg_exclamations()
        self.avg_reply_counts = self.count_avg_replies()
        self.avg_repetition_counts = self.count_avg_repetitions()
        self.avg_tweet_len = self.count_avg_tweet_len()

In [27]:
# Sample input for the prediction demo: a list of dicts, each carrying one
# post under the "text" key (the shape Dataset.__init__ reads).
post_data = [
    {"text": "Fantastic work from @OliverDowden and the DCMS team getting rehearsals, set building, and streaming agreed in new guidance. The next month will be tough for everyone; these measures will make all the difference. Thank you. - ALW"}, 
    {"text": "Marvellous work by Henry and his team in this project. Keep up the good work."}
    ]

In [28]:
# Wrap the sample posts and compute their averaged stylistic features.
# NOTE(review): this rebinds `data`, which earlier in the notebook referred to
# the training DataFrame; a distinct name would avoid confusion on re-runs.
data = Dataset(post_data)
data.generate_features()
# Trait names; they are also the prefixes of the pickled vectorizer/model files.
personalities = ['extroverted', 'stable', 'agreeable', 'conscientious', 'open']

In [29]:
# process() returns a one-element list with the cleaned, joined text and the
# list of six stylistic features stored on the Dataset.
processed_timeline, features = data.process()

In [30]:
features

[0.0, 0.0, 0.0, 0.0, 0.0, 152.5]

In [37]:
# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# load the .pkl files written by this notebook, never files from an untrusted
# source.

# load all the corresponding vectorizers for each trait
vectorizers = {}
for personality in personalities:
    if os.path.isfile(f'{personality}_vectorizer.pkl'):
        # the context manager closes the handle that a bare open() would leak
        with open(f'{personality}_vectorizer.pkl', 'rb') as vectorizer_file:
            vectorizers[personality] = pickle.load(vectorizer_file)

# load all the existing models for each trait
models = {}
for personality in personalities:
    if os.path.isfile(f'{personality}_model.pkl'):
        with open(f'{personality}_model.pkl', 'rb') as model_file:
            models[personality] = pickle.load(model_file)

# make prediction for each trait
output = {}
for personality in personalities:

    # Same layout as in training: TF-IDF columns followed by the six features.
    text_vec = vectorizers[personality].transform(processed_timeline).toarray()
    print(personality)
    print(text_vec.shape)
    feature_vec = np.array([features])
    print(feature_vec.shape)
    input_vec = np.column_stack([text_vec, feature_vec])
    print(input_vec.shape)

    output[personality] = round(models[personality].predict(input_vec)[0],2)

extroverted
(1, 7996)
(1, 6)
(1, 8002)
stable
(1, 8068)
(1, 6)
(1, 8074)
agreeable
(1, 7965)
(1, 6)
(1, 7971)
conscientious
(1, 8052)
(1, 6)
(1, 8058)
open
(1, 8000)
(1, 6)
(1, 8006)


In [38]:
output

{'extroverted': 0.19,
 'stable': 0.11,
 'agreeable': 0.05,
 'conscientious': 0.28,
 'open': 0.45}