# Create Collection of Predictive Words per Personality Trait using LogisticRegression

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

<br><br>
### Import Data

In [2]:
data = pd.read_csv('../../data/cleaned/expanded_mbti_df.csv')

In [3]:
data.head(2)

Unnamed: 0,type,posts,comp_score,neg_score,neu_score,pos_score,post_count,avg_word_count,posts_cleaned,cleaned_comp_score,...,diff_comp_init-no_punct,E_I,N_S,F_T,J_P,E_I_code,N_S_code,F_T_code,J_P_code,type_code
0,INFJ,"[""'http://www.youtube.com/watch?v=qsXHcwe3krw""...",0.9877,0.054,0.829,0.116,50,90,['enfp and intj moments sportscenter not top t...,0.9839,...,0.0074,I,N,F,J,0,1,1,1,111
1,ENTP,"[""'I'm finding the lack of me in these posts v...",0.9994,0.068,0.752,0.18,50,138,"[""'I'm finding the lack of me in these posts v...",0.9993,...,0.0009,E,N,T,P,1,1,0,0,1100


<br><br>
### Custom Stopword Collection

In [4]:
with open('../../data/full_stopwords.txt', 'r') as filehandle:
    custom_stopwords = [words.rstrip() for words in filehandle.readlines()]

<br><br>
### TFDIF Vectorizer Model: using *stop_words=custom_stopwords*

In [6]:
targets = ['E_I', 'N_S', 'F_T', 'J_P']

In [7]:
targets_dict_tf = {}

for target in targets:   
    target_dict_L1 = {}
    target_dict_L2 = {}
    ngram_pos_dict = {}
    ngram_neg_dict = {}

    for i in range(1,5):
        word_pos_dict = {}
        word_neg_dict = {}
        
        print('FLAG: '+target+'...'+str(i))
        classifier = LogisticRegression(C=1.0, class_weight="balanced", max_iter = 10_000)
        tf_idf = Pipeline([
                    ('tfidf', TfidfVectorizer(stop_words=custom_stopwords, ngram_range=(i,i), min_df=10)),
                    ("classifier", classifier)])
        tf_idf.fit(data.posts_no_digits, data[target])
        coefs = tf_idf.named_steps["classifier"].coef_
        coefs.tolist()
        feature_names = tf_idf.named_steps["tfidf"].get_feature_names()
        coefs_and_features = list(zip(coefs[0], feature_names))
        positive_results = sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:30]
        negative_results = sorted(coefs_and_features, key=lambda x: x[0])[:30]

        for result in positive_results:
            word_pos_dict['word_'+str(len(word_pos_dict)+1)] = {'coef':abs(result[0]), 'word':result[1]}
        ngram_pos_dict['ngram_'+str(i)] = word_pos_dict
        target_dict_L1.update(ngram_pos_dict)

        for result in negative_results:
            word_neg_dict['word_'+str(len(word_neg_dict)+1)] = {'coef':abs(result[0]), 'word':result[1]}
        ngram_neg_dict['ngram_'+str(i)] = word_neg_dict
        target_dict_L2.update(ngram_neg_dict)
    
    targets_dict_tf[target[0]] = target_dict_L1
    targets_dict_tf[target[2]] = target_dict_L2

FLAG: E_I...1
FLAG: E_I...2
FLAG: E_I...3
FLAG: E_I...4
FLAG: N_S...1
FLAG: N_S...2
FLAG: N_S...3
FLAG: N_S...4
FLAG: F_T...1
FLAG: F_T...2
FLAG: F_T...3
FLAG: F_T...4
FLAG: J_P...1
FLAG: J_P...2
FLAG: J_P...3
FLAG: J_P...4


In [9]:
reindexed_target_dict = {}

for targetKey, ngramDict in targets_dict_tf.items():
    for ngramKey, wordDict in ngramDict.items():
        for word_indexKey, word_indexValue in wordDict.items():
            reindexed_target_dict[(targetKey, ngramKey, word_indexKey)] = word_indexValue

ngrams_tf_df = pd.DataFrame(reindexed_target_dict).T
ngrams_tf_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,coef,word
E,ngram_1,word_1,2.35492,family
E,ngram_1,word_2,2.22272,games
E,ngram_1,word_3,2.10766,feel
E,ngram_1,word_4,1.98206,mind
E,ngram_1,word_5,1.96383,dream
...,...,...,...,...
P,ngram_4,word_26,0.641491,could anthropomorphize animal would
P,ngram_4,word_27,0.641491,would animal would represent
P,ngram_4,word_28,0.618634,often think society downhill
P,ngram_4,word_29,0.611839,score scale ranging low


In [8]:
targets_dict_cv = {}

for target in targets:   
    target_dict_L1 = {}
    target_dict_L2 = {}
    ngram_pos_dict = {}
    ngram_neg_dict = {}

    for i in range(1,5):
        word_pos_dict = {}
        word_neg_dict = {}
        
        print('FLAG: '+target+'...'+str(i))
        classifier = LogisticRegression(C=1.0, class_weight="balanced", max_iter = 10_000)
        cvect = Pipeline([
                    ('countvect', CountVectorizer(stop_words=custom_stopwords, ngram_range=(i,i), min_df=10)),
                    ("classifier", classifier)])
        cvect.fit(data.posts_no_digits, data[target])
        coefs = cvect.named_steps["classifier"].coef_
        coefs.tolist()
        feature_names = cvect.named_steps["countvect"].get_feature_names()
        coefs_and_features = list(zip(coefs[0], feature_names))
        positive_results = sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:30]
        negative_results = sorted(coefs_and_features, key=lambda x: x[0])[:30]

        for result in positive_results:
            word_pos_dict['word_'+str(len(word_pos_dict)+1)] = {'coef':abs(result[0]), 'word':result[1]}
        ngram_pos_dict['ngram_'+str(i)] = word_pos_dict
        target_dict_L1.update(ngram_pos_dict)

        for result in negative_results:
            word_neg_dict['word_'+str(len(word_neg_dict)+1)] = {'coef':abs(result[0]), 'word':result[1]}
        ngram_neg_dict['ngram_'+str(i)] = word_neg_dict
        target_dict_L2.update(ngram_neg_dict)
    
    targets_dict_cv[target[0]] = target_dict_L1
    targets_dict_cv[target[2]] = target_dict_L2

FLAG: E_I...1
FLAG: E_I...2
FLAG: E_I...3
FLAG: E_I...4
FLAG: N_S...1
FLAG: N_S...2
FLAG: N_S...3
FLAG: N_S...4
FLAG: F_T...1
FLAG: F_T...2
FLAG: F_T...3
FLAG: F_T...4
FLAG: J_P...1
FLAG: J_P...2
FLAG: J_P...3
FLAG: J_P...4


In [10]:
reindexed_target_dict = {}

for targetKey, ngramDict in targets_dict_cv.items():
    for ngramKey, wordDict in ngramDict.items():
        for word_indexKey, word_indexValue in wordDict.items():
            reindexed_target_dict[(targetKey, ngramKey, word_indexKey)] = word_indexValue

ngrams_cv_df = pd.DataFrame(reindexed_target_dict).T
ngrams_cv_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,coef,word
E,ngram_1,word_1,0.848481,organize
E,ngram_1,word_2,0.741726,letting
E,ngram_1,word_3,0.713516,spoken
E,ngram_1,word_4,0.711239,severe
E,ngram_1,word_5,0.702281,offense
...,...,...,...,...
P,ngram_4,word_26,0.589868,ever consider going bungee
P,ngram_4,word_27,0.589868,going bungee jumping skydiving
P,ngram_4,word_28,0.587306,thank much taking time
P,ngram_4,word_29,0.581126,welcome hope enjoy time


In [11]:
ngrams_cv_df.to_csv('ngrams_cv_logreg.csv')

In [12]:
ngrams_tf_df.to_csv('ngrams_tf_logreg.csv')