# EDA - 2

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import chi2
from itertools import permutations

<br><br>
### Import Data

In [2]:
data = pd.read_csv('../../data/cleaned/mbti_v5.csv')

In [3]:
# pad type_code column with leading zeros where needed
data['type_code'] = data['type_code'].apply(lambda x: '{0:0>4}'.format(x))

In [4]:
data.head(2)

Unnamed: 0,type,posts,E_I,N_S,F_T,J_P,E_I_code,N_S_code,F_T_code,J_P_code,type_code
0,INFJ,enfp and intj moments sportscenter not top ten...,I,N,F,J,0,1,1,1,111
1,ENTP,i m finding the lack of me in these posts very...,E,N,T,P,1,1,0,0,1100


<br><br>
### Create Type_Code Dictionary

In [5]:
# code to create type_code dictionary
type_cols = data[['type','type_code']]
type_dict_vals = set(type_cols.apply(lambda x: ':'.join(x.values.astype(str)), axis=1))
type_code_dict = {value.split(':')[0]:value.split(':')[1] for value in type_dict_vals}

<br><br>
### Create Custom Stopword Collection

In [31]:
# create list of mbti types and their plurals to add to stop word collection
mbti_types = [key.lower() for key in type_code_dict.keys()]
mbti_types += [(each+'s') for each in mbti_types]

In [32]:
# create list of bi-trait strings to add to stop word collection
trait1 = ["e", "i"]
trait2 = ["n", "s"]
trait3 = ["f", "t"]
trait4 = ["j", "p"]
type_combos = []
type_pairs = []

type_combos += [list(zip(each_permutation, trait2)) for each_permutation in permutations(trait1, len(trait2))]
type_combos += [list(zip(each_permutation, trait1)) for each_permutation in permutations(trait2, len(trait1))]
type_combos += [list(zip(each_permutation, trait3)) for each_permutation in permutations(trait1, len(trait3))]
type_combos += [list(zip(each_permutation, trait1)) for each_permutation in permutations(trait3, len(trait1))]
type_combos += [list(zip(each_permutation, trait4)) for each_permutation in permutations(trait1, len(trait4))]
type_combos += [list(zip(each_permutation, trait1)) for each_permutation in permutations(trait4, len(trait1))]
type_combos += [list(zip(each_permutation, trait3)) for each_permutation in permutations(trait2, len(trait3))]
type_combos += [list(zip(each_permutation, trait2)) for each_permutation in permutations(trait3, len(trait2))]
type_combos += [list(zip(each_permutation, trait4)) for each_permutation in permutations(trait2, len(trait4))]
type_combos += [list(zip(each_permutation, trait2)) for each_permutation in permutations(trait4, len(trait2))]
type_combos += [list(zip(each_permutation, trait4)) for each_permutation in permutations(trait3, len(trait4))]
type_combos += [list(zip(each_permutation, trait3)) for each_permutation in permutations(trait4, len(trait3))]

for i in range(24):
    type_pairs.append(''.join([type_combos[i][0][0],type_combos[i][0][1]]))
    type_pairs.append(''.join([type_combos[i][1][0],type_combos[i][1][1]]))

<div class="alert alert-block alert-warning">
<b>WARNING:</b> DEPRECATED misc_word_list.<br>
* Updated list used in future models.<br>
[timestamp: 2/18-1:40p]
</div>

In [33]:
misc_word_list = ['enneagram', 'enneagrams', 'mbti', 'mbtis', 'meyer', 'meyers', 'briggs', 'brigg']

<div class="alert alert-block alert-warning">
<b>WARNING:</b> Updated misc_word_list below.<br>
[timestamp: 2/18-1:40p]
</div>

In [34]:
custom_stopwords = mbti_types + type_pairs + misc_word_list

<br><br>
### TFDIF Vectorizer Model: using *stop_words=custom_stopwords*

In [35]:
#TFIDF Vectorizer
tfidf = TfidfVectorizer(stop_words=custom_stopwords, ngram_range=(1, 3), min_df=10)
features = tfidf.fit_transform(data.posts).toarray()
labels = data.type_code

In [37]:
# sourced from: https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

for type, type_code in type_code_dict.items():
    features_chi2 = chi2(features, labels == type_code)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
    print("# '{}':".format(type))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-10:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-10:])))
    print("  . Most correlated trigrams:\n. {}".format('\n. '.join(trigrams[-10:])))

# 'ENTJ':
  . Most correlated unigrams:
. eights
. 8s
. sangoire
. siggy
. prada
. stackings
. elistra
. canis
. stawker
. msbossypants
  . Most correlated bigrams:
. be competent
. year plan
. taking charge
. knows not
. case closed
. for sx
. term goals
. rational type
. homeless man
. mr canis
  . Most correlated trigrams:
. the facts are
. am proud to
. not the problem
. man on the
. complaining about the
. from my samsung
. them not sure
. when see an
. long term goals
. type type type
# 'ENFP':
  . Most correlated unigrams:
. sx
. moby
. danse
. finaille
. tridentus
. viva
. musicalmeggie
. musicbird
. alysaria
. xd
  . Most correlated bigrams:
. musicbird eerie
. viva so
. sx ace
. sx alysaria
. cue5c so
. alysaria sx
. so musicbird
. ace face
. hug hug
. dear anonymous
  . Most correlated trigrams:
. so sx alysaria
. so sx ace
. sx ace face
. dreams so sx
. viva so sx
. sx alysaria sx
. sx so musicbird
. alysaria sx so
. king of dreams
. an and wish
# 'ISFJ':
  . Most correlate

<div class="alert alert-block alert-info">
<b>Further Actions:</b><br>
* still need to decide how to handle emojis <br>
* consider allowing single quote to remain during preprocessing so as not to lose negative conjugations <br>
* ~~identify appearance of 'sx'~~ <br>
* apply strip_accent commands<br>
[timestamp: 2/17-11:40p]<br><br>
<b>UPDATED: Struck-through completed actions</b> <br>
[timestamp: 2/18-1:30p]
</div>

<br><br>
### **Investigation Thread**: What does 'sx' mean?

In [72]:
# create subset of data for only enfp personality type
enfp_data = data[data['type']=='ENFP']

# identify roughly many rows 'sx' appears in
len(enfp_data['posts'].str.find('sx').value_counts())

# examine instance of 'sx' appearance in preprocessed data
location = enfp_data.loc[8617]['posts'].find('sx')
enfp_data.loc[8617]['posts'][location-150:location+150]

# compare instance of 'sx' appearance in preprocessed data to corresponding appearance in original data
orig_data = pd.read_csv('../../data/original/mbti_1.csv')
location = orig_data.loc[8617]['posts'].find('sx')
orig_data.loc[8617]['posts'][location-150:location+150]

'amazing lawyers - in fact I was going to be one and I had many many lawyers recognize that passion in me. One of the career...|||Viva: 4w3 7w6 8w9 so/sx Ace Face: 7w8, 8w7, 3w2 cue5c: 3w2 so/sp The King of Dreams: 9w1 7w6 2w1 so/sx Alysaria: 7w6, 2w3, 9w8 sx/so MusicBird: 7w6, 9w8, 4w3 Eerie: 7w8-1w'

<div class="alert alert-block alert-info">
<b>Findings:</b><br>
* ~~'sx' is a peronality trait term and should be added to custom_stopwords collection along with so, sp, sx~~<br>
[timestamp: 2/18-1:30p]<br><br>
<b>UPDATED: Struck-through completed tasks.</b> <br>
[timestamp: 2/18-1:35p]
</div>

<br><br>
### Expand Custom Stopword Collection

In [74]:
# include sp, sx, so
misc_word_list = ['enneagram', 'enneagrams', 'mbti', 'mbtis', 'meyer', 'meyers', 'briggs', 'brigg', 'sp', 'sx', 'so']
custom_stopwords = mbti_types + type_pairs + misc_word_list

<br><br>
### TFDIF Vectorizer Model: using *UPDATED stop_words=custom_stopwords*

In [75]:
#TFIDF Vectorizer
tfidf = TfidfVectorizer(stop_words=custom_stopwords, ngram_range=(1, 3), min_df=10)
features = tfidf.fit_transform(data.posts).toarray()
labels = data.type_code

In [76]:
# sourced from: https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

for type, type_code in type_code_dict.items():
  features_chi2 = chi2(features, labels == type_code)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
  print("# '{}':".format(type))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-10:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-10:])))
  print("  . Most correlated trigrams:\n. {}".format('\n. '.join(trigrams[-10:])))

# 'ENTJ':
  . Most correlated unigrams:
. eights
. 8s
. sangoire
. siggy
. prada
. stackings
. elistra
. canis
. stawker
. msbossypants
  . Most correlated bigrams:
. share much
. year plan
. am pleased
. taking charge
. knows not
. case closed
. term goals
. rational type
. homeless man
. mr canis
  . Most correlated trigrams:
. are you like
. the facts are
. am proud to
. man on the
. not the problem
. complaining about the
. from my samsung
. when see an
. long term goals
. type type type
# 'ENFP':
  . Most correlated unigrams:
. supernova
. moby
. danse
. finaille
. tridentus
. viva
. musicalmeggie
. musicbird
. alysaria
. xd
  . Most correlated bigrams:
. danse macabre
. musicbird eerie
. dreams alysaria
. face cue5c
. viva ace
. cue5c the
. alysaria musicbird
. ace face
. hug hug
. dear anonymous
  . Most correlated trigrams:
. the king of
. alysaria musicbird eerie
. cue5c the king
. viva ace face
. of dreams alysaria
. face cue5c the
. ace face cue5c
. dreams alysaria musicbird