# Create POS Tagger and Results Output Process

## 

In [2]:
import pandas as pd
import numpy as np
import nltk 
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from itertools import permutations
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rebeccawright/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

<br><Br>
### Importing the data

In [3]:
data = pd.read_csv('../../data/cleaned/expanded_mbti_df.csv')

In [29]:
data.head(2)

Unnamed: 0,type,posts,comp_score,neg_score,neu_score,pos_score,post_count,avg_word_count,posts_cleaned,cleaned_comp_score,...,diff_comp_init-no_punct,E_I,N_S,F_T,J_P,E_I_code,N_S_code,F_T_code,J_P_code,type_code
0,INFJ,"[""'http://www.youtube.com/watch?v=qsXHcwe3krw""...",0.9877,0.054,0.829,0.116,50,90,['enfp and intj moments sportscenter not top t...,0.9839,...,0.0074,I,N,F,J,0,1,1,1,111
1,ENTP,"[""'I'm finding the lack of me in these posts v...",0.9994,0.068,0.752,0.18,50,138,"[""'I'm finding the lack of me in these posts v...",0.9993,...,0.0009,E,N,T,P,1,1,0,0,1100


In [30]:
data.columns

Index(['type', 'posts', 'comp_score', 'neg_score', 'neu_score', 'pos_score',
       'post_count', 'avg_word_count', 'posts_cleaned', 'cleaned_comp_score',
       'cleaned_neg_score', 'cleaned_neu_score', 'cleaned_pos_score',
       'post_count_cleaned', 'avg_word_count_cleaned', 'posts_no_digits',
       'post_count_no_digits', 'avg_word_count_no_digits', 'posts_no_punct',
       'no_punct_comp_score', 'no_punct_neg_score', 'no_punct_neu_score',
       'no_punct_pos_score', 'diff_post_count_init-cleaned',
       'diff_word_count_init-cleaned', 'diff_post_count_cleaned-no_digits',
       'diff_word_count_cleaned-no_digits', 'diff_post_count_init-no_digits',
       'diff_word_count_init-no_digits', 'diff_comp_init-clean',
       'diff_comp_clean-no_punct', 'diff_comp_init-no_punct', 'E_I', 'N_S',
       'F_T', 'J_P', 'E_I_code', 'N_S_code', 'F_T_code', 'J_P_code',
       'type_code'],
      dtype='object')

<br><br>
<div class="alert alert-block alert-warning">
<b>Notes</b><br>
* data['posts_no_punct'] is the furthest cleaned version <br>
* implement code to cast list of posts to single string values inside pos_tag_calc function<br>
</div>
<br>

<br><Br>
### Create empty dataframe for pos_tag results

In [75]:
# empty df with columns for each of the 35 unique POS tags and 1 column for mbti type
df = pd.DataFrame(columns = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'])
df

Unnamed: 0,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB


<br><Br>
### Create pos_tag function

In [56]:
# pos tag calculation function
def pos_tag_calc(row):
    text = ' '.join(row)
    tokenized = sent_tokenize(text.lower()) 
    for i in tokenized: 
        wordsList = nltk.word_tokenize(i) 
        tagged = nltk.pos_tag(wordsList)     
    counts = Counter(tag for word,tag in tagged)
    total = sum(counts.values())
    results = dict((word, round(float(count)/total, 3)) for word,count in counts.items())
    return results

<br><Br>
### Apply pos_tag function and add results to dataframe

In [76]:
results = data['posts_no_punct'].apply(lambda x: pos_tag_calc(x))
results

0       {'NN': 0.621, '''': 0.02, 'JJ': 0.099, 'NNS': ...
1       {'IN': 0.026, '``': 0.011, 'NN': 0.625, '''': ...
2       {'NN': 0.627, '''': 0.013, 'JJ': 0.111, 'IN': ...
3       {'NN': 0.626, '''': 0.017, 'JJ': 0.098, 'DT': ...
4       {'NN': 0.633, '``': 0.009, 'JJ': 0.097, '''': ...
                              ...                        
8670    {'NN': 0.606, '''': 0.019, 'JJ': 0.108, 'VBP':...
8671    {'NN': 0.618, '''': 0.015, 'JJ': 0.117, 'VBP':...
8672    {'NN': 0.629, '''': 0.017, 'JJ': 0.105, 'FW': ...
8673    {'NN': 0.627, '''': 0.01, 'VBZ': 0.027, 'DT': ...
8674    {'IN': 0.025, '``': 0.009, 'JJ': 0.108, 'VBP':...
Name: posts_no_punct, Length: 8675, dtype: object

In [77]:
for result in results:
    df = df.append(result, ignore_index=True)

In [78]:
df.head(2)

Unnamed: 0,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,...,VBP,VBZ,WDT,WP,WP$,WRB,'',",",SYM,``
0,,,0.075,,0.002,0.028,0.099,,,,...,0.074,0.031,,,,0.0,0.02,0.013,0.0,0.01
1,,,0.073,,0.001,0.026,0.111,,,,...,0.075,0.024,,,,0.001,0.015,0.009,0.0,0.011


In [80]:
df = df.iloc[:, :-4]

In [81]:
df['mbti_type']=data['type']
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mbti_type']=data['type']


Unnamed: 0,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,...,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,mbti_type
0,,,0.075,,0.002,0.028,0.099,,,,...,0.017,,,0.074,0.031,,,,0.0,INFJ
1,,,0.073,,0.001,0.026,0.111,,,,...,0.019,,,0.075,0.024,,,,0.001,ENTP


In [84]:
df.replace(0, np.nan, inplace=True)
df = df.dropna(how='all', axis='columns')
df.replace(np.nan, 0, inplace=True)

df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


<br><br>

<div class="alert alert-block alert-warning">
<b>Notes:</b>  Remove custom_stopword collection and repeat pos_tag process<br>
</div>
<br>

<br>

***
***

<br>

## Pos Tagger on DataFrame with Custom_stopwords Removed

### Create Custom Stopword Collection

In [4]:
# code to create type_code dictionary
type_cols = data[['type','type_code']]
type_dict_vals = set(type_cols.apply(lambda x: ':'.join(x.values.astype(str)), axis=1))
type_code_dict = {value.split(':')[0]:value.split(':')[1] for value in type_dict_vals}

In [5]:
# create list of mbti types and their plurals to add to stop word collection
mbti_types = [key.lower() for key in type_code_dict.keys()]
mbti_types += [(each+'s') for each in mbti_types]

In [6]:
# create list of bi-trait strings to add to stop word collection
trait1 = ["e", "i"]
trait2 = ["n", "s"]
trait3 = ["f", "t"]
trait4 = ["j", "p"]
type_combos = []
type_pairs = []

type_combos += [list(zip(each_permutation, trait2)) for each_permutation in permutations(trait1, len(trait2))]
type_combos += [list(zip(each_permutation, trait1)) for each_permutation in permutations(trait2, len(trait1))]
type_combos += [list(zip(each_permutation, trait3)) for each_permutation in permutations(trait1, len(trait3))]
type_combos += [list(zip(each_permutation, trait1)) for each_permutation in permutations(trait3, len(trait1))]
type_combos += [list(zip(each_permutation, trait4)) for each_permutation in permutations(trait1, len(trait4))]
type_combos += [list(zip(each_permutation, trait1)) for each_permutation in permutations(trait4, len(trait1))]
type_combos += [list(zip(each_permutation, trait3)) for each_permutation in permutations(trait2, len(trait3))]
type_combos += [list(zip(each_permutation, trait2)) for each_permutation in permutations(trait3, len(trait2))]
type_combos += [list(zip(each_permutation, trait4)) for each_permutation in permutations(trait2, len(trait4))]
type_combos += [list(zip(each_permutation, trait2)) for each_permutation in permutations(trait4, len(trait2))]
type_combos += [list(zip(each_permutation, trait4)) for each_permutation in permutations(trait3, len(trait4))]
type_combos += [list(zip(each_permutation, trait3)) for each_permutation in permutations(trait4, len(trait3))]

for i in range(24):
    type_pairs.append(''.join([type_combos[i][0][0],type_combos[i][0][1]]))
    type_pairs.append(''.join([type_combos[i][1][0],type_combos[i][1][1]]))

In [7]:
misc_word_list = ['enneagram', 'enneagrams', 'mbti', 'mbtis', 'meyer', 'meyers', 'briggs', 'brigg', 'sp', 'sx', 'so']

In [8]:
custom_stopwords = mbti_types + type_pairs + misc_word_list

<br><Br>
### Create empty dataframe for pos_tag results

In [93]:
df_no_stop = pd.DataFrame(columns = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'])

<br><Br>
### Create pos_tag function that removes stopwords

In [91]:
def pos_tag_calc_no_stop(row):
    text = ' '.join(row)
    tokenized = sent_tokenize(text.lower()) 
    for i in tokenized: 
        wordsList = nltk.word_tokenize(i)
        wordsList = [word for word in wordsList if not word in custom_stopwords]  
        tagged = nltk.pos_tag(wordsList)     
    counts = Counter(tag for word,tag in tagged)
    total = sum(counts.values())
    results = dict((word, round(float(count)/total, 3)) for word,count in counts.items())
    return results

<br><Br>
### Apply pos_tag function and add results to dataframe

In [92]:
results_no_stop = data['posts_no_punct'].apply(lambda x: pos_tag_calc_no_stop(x))
results_no_stop

0       {'NN': 0.621, '''': 0.02, 'JJ': 0.099, 'NNS': ...
1       {'IN': 0.026, '``': 0.011, 'NN': 0.625, '''': ...
2       {'NN': 0.627, '''': 0.013, 'JJ': 0.111, 'IN': ...
3       {'NN': 0.626, '''': 0.017, 'JJ': 0.098, 'DT': ...
4       {'NN': 0.633, '``': 0.009, 'JJ': 0.097, '''': ...
                              ...                        
8670    {'NN': 0.606, '''': 0.019, 'JJ': 0.108, 'VBP':...
8671    {'NN': 0.618, '''': 0.015, 'JJ': 0.117, 'VBP':...
8672    {'NN': 0.629, '''': 0.017, 'JJ': 0.105, 'FW': ...
8673    {'NN': 0.627, '''': 0.01, 'VBZ': 0.027, 'DT': ...
8674    {'IN': 0.025, '``': 0.009, 'JJ': 0.108, 'VBP':...
Name: posts_no_punct, Length: 8675, dtype: object

In [94]:
for result in results_no_stop:
    df_no_stop = df_no_stop.append(result, ignore_index=True)

In [95]:
df_no_stop = df_no_stop.iloc[:, :-4]

In [96]:
df_no_stop['mbti_type']=data['type']
df_no_stop.head(2)

Unnamed: 0,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,LS,...,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,mbti_type
0,,,0.075,,0.002,0.028,0.099,,,,...,0.017,,,0.074,0.031,,,,0.0,INFJ
1,,,0.073,,0.001,0.026,0.111,,,,...,0.019,,,0.075,0.024,,,,0.001,ENTP


In [97]:
df_no_stop.replace(0, np.nan, inplace=True)
df_no_stop = df_no_stop.dropna(how='all', axis='columns')
df_no_stop.replace(np.nan, 0, inplace=True)

In [98]:
df_no_stop

Unnamed: 0,CC,DT,FW,IN,JJ,MD,NN,NNS,NNP,PDT,...,PRP,RB,RBS,RP,VB,VBD,VBP,VBZ,WRB,mbti_type
0,0.0,0.075,0.002,0.028,0.099,0.000,0.621,0.002,0.002,0.000,...,0.0,0.002,0.0,0.0,0.002,0.017,0.074,0.031,0.000,INFJ
1,0.0,0.073,0.001,0.026,0.111,0.000,0.625,0.001,0.003,0.001,...,0.0,0.000,0.0,0.0,0.001,0.019,0.075,0.024,0.001,ENTP
2,0.0,0.065,0.001,0.028,0.111,0.000,0.627,0.002,0.002,0.001,...,0.0,0.001,0.0,0.0,0.001,0.016,0.082,0.022,0.001,INTP
3,0.0,0.076,0.001,0.028,0.098,0.000,0.626,0.002,0.003,0.001,...,0.0,0.000,0.0,0.0,0.001,0.020,0.079,0.023,0.000,INTJ
4,0.0,0.074,0.000,0.029,0.097,0.000,0.633,0.002,0.002,0.002,...,0.0,0.001,0.0,0.0,0.000,0.023,0.079,0.020,0.001,ENTJ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,0.0,0.081,0.001,0.031,0.108,0.001,0.606,0.002,0.002,0.002,...,0.0,0.000,0.0,0.0,0.001,0.023,0.069,0.024,0.002,ISFP
8671,0.0,0.070,0.001,0.027,0.117,0.000,0.618,0.002,0.002,0.001,...,0.0,0.000,0.0,0.0,0.002,0.019,0.078,0.025,0.000,ENFP
8672,0.0,0.072,0.002,0.025,0.105,0.000,0.629,0.002,0.002,0.001,...,0.0,0.000,0.0,0.0,0.001,0.016,0.081,0.025,0.001,INTP
8673,0.0,0.077,0.001,0.028,0.102,0.000,0.627,0.002,0.002,0.001,...,0.0,0.000,0.0,0.0,0.001,0.017,0.083,0.027,0.001,INFP


<br>

***
***

<br>

<div class="alert alert-block alert-warning">
<b>Notes:</b>  Add on Summary columns for consolidated pos_tag percentage values<br>
*write out interim dataframe to csvs*
</div>
<br>

## Write out Pos Tagger result dataframes to csvs

<br>

***
***

<br>

## Create and Append summary total columns for high-level pos tag type percentage values

<div class="alert alert-block alert-warning">
<b>Identified scalability issue:</b><br>
* In future, convert null values to zeros and do not drop empty columns.<br>
* POS_TAG summary columns had to be custom coded to account for not all 35 unique pos_tag value columns existing.</div>
<br>

**NLTK List of all POS tags (35 total):**

* CC coordinating conjunction
* CD cardinal digit
* DT determiner
* EX existential there (like: "there is" ... think of it like "there exists")
* FW foreign word
* IN preposition/subordinating conjunction
* JJ adjective 'big'
* JJR adjective, comparative 'bigger'
* JJS adjective, superlative 'biggest'
* LS list marker 1)
* MD modal could, will
* NN noun, singular 'desk'
* NNS noun plural 'desks'
* NNP proper noun, singular 'Harrison'
* NNPS proper noun, plural 'Americans'
* PDT predeterminer 'all the kids'
* POS possessive ending parent's
* PRP personal pronoun I, he, she
* PRP\$ possessive pronoun my, his, hers
* RB adverb very, silently,
* RBR adverb, comparative better
* RBS adverb, superlative best
* RP particle give up
* TO to go 'to' the store.
* UH interjection errrrrrrrm
* VB verb, base form take
* VBD verb, past tense took
* VBG verb, gerund/present participle taking
* VBN verb, past participle taken
* VBP verb, sing. present, non-3d take
* VBZ verb, 3rd person sing. present takes
* WDT wh-determiner which
* WP wh-pronoun who, what
* WP\$ possessive wh-pronoun whose
* WRB wh-abverb where, when

In [100]:
df.head(2)

Unnamed: 0,CC,DT,FW,IN,JJ,MD,NN,NNS,NNP,PDT,...,PRP,RB,RBS,RP,VB,VBD,VBP,VBZ,WRB,mbti_type
0,0.0,0.075,0.002,0.028,0.099,0.0,0.621,0.002,0.002,0.0,...,0.0,0.002,0.0,0.0,0.002,0.017,0.074,0.031,0.0,INFJ
1,0.0,0.073,0.001,0.026,0.111,0.0,0.625,0.001,0.003,0.001,...,0.0,0.0,0.0,0.0,0.001,0.019,0.075,0.024,0.001,ENTP


In [101]:
df['pos_conj'] = df['CC'] #conjugations
df['pos_fw'] = df['FW'] #foreign
df['pos_prep'] = df['IN'] #preposition
df['pos_adj'] = df[['JJ','PDT','DT']].sum(axis=1) #adjective
df['pos_adv'] = df[['RB','RBS','WRB']].sum(axis=1) #adverbs
df['pos_noun'] = df[['NN','NNS','POS']].sum(axis=1) #nouns
df['pos_pnoun'] = df[['NNP','PRP']].sum(axis=1) #proper noun
df['pos_par'] = df['RP'] #particle
df['pos_verb_past'] = df['VBD']
df['pos_verb_present'] = df[['VB','VBP','VBZ','MD']].sum(axis=1)

In [102]:
df.head(2)

Unnamed: 0,CC,DT,FW,IN,JJ,MD,NN,NNS,NNP,PDT,...,pos_conj,pos_fw,pos_prep,pos_adj,pos_adv,pos_noun,pos_pnoun,pos_par,pos_verb_past,pos_verb_present
0,0.0,0.075,0.002,0.028,0.099,0.0,0.621,0.002,0.002,0.0,...,0.0,0.002,0.028,0.174,0.002,0.625,0.002,0.0,0.017,0.107
1,0.0,0.073,0.001,0.026,0.111,0.0,0.625,0.001,0.003,0.001,...,0.0,0.001,0.026,0.185,0.001,0.628,0.003,0.0,0.019,0.1


<br><br>

In [103]:
df_no_stop.head(2)

Unnamed: 0,CC,DT,FW,IN,JJ,MD,NN,NNS,NNP,PDT,...,PRP,RB,RBS,RP,VB,VBD,VBP,VBZ,WRB,mbti_type
0,0.0,0.075,0.002,0.028,0.099,0.0,0.621,0.002,0.002,0.0,...,0.0,0.002,0.0,0.0,0.002,0.017,0.074,0.031,0.0,INFJ
1,0.0,0.073,0.001,0.026,0.111,0.0,0.625,0.001,0.003,0.001,...,0.0,0.0,0.0,0.0,0.001,0.019,0.075,0.024,0.001,ENTP


In [104]:
df_no_stop['pos_conj'] = df_no_stop['CC'] #conjugations
df_no_stop['pos_fw'] = df_no_stop['FW'] #foreign
df_no_stop['pos_prep'] = df_no_stop['IN'] #preposition
df_no_stop['pos_adj'] = df_no_stop[['JJ','PDT','DT']].sum(axis=1) #adjective
df_no_stop['pos_adv'] = df_no_stop[['RB','RBS','WRB']].sum(axis=1) #adverbs
df_no_stop['pos_noun'] = df_no_stop[['NN','NNS','POS']].sum(axis=1) #nouns
df_no_stop['pos_pnoun'] = df_no_stop[['NNP','PRP']].sum(axis=1) #proper noun
df_no_stop['pos_par'] = df_no_stop['RP'] #particle
df_no_stop['pos_verb_past'] = df_no_stop['VBD']
df_no_stop['pos_verb_present'] = df_no_stop[['VB','VBP','VBZ','MD']].sum(axis=1)

In [105]:
df_no_stop.head(2)

Unnamed: 0,CC,DT,FW,IN,JJ,MD,NN,NNS,NNP,PDT,...,pos_conj,pos_fw,pos_prep,pos_adj,pos_adv,pos_noun,pos_pnoun,pos_par,pos_verb_past,pos_verb_present
0,0.0,0.075,0.002,0.028,0.099,0.0,0.621,0.002,0.002,0.0,...,0.0,0.002,0.028,0.174,0.002,0.625,0.002,0.0,0.017,0.107
1,0.0,0.073,0.001,0.026,0.111,0.0,0.625,0.001,0.003,0.001,...,0.0,0.001,0.026,0.185,0.001,0.628,0.003,0.0,0.019,0.1


<br>

***
***

<br>

## Write out Pos Tagger Result with Summary Columns to csvs

<br>

***
***

<br>

## Write out custom stopwords collection to txt file