# Create Combined DataFrame of All 3 Feature/Data Category Types For Use By Models

**Data Files Imported:**
* expanded_mbti_df.csv
* countvect_matrix_lemma.npz
* tfidfvect_matrix_lemma.npz

**Data Files Written:**
* df_cv_feat_lemma.pkl
* df_tf_feat_lemma.pkl

**Notebook Summary Overview:**
* creates data file for modeling with following components:
    * engineered features as stored in expanded_mbti_df.csv
    * vectorized text columns as stored in either:
        * countvect_matrix_lemma.npz
        * tfidfvect_matrix_lemma.npz

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import scipy

In [81]:
data = pd.read_csv('../data/cleaned/expanded_mbti_df.csv')

In [82]:
data.head(2)

Unnamed: 0,type,posts,comp_score,neg_score,neu_score,pos_score,post_count,avg_word_count,posts_cleaned,cleaned_comp_score,...,diff_comp_init-no_punct,E_I,N_S,F_T,J_P,E_I_code,N_S_code,F_T_code,J_P_code,type_code
0,INFJ,"[""'http://www.youtube.com/watch?v=qsXHcwe3krw""...",0.9877,0.054,0.829,0.116,50,90,['enfp and intj moments sportscenter not top t...,0.9839,...,0.0074,I,N,F,J,0,1,1,1,111
1,ENTP,"[""'I'm finding the lack of me in these posts v...",0.9994,0.068,0.752,0.18,50,138,"[""'I'm finding the lack of me in these posts v...",0.9993,...,0.0009,E,N,T,P,1,1,0,0,1100


In [83]:
data.columns

Index(['type', 'posts', 'comp_score', 'neg_score', 'neu_score', 'pos_score',
       'post_count', 'avg_word_count', 'posts_cleaned', 'cleaned_comp_score',
       'cleaned_neg_score', 'cleaned_neu_score', 'cleaned_pos_score',
       'post_count_cleaned', 'avg_word_count_cleaned', 'posts_no_digits',
       'post_count_no_digits', 'avg_word_count_no_digits', 'posts_no_punct',
       'no_punct_comp_score', 'no_punct_neg_score', 'no_punct_neu_score',
       'no_punct_pos_score', 'diff_post_count_init-cleaned',
       'diff_word_count_init-cleaned', 'diff_post_count_cleaned-no_digits',
       'diff_word_count_cleaned-no_digits', 'diff_post_count_init-no_digits',
       'diff_word_count_init-no_digits', 'diff_comp_init-clean',
       'diff_comp_clean-no_punct', 'diff_comp_init-no_punct', 'E_I', 'N_S',
       'F_T', 'J_P', 'E_I_code', 'N_S_code', 'F_T_code', 'J_P_code',
       'type_code'],
      dtype='object')

In [84]:
data['post_count_no_digits'].describe()

count    8675.000000
mean       47.181556
std         6.047531
min         1.000000
25%        48.000000
50%        50.000000
75%        50.000000
max        59.000000
Name: post_count_no_digits, dtype: float64

In [85]:
data['post_count_no_digits']=data['post_count_no_digits'].apply(lambda x: x if x < 50 else 50)

In [86]:
data['post_count_no_digits'].describe()

count    8675.000000
mean       47.158040
std         6.028259
min         1.000000
25%        48.000000
50%        50.000000
75%        50.000000
max        50.000000
Name: post_count_no_digits, dtype: float64

In [87]:
data['type_code'] = data['type_code'].apply(lambda x: '{0:0>4}'.format(x))

In [88]:
df_features = data[['type', 'type_code', 'E_I_code', 'N_S_code', 'F_T_code', 'J_P_code', 'comp_score', 'neg_score', 'neu_score', 'pos_score', 'avg_word_count_no_digits', 'post_count_no_digits']]

In [89]:
df_features.rename(columns={'avg_word_count_no_digits':'avg_word_count', 'post_count_no_digits':'post_count', 'E_I_code':'E_I', 'N_S_code':'N_S', 'F_T_code':'F_T', 'J_P_code':'J_P'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


###### **Ordinal Encoding Values**: INFP:9 ||| INFJ:8 ||| INTP:11 ||| INTJ:10 ||| ENTP:3 ||| ENFP:1 ||| ISTP:15 ||| ISFP:13 ||| ENTJ:2 ||| ISTJ:14 ||| ENFJ:0 ||| ISFJ:12 ||| ESTP:7 ||| ESFP:5 ||| ESFJ:4 ||| ESTJ:6

In [91]:
df_features.insert(1,'type_encoded', OrdinalEncoder().fit_transform(df_features[['type']].astype('category')).astype(int))

In [92]:
df_features

Unnamed: 0,type,type_encoded,type_code,E_I,N_S,F_T,J_P,comp_score,neg_score,neu_score,pos_score,avg_word_count,post_count
0,INFJ,8,0111,0,1,1,1,0.9877,0.054,0.829,0.116,89,36
1,ENTP,3,1100,1,1,0,0,0.9994,0.068,0.752,0.180,135,46
2,INTP,11,0100,0,1,0,0,0.9985,0.074,0.751,0.175,116,41
3,INTJ,10,0101,0,1,0,1,0.9966,0.054,0.841,0.106,119,50
4,ENTJ,2,1101,1,1,0,1,0.9725,0.114,0.748,0.138,121,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,ISFP,13,0010,0,0,1,0,0.9887,0.105,0.755,0.140,99,46
8671,ENFP,1,1110,1,1,1,0,0.9999,0.046,0.726,0.228,148,50
8672,INTP,11,0100,0,1,0,0,0.9958,0.086,0.773,0.142,115,47
8673,INFP,9,0110,0,1,1,0,0.9995,0.057,0.801,0.141,184,50


In [93]:
data2 = pd.read_csv('../data/cleaned/mbti_pos_tag_with_sums.csv')

In [94]:
data2.head(2)

Unnamed: 0,CC,DT,FW,IN,JJ,MD,NN,NNS,NNP,PDT,...,pos_conj,pos_fw,pos_prep,pos_adj,pos_adv,pos_noun,pos_pnoun,pos_par,pos_verb_past,pos_verb_present
0,0.0,0.075,0.002,0.028,0.099,0.0,0.621,0.002,0.002,0.0,...,0.0,0.002,0.028,0.174,0.002,0.625,0.002,0.0,0.017,0.107
1,0.0,0.073,0.001,0.026,0.111,0.0,0.625,0.001,0.003,0.001,...,0.0,0.001,0.026,0.185,0.001,0.628,0.003,0.0,0.019,0.1


In [95]:
data2.columns

Index(['CC', 'DT', 'FW', 'IN', 'JJ', 'MD', 'NN', 'NNS', 'NNP', 'PDT', 'POS',
       'PRP', 'RB', 'RBS', 'RP', 'VB', 'VBD', 'VBP', 'VBZ', 'WRB', 'mbti_type',
       'pos_conj', 'pos_fw', 'pos_prep', 'pos_adj', 'pos_adv', 'pos_noun',
       'pos_pnoun', 'pos_par', 'pos_verb_past', 'pos_verb_present'],
      dtype='object')

In [96]:
df_pos = data2[['pos_conj', 'pos_fw', 'pos_prep', 'pos_adj', 'pos_adv', 'pos_noun', 'pos_pnoun', 'pos_par', 'pos_verb_past', 'pos_verb_present']]

In [97]:
df_pos.columns = df_pos.columns.str.replace('pos', 'pct')

In [98]:
df_features = pd.concat([df_features,df_pos], axis = 1)

In [99]:
df_features

Unnamed: 0,type,type_encoded,type_code,E_I,N_S,F_T,J_P,comp_score,neg_score,neu_score,...,pct_conj,pct_fw,pct_prep,pct_adj,pct_adv,pct_noun,pct_pnoun,pct_par,pct_verb_past,pct_verb_present
0,INFJ,8,0111,0,1,1,1,0.9877,0.054,0.829,...,0.0,0.002,0.028,0.174,0.002,0.625,0.002,0.0,0.017,0.107
1,ENTP,3,1100,1,1,0,0,0.9994,0.068,0.752,...,0.0,0.001,0.026,0.185,0.001,0.628,0.003,0.0,0.019,0.100
2,INTP,11,0100,0,1,0,0,0.9985,0.074,0.751,...,0.0,0.001,0.028,0.177,0.002,0.633,0.002,0.0,0.016,0.105
3,INTJ,10,0101,0,1,0,1,0.9966,0.054,0.841,...,0.0,0.001,0.028,0.175,0.000,0.630,0.003,0.0,0.020,0.103
4,ENTJ,2,1101,1,1,0,1,0.9725,0.114,0.748,...,0.0,0.000,0.029,0.173,0.002,0.638,0.002,0.0,0.023,0.099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,ISFP,13,0010,0,0,1,0,0.9887,0.105,0.755,...,0.0,0.001,0.031,0.191,0.002,0.613,0.002,0.0,0.023,0.095
8671,ENFP,1,1110,1,1,1,0,0.9999,0.046,0.726,...,0.0,0.001,0.027,0.188,0.000,0.623,0.002,0.0,0.019,0.105
8672,INTP,11,0100,0,1,0,0,0.9958,0.086,0.773,...,0.0,0.002,0.025,0.178,0.001,0.632,0.002,0.0,0.016,0.107
8673,INFP,9,0110,0,1,1,0,0.9995,0.057,0.801,...,0.0,0.001,0.028,0.180,0.001,0.632,0.002,0.0,0.017,0.111


In [101]:
cv_matrix = scipy.sparse.load_npz('countvect_matrix_lemma.npz')

In [103]:
df_cv_features = pd.concat([df_features,pd.DataFrame.sparse.from_spmatrix(cv_matrix).add_prefix('cv_')], axis = 1)

In [104]:
df_cv_features

Unnamed: 0,type,type_encoded,type_code,E_I,N_S,F_T,J_P,comp_score,neg_score,neu_score,...,cv_81769,cv_81770,cv_81771,cv_81772,cv_81773,cv_81774,cv_81775,cv_81776,cv_81777,cv_81778
0,INFJ,8,0111,0,1,1,1,0.9877,0.054,0.829,...,0,0,0,0,0,0,0,0,0,0
1,ENTP,3,1100,1,1,0,0,0.9994,0.068,0.752,...,0,0,0,0,0,0,0,0,0,0
2,INTP,11,0100,0,1,0,0,0.9985,0.074,0.751,...,0,0,0,0,0,0,0,0,0,0
3,INTJ,10,0101,0,1,0,1,0.9966,0.054,0.841,...,0,0,0,0,0,0,0,0,0,0
4,ENTJ,2,1101,1,1,0,1,0.9725,0.114,0.748,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,ISFP,13,0010,0,0,1,0,0.9887,0.105,0.755,...,0,0,0,0,0,0,0,0,0,0
8671,ENFP,1,1110,1,1,1,0,0.9999,0.046,0.726,...,0,0,0,0,0,0,0,0,0,0
8672,INTP,11,0100,0,1,0,0,0.9958,0.086,0.773,...,0,0,0,0,0,0,0,0,0,0
8673,INFP,9,0110,0,1,1,0,0.9995,0.057,0.801,...,0,0,0,0,0,0,0,0,0,0


In [105]:
tf_matrix = scipy.sparse.load_npz('tfidfvect_matrix_lemma.npz')

In [106]:
df_tf_features = pd.concat([df_features,pd.DataFrame.sparse.from_spmatrix(tf_matrix).add_prefix('tf_')], axis = 1)

In [107]:
df_tf_features

Unnamed: 0,type,type_encoded,type_code,E_I,N_S,F_T,J_P,comp_score,neg_score,neu_score,...,tf_81769,tf_81770,tf_81771,tf_81772,tf_81773,tf_81774,tf_81775,tf_81776,tf_81777,tf_81778
0,INFJ,8,0111,0,1,1,1,0.9877,0.054,0.829,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ENTP,3,1100,1,1,0,0,0.9994,0.068,0.752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,INTP,11,0100,0,1,0,0,0.9985,0.074,0.751,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,INTJ,10,0101,0,1,0,1,0.9966,0.054,0.841,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENTJ,2,1101,1,1,0,1,0.9725,0.114,0.748,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,ISFP,13,0010,0,0,1,0,0.9887,0.105,0.755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8671,ENFP,1,1110,1,1,1,0,0.9999,0.046,0.726,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8672,INTP,11,0100,0,1,0,0,0.9958,0.086,0.773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8673,INFP,9,0110,0,1,1,0,0.9995,0.057,0.801,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Write feature dataframes to pickle files

In [111]:
df_cv_features.to_pickle('../data/cleaned/df_cv_feat_lemma.pkl')

In [112]:
df_tf_features.to_pickle('../data/cleaned/df_tf_feat_lemma.pkl')