In [1]:
#imports
import pandas as pd
import ftr_classifier as ftr

### File types
When working with languages that use many non-ASCII characters (e.g. Dutch and German), it is
recommended that researchers DO NOT use the .csv file format. Despite claiming to support UTF-8, some versions
of Excel mangle non-ASCII characters when .csv files are opened and saved. These vignettes therefore use
Python's native object serialization format, pickle. If a user wants to save to a format that can be opened outside of
Python, the .xlsx format is recommended. However, doing so will mangle any spaCy docs stored in the dataframe. If the user wants to save and reload Python data objects (lists, spaCy docs, etc.), pickle is recommended.

In [3]:
# Load the example data: read the pickled dataframe from ./toy_data
# (the path is relative to the current working directory).
# NOTE: only unpickle files from a trusted source -- loading a pickle can execute arbitrary code.
df = pd.read_pickle('./toy_data/test.pkl')

# Preview the first rows
df.head()

Unnamed: 0,response,textLang
0,"ja, hij is moe.",dutch
1,hij zal me bellen.,dutch
2,het water is koud.,dutch
3,de zon komt morgen om zes uur op.,dutch
4,de jongen krijgt het geld volgende week. wanne...,dutch


In [5]:
# Classify responses.
# Either call the individual steps one after another:
#df_class = ftr.prepare(df)
#df_class = ftr.score(df_class)
#df_class = ftr.apply_dominance(df_class)

## or (recommended) use the single call:
df_class = ftr.classify_df(df)

# Show results
df_class.head()

Unnamed: 0,response,textLang,spacy_doc,final_sentence,response_clean,present,future,verb_poss,verb_cert,adv_adj_poss,...,particle,will_future,go_future,negated,present_dom,go_future_dom,will_future_dom,future_dom,lexi_poss,lexi_cert
0,"ja, hij is moe.",dutch,"(ja, ,, hij, is, moe, .)","(ja, ,, hij, is, moe, .)","[ja, hij, is, moe]",1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,hij zal me bellen.,dutch,"(hij, zal, me, bellen, .)","(hij, zal, me, bellen, .)","[hij, zal, me, bellen]",1,1,0,0,0,...,0,1,0,0,0,0,1,1,0,0
2,het water is koud.,dutch,"(het, water, is, koud, .)","(het, water, is, koud, .)","[het, water, is, koud]",1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,de zon komt morgen om zes uur op.,dutch,"(de, zon, komt, morgen, om, zes, uur, op, .)","(de, zon, komt, morgen, om, zes, uur, op, .)","[de, zon, komt, morgen, om, zes, uur, op]",1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,de jongen krijgt het geld volgende week. wanne...,dutch,"(de, jongen, krijgt, het, geld, volgende, week...","(wanneer, hij, het, krijgt, ,, dan, gaat, hij,...","[wanneer, hij, het, krijgt, dan, gaat, hij, ee...",1,1,0,0,0,...,0,0,1,0,0,1,0,1,0,0


In [7]:
# =============================================================================
# Count lemma frequency
# Takes the classified dataframe (df_class) produced by ftr.classify_df().
# =============================================================================
df_lemma_count = ftr.count_lemmas(df_class)

# Display results
df_lemma_count.head()

Unnamed: 0,language,feature,lemma,count,num_responses,num_words
348,german,will_future,werden,0,40,319
288,german,verb_poss,können_IND,0,40,319
289,german,verb_poss,können_SUBJ,0,40,319
290,german,verb_poss,sollen_IND,0,40,319
291,german,verb_poss,sollen_SUBJ,0,40,319


In [None]:
# =============================================================================
# ## Clean up dataframe
# It is recommended that, after calling ftr.classify_df() and ftr.count_lemmas(),
# the user calls ftr.clean_spacy(), which drops the automatically created columns
# containing spaCy documents. These are memory intensive and are not needed
# once the responses have been classified and counted.
#
# NOTE(review): this rebinds df_class to the cleaned frame. Whether clean_spacy()
# tolerates being run a second time on an already-cleaned frame is not shown
# here -- if in doubt, re-run the classification cell before re-running this one.
# =============================================================================
df_class = ftr.clean_spacy(df_class)

In [None]:
# =============================================================================
# Save to the desired format (pickle or .xlsx recommended).
# Files are written to the current working directory.
# =============================================================================
## pickle
df_class.to_pickle('df_class.pkl')
df_lemma_count.to_pickle('df_lemma_count.pkl')

## excel (pandas needs an Excel writer engine, e.g. openpyxl, to be installed)
df_class.to_excel('df_class.xlsx',index=False) # index=False omits the pandas index column from the saved file
df_lemma_count.to_excel('df_lemma_count.xlsx',index=False)