In [1]:
import pandas as pd

In [2]:
# https://github.com/pandas-profiling/pandas-profiling
# face-palm moment ... don't forget to import ;)
import pandas_profiling

# Make some dummy data into table / dataframe

In [2]:
# https://www.sharpsightlabs.com/blog/pandas-dataframe/
# Column-oriented dummy data: one list per future dataframe column.
country_gdp_dict = dict(
    country=['USA', 'China', 'Japan', 'Germany', 'UK', 'India'],
    GDP=[19390604, 12237700, 4872137, 3677439, 2622434, 2597491],
)

In [3]:
df = pd.DataFrame(country_gdp_dict)

In [4]:
df.head()

Unnamed: 0,country,GDP
0,USA,19390604
1,China,12237700
2,Japan,4872137
3,Germany,3677439
4,UK,2622434


~~Make a new column for dataframe X~~

In [5]:
#https://stackoverflow.com/questions/16327055/how-to-add-an-empty-column-to-a-dataframe
#df["been_there"] = ""

In [6]:
#df.head()

# Make target column Y in dataframe X into a list to iterate over

In [7]:
# https://stackoverflow.com/questions/23748995/pandas-dataframe-column-to-list
# https://stackoverflow.com/questions/22341271/get-list-from-pandas-dataframe-column?noredirect=1&lq=1
countries = df['country'].values.tolist()
countries

['USA', 'China', 'Japan', 'Germany', 'UK', 'India']

# Create question & answer function to query user per item

In [8]:
# question text
# acceptable answers list (in only lower case)
def question_user_for_input(question, acceptable_ans):
    """Prompt the user until they give one of the accepted answers.

    Parameters
    ----------
    question : str
        Prompt text passed to ``input``.
    acceptable_ans : list of str
        Accepted answers, written in lower case.

    Returns
    -------
    str
        The accepted answer with surrounding whitespace removed and
        lower-cased, so the returned value is always a member of
        ``acceptable_ans``.
    """
    while True:
        # Normalise once: the same value is used for the membership check and
        # returned, so "YES" or " yes " are stored as "yes" rather than verbatim.
        user_answer = input(question).strip().lower()
        if user_answer in acceptable_ans:
            return user_answer
        print("Please answer using the correct format:")
        print(acceptable_ans)

In [9]:
print(question_user_for_input("Do you like sushi? ",['yes','no']))

Do you like sushi? cheese
Please answer using the correct format:
['yes', 'no']
Do you like sushi? n
Please answer using the correct format:
['yes', 'no']
Do you like sushi? yes
yes


# Use list comprehension & your new function to create a new list from the old

In [10]:
# https://stackoverflow.com/questions/10834960/how-to-do-multiple-arguments-to-map-function-where-one-remains-the-same-in-pytho
# Ask the same yes/no question once per country, keeping the answers in row order.
been_there_query = [
    question_user_for_input("Have you been to " + country + "? ", ['yes', 'no'])
    for country in countries
]

Have you been to USA? yes
Have you been to China? yes
Have you been to Japan? yes
Have you been to Germany? no
Have you been to UK? no
Have you been to India? yes


In [11]:
been_there_query

['yes', 'yes', 'yes', 'no', 'no', 'yes']

# Add list to dataframe X as new column Y

In [12]:
# searched: add list to dataframe as new column
# Add column in dataframe from list
# https://stackoverflow.com/questions/26666919/add-column-in-dataframe-from-list
df['been_there'] = been_there_query

In [13]:
df.head()

Unnamed: 0,country,GDP,been_there
0,USA,19390604,yes
1,China,12237700,yes
2,Japan,4872137,yes
3,Germany,3677439,no
4,UK,2622434,no


# Compose the above functionality into a function

In [14]:
# ['yes','no']
def update_df_with_set_answer_q(df_in, q_func, q_text, ans_list, tar_col, new_col):
    """Ask a fixed-choice question for every row and store the answers in a new column.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Source dataframe; it is not modified.
    q_func : callable
        Called as ``q_func(prompt, ans_list)`` once per row; its return value
        becomes that row's entry in the new column.
    q_text : str
        Question text placed in front of each row's target value.
    ans_list : list
        Accepted answers, forwarded unchanged to ``q_func``
        (for example ``['yes', 'no']``).
    tar_col : str
        Name of the column whose values are asked about.
    new_col : str
        Name of the column that receives the answers.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_in`` with ``new_col`` added (or overwritten).
    """
    targets = df_in[tar_col].values.tolist()
    # Build the prompt with an f-string so non-string column values
    # (numbers, for instance) do not raise a TypeError on concatenation.
    answers = [q_func(f"{q_text}{target}: ", ans_list) for target in targets]
    df_out = df_in.copy()
    df_out[new_col] = answers
    return df_out

In [15]:
# let's test the function above
df_2 = update_df_with_set_answer_q(
    df_in=df,
    q_func=question_user_for_input,
    q_text="Does this country primarily use the latin alphabet? ",
    ans_list=['yes', 'no'],
    tar_col="country",
    new_col="latin_alphabet_main",
)

Does this country primarily use the latin alphabet? USA: yes
Does this country primarily use the latin alphabet? China: no
Does this country primarily use the latin alphabet? Japan: no
Does this country primarily use the latin alphabet? Germany: eys
Please answer using the correct format:
['yes', 'no']
Does this country primarily use the latin alphabet? Germany: yes
Does this country primarily use the latin alphabet? UK: yes
Does this country primarily use the latin alphabet? India: yes


In [16]:
df_2.head()

Unnamed: 0,country,GDP,been_there,latin_alphabet_main
0,USA,19390604,yes,yes
1,China,12237700,yes,no
2,Japan,4872137,yes,no
3,Germany,3677439,no,yes
4,UK,2622434,no,yes


# Create meta-data (tag) column dynamically

In [17]:
# open-ended feedback / question text
def ask_user_for_feedback(prompt):
    """Show ``prompt`` followed by a single space and return what the user types."""
    padded_prompt = prompt + " "
    reply = input(padded_prompt)
    return reply

In [18]:
# open-ended variant: there is no fixed list of accepted answers
def update_df_with_open_answer_q(df_in, q_func, q_text, tar_col, new_col):
    """Ask an open-ended question for every row and store the replies in a new column.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Source dataframe; it is not modified.
    q_func : callable
        Called as ``q_func(prompt)`` once per row; its return value becomes
        that row's entry in the new column.
    q_text : str
        Question text placed in front of each row's target value.
    tar_col : str
        Name of the column whose values are asked about.
    new_col : str
        Name of the column that receives the replies.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_in`` with ``new_col`` added (or overwritten).
    """
    targets = df_in[tar_col].values.tolist()
    # Build the prompt with an f-string so non-string column values
    # (numbers, for instance) do not raise a TypeError on concatenation.
    replies = [q_func(f"{q_text}{target}: ") for target in targets]
    df_out = df_in.copy()
    df_out[new_col] = replies
    return df_out

In [19]:
df_3 = update_df_with_open_answer_q(
    df_in=df_2,
    q_func=ask_user_for_feedback,
    q_text="Write some keywords to describe this country: ",
    tar_col="country",
    new_col="keywords",
)

Write some keywords to describe this country: USA:  blah blah blah
Write some keywords to describe this country: China:  cheese
Write some keywords to describe this country: Japan:  banana
Write some keywords to describe this country: Germany:  frog
Write some keywords to describe this country: UK:  tree
Write some keywords to describe this country: India:  rabbit


In [20]:
df_3.head(6)

Unnamed: 0,country,GDP,been_there,latin_alphabet_main,keywords
0,USA,19390604,yes,yes,blah blah blah
1,China,12237700,yes,no,cheese
2,Japan,4872137,yes,no,banana
3,Germany,3677439,no,yes,frog
4,UK,2622434,no,yes,tree
5,India,2597491,yes,yes,rabbit


# Export dataframe as CSV for future work

In [22]:
# https://chrisalbon.com/python/data_wrangling/pandas_saving_dataframe_as_csv/
####df_3.to_csv('example.csv')

In [3]:
# https://chrisalbon.com/python/data_wrangling/pandas_dataframe_importing_csv/
# The file's first column is the unnamed index that to_csv writes by default
# (pandas would otherwise load it as "Unnamed: 0"); read it back as the index
# in one step instead of loading it as data and dropping it afterwards.
df_3_test = pd.read_csv('example.csv', index_col=0)
df_3_test.head(6)

Unnamed: 0,country,GDP,been_there,latin_alphabet_main,keywords
0,USA,19390604,yes,yes,free casual competitive
1,China,12237700,yes,no,huge diverse oppressive formal
2,Japan,4872137,yes,no,formal quiet clean timely
3,Germany,3677439,no,yes,timely prosperous dark humble
4,UK,2622434,no,yes,subtle independent proud
5,India,2597491,yes,yes,industrious dirty huge competitive


# Extract insights into keywords to normalize data
- freedom of expression
- politeness level
- competitiveness
- cleanliness
- punctuality
- explicitness of communication
- size of population

By the way: did you know that "IIUC", in Internet slang, stands for "If I understand correctly"?

In [4]:
#https://github.com/pandas-profiling/pandas-profiling
#conda install -c conda-forge pandas-profiling
#!pip install --user pandas-profiling

In [5]:
#### toggle between df_3_test and df_3 for live demo vs production
# NOTE(review): profile_report is not a core pandas DataFrame method; presumably it is
# attached to DataFrame by `import pandas_profiling` -- confirm the import cell has run first.
profile = df_3_test.profile_report(title='Pandas Profiling Report')
# Save the generated report to the relative path "output.html".
profile.to_file(output_file="output.html")

# todo: Create "add keywords to entry" function
- print entire entry to console
- take in keywords input from user
- append keywords input to keywords field (col) in entry (row)
- add new keywords to corpus with associated map key:value pair

# todo: Create "tag text" to "tag dict"
example: `"clean homogeneous" --> { "cleanliness": 1, "diversity": -1 }`

This will be a two step process:
1. ~~iterate over all the tags to create a meta-data/tag corpus~~
2. ~~assign to each tag a value & associated key~~

In [16]:
tag_texts = df_3_test['keywords'].values.tolist()

In [17]:
def build_tag_corpus(list_in):
    """Count how often each whitespace-separated tag occurs across all entries.

    Parameters
    ----------
    list_in : iterable
        Keyword strings such as ``"formal quiet clean"``. Entries that are
        not strings are skipped.

    Returns
    -------
    dict
        Maps each tag to its number of occurrences, in first-seen order.
    """
    corpus_out = {}
    for entry in list_in:
        # A blank keywords cell read back from CSV arrives as NaN (a float),
        # which has no .split(); skip it instead of crashing.
        if not isinstance(entry, str):
            continue
        for tag in entry.split():
            corpus_out[tag] = corpus_out.get(tag, 0) + 1
    return corpus_out

In [19]:
# note: this may also be useful as a frequency generator
corpus = build_tag_corpus(tag_texts)
print("corpus:",corpus)

corpus: {'free': 1, 'casual': 1, 'competitive': 2, 'huge': 2, 'diverse': 1, 'oppressive': 1, 'formal': 2, 'quiet': 1, 'clean': 1, 'timely': 2, 'prosperous': 1, 'dark': 1, 'humble': 1, 'subtle': 1, 'independent': 1, 'proud': 1, 'industrious': 1, 'dirty': 1}


In [26]:
def build_tag_map_from_corpus(map_in):
    """Interactively group corpus tags under categories with a numeric value each.

    For every key of ``map_in`` the user is asked for a category name and an
    integer value.

    Parameters
    ----------
    map_in : dict
        Corpus whose keys are the tags to classify; the values are not used.

    Returns
    -------
    dict
        ``{category: {tag: value, ...}, ...}``.
    """
    map_out = {}
    for k in map_in:
        # Strip so "freedom" and "freedom " land in the same category.
        new_main_key = input(f"Please enter categorical tag for '{k}': ").strip()
        # Re-prompt on a non-integer: a single typo would otherwise raise
        # ValueError and throw away every answer entered so far.
        while True:
            raw_val = input(f"Please enter numerical scalar for tag '{k}': ")
            try:
                new_val = int(raw_val)
            except ValueError:
                print(f"'{raw_val}' is not a whole number, please try again.")
            else:
                break
        map_out.setdefault(new_main_key, {})[k] = new_val
    return map_out

In [27]:
#### uncomment to re-run corpus generation (required on a fresh kernel: the next cell prints coded_corpus)
#coded_corpus = build_tag_map_from_corpus(corpus)

Please enter categorical tag for 'free': freedom
Please enter numerical scalar for tag 'free': 1
Please enter categorical tag for 'casual': politeness
Please enter numerical scalar for tag 'casual': -1
Please enter categorical tag for 'competitive': competitiveness
Please enter numerical scalar for tag 'competitive': 1
Please enter categorical tag for 'huge': pop_size
Please enter numerical scalar for tag 'huge': 1
Please enter categorical tag for 'diverse': diversity
Please enter numerical scalar for tag 'diverse': 1
Please enter categorical tag for 'oppressive': freedom
Please enter numerical scalar for tag 'oppressive': -1
Please enter categorical tag for 'formal': politeness
Please enter numerical scalar for tag 'formal': 1
Please enter categorical tag for 'quiet': noisiness
Please enter numerical scalar for tag 'quiet': -1
Please enter categorical tag for 'clean': cleanliness
Please enter numerical scalar for tag 'clean': 1
Please enter categorical tag for 'timely': punctuality
Pl

In [28]:
print(coded_corpus)

{'freedom': {'free': 1, 'oppressive': -1}, 'politeness': {'casual': -1, 'formal': 1}, 'competitiveness': {'competitive': 1}, 'pop_size': {'huge': 1}, 'diversity': {'diverse': 1}, 'noisiness': {'quiet': -1}, 'cleanliness': {'clean': 1, 'dirty': -1}, 'punctuality': {'timely': 1}, 'prosperity': {'prosperous': 1}, 'dark_history': {'dark': 1}, 'humility': {'humble': 1, 'proud': -1}, 'subtlety': {'subtle': 1}, 'independency': {'independent': 1}, 'industriousness': {'industrious': 1}}


# todo: Create column from tag text with key value map corpus
with a scale from -1 (negative) to 1 (positive); 0 (neutral) and NaN are also options

Now that we have a coded corpus, we can go ahead and create new columns for the dataframe using the overarching meta-tags (instead of 'casual' and 'formal' we can simply use 'politeness' with values of -1 or 1).