<a href="https://colab.research.google.com/github/dustin-py/DS-Unit-1-Sprint-1-Data-Wrangling-and-Storytelling/blob/master/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Research Question:** 
- ## Is there a large difference in base personality traits of individuals among a range of countries?

In [197]:
# Load in "The Big Five" data set:
! gdown --id  13JxLbd2DqgeDmm1NlOUYi67ca2v9wzC9

Downloading...
From: https://drive.google.com/uc?id=13JxLbd2DqgeDmm1NlOUYi67ca2v9wzC9
To: /content/data-final.csv
416MB [00:04, 86.2MB/s]


In [198]:
# Import external libraries:
import pandas as pd 
import numpy as np
from textblob import TextBlob
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler


# Instantiate a dataframe by using pandas:
df = pd.read_csv('data-final.csv',sep='\t')


# Get rid of unnecessary columns, by instantiating a new dataframe so that we can 
# keep the original with out having to rerun the whole note book:
big5 = df.drop(columns=['screenw','screenh','introelapse','endelapse',
                        'lat_appx_lots_of_err','long_appx_lots_of_err']).copy()


# A little further cleaning to remove the elapsed time spent on each question,
# may refer back to those columns later, but for now they will be taken out:
for attr in big5.columns:
    if '_E' in attr:
        big5.drop(columns=attr,inplace=True)


# Now lets get rid of all the country values equal to 'NONE':    
remove_none = big5.set_index('country')
remove_none.drop(index='NONE',inplace=True)
big5 = remove_none.reset_index()    
# Only samples with 'IPC' == 1 in the big5 dataframe so that we know we don't
# have repeat tests so that our sample stay valid.
big5 = big5[big5['IPC']==1]
big5

In [23]:
# function to get the question result means by country:
def question_result_means_by_country(dataframe):
    '''
    Obtain the means of the test results for each 
    country in the dataframe. 
    '''
    df = pd.DataFrame()
    for i in dataframe.columns:
        s_list = []
        x,y=1,11
        while y <= 51:
            s_list.append(dataframe[i].iloc[x:y].mean())
            series = pd.Series(s_list)    
            x,y=x+10,y+10
        df[i] = series
        s_list.clear()    
    return df
# function to create a new data frame base on the means results:
def countrys_dataframe(itr_df,dataframe):
    '''
    By iterating through a dataframe column
    and using the unique values from the iterated column to
    return a dataframe based on the new means data. 
    '''
    df = pd.DataFrame()
    for i in itr_df.country.unique():
        new = dataframe[dataframe['country']==i].iloc[:,:51]
        df[i] = new.iloc[1:].mean()
    return df

In [185]:
# Instantiate a dataframe of the 20 with the most test results:
top_20_test_numbers = pd.DataFrame(big5.country.value_counts()[:20]).index
top_20_test_numbers = pd.Series(top_20_test_numbers,name='country')
top_20_test_numbers = pd.DataFrame(top_20_test_numbers)
# instantiate a data frame of result means by country 
# useing the function created earlier:
countrys = countrys_dataframe(top_20_test_numbers,big5)        
# drop NaN values from the rows:
countrys = countrys.dropna(1)
# instantiate a new dataframe after passing it through
# the other function created earliear:
countrys = question_result_means_by_country(countrys)
# replace the numbered columns with the result section names:
countrys = countrys.rename({0:'EXT',1:'EST',2:'AGR',3:'CSN',4:'OPN'})
# reset the index in order to use the index values in a column:
countrys = countrys.reset_index()
countrys['section'] = countrys['index']
countrys = countrys.drop('index',1)
# display new cleaned dataframe:
countrys

Unnamed: 0,US,GB,CA,AU,DE,IN,PH,MX,NO,NL,SE,MY,NZ,ID,BR,SG,FR,IT,ES,PL,section
0,3.094591,3.069313,3.097143,3.090788,3.008338,3.100303,3.138585,3.087827,3.026146,3.029864,3.006741,3.148076,3.084726,3.137317,3.069526,3.145286,3.079422,3.034064,3.045647,3.039172,EXT
1,2.854795,2.996495,2.893585,2.901359,2.934456,3.085849,3.197377,3.124908,2.891218,2.877431,2.881714,3.201356,2.899694,3.156083,3.058329,3.018798,3.028044,3.028868,2.953869,3.104522,EST
2,3.280822,3.211014,3.270526,3.271627,3.160071,3.257981,3.286968,3.20647,3.202116,3.229981,3.158719,3.326353,3.246051,3.293007,3.081295,3.322218,3.161247,3.140117,3.190327,3.111994,AGR
3,3.178423,3.149463,3.16962,3.142656,3.192782,3.071763,3.191827,3.276206,3.150222,3.147553,3.135051,3.046569,3.130854,3.173296,3.228629,3.067637,3.178874,3.210888,3.223699,3.21059,CSN
4,3.248023,3.240665,3.248384,3.233901,3.251477,3.139974,3.160649,3.254622,3.2355,3.237099,3.263381,3.115007,3.217019,3.188845,3.269466,3.172744,3.275639,3.205507,3.2267,3.239172,OPN


In [186]:
# instantiate a new data frame of the previous dataframe transposed:
countrys_t = countrys.T.reset_index()
# instantiate a new 'country' column by using the index column:
countrys_t['country'] = countrys_t['index']
# now drop the unnecessary index column:
countrys_t = countrys_t.drop('index',1)
# rename the new columns by mapping them using a dictionary:
countrys_t.rename(columns={0:'Extroversion',1:'Neuroticism',2:'Agreeability',
                           3:'Conscientiousness',4:'Openness'}, inplace=True)
# instantiate data to standardize from the previous dataframe:
c = countrys_t.iloc[:-1,:-1]
# standardize the data from datafram 'c':
scaled_country_t = StandardScaler().fit_transform(c)
# instantiate a data frame using the standardized data:
scaled_country_t = pd.DataFrame(scaled_country_t,columns=c.columns)
# add the country column back to the data:
scaled_country_t['country'] = countrys_t.country[:-1]
# display dataframe:
scaled_country_t

Unnamed: 0,Extroversion,Neuroticism,Agreeability,Conscientiousness,Openness,country
0,0.413897,-1.359751,0.88897,0.263158,0.622928,US
1,-0.169055,-0.072167,-0.138863,-0.259087,0.452108,GB
2,0.472748,-1.007275,0.737374,0.104417,0.631288,CA
3,0.326182,-0.936636,0.75358,-0.381845,0.295102,AU
4,-1.575242,-0.635894,-0.888923,0.522117,0.7031,DE
5,0.545623,0.739756,0.552656,-1.66032,-1.885278,IN
6,1.428459,1.753177,0.979455,0.504886,-1.405337,PH
7,0.257907,1.094673,-0.20577,2.026565,0.776104,MX
8,-1.164563,-1.028783,-0.269867,-0.245405,0.332225,NO
9,-1.078809,-1.154069,0.140395,-0.293535,0.369337,NL




---



## Figure 1:

In [187]:
# create a function to display my plot:
def figure_1(plot_bg_color='rgba(0,0,0)',
             fig_bg_color='rgba(0,0,0,.7)',
             txt_color='White'):
    '''
    Function to display and easily edit my desired feature of my plots.
    '''
    import chart_studio.plotly as py
    color_list = [
                'dodgerblue', 'grey', 'aqua', 'darkslateblue', 'darkslategrey',
                    'darkturquoise', 'darkviolet', 'orange', 'deeppink', 'blue',
                    'blueviolet', 'brown', 'burlywood', 'cadetblue',
                    'chartreuse', 'chocolate', 'coral', 'cornflowerblue','powderblue',
                'seagreen'
    ]

    layout = go.Layout(
        paper_bgcolor=fig_bg_color,
        plot_bgcolor=plot_bg_color,
        font=dict(
            family="Courier New, monospace",
            size=16,
            color=txt_color))
    
    fig = go.Figure(layout=layout,data=[
        go.Bar(marker={'color':color_list},name=i, x=scaled_country_t.country, y=scaled_country_t[i]) for i in scaled_country_t.columns[:-1]
    ])
    # Change the bar mode
    fig.update_layout(barmode='group',
                      title="Average Personality Traits by Country",
                      xaxis_title="Countries",
                      yaxis_title="Scale",
                      yaxis = dict(
        tickmode = 'array',
        tickvals = [-1.5,0,1.5],
        ticktext = ['Disagree','Neutral','Agree']
    ))
    return fig.show() # ,py.plot(fig, filename = 'big5_heatmap', auto_open=True)
# display plot of result means by country:
figure_1()



---




In [188]:
# Create a dictionary that holds the questions related to each of the columns, 
# because in this data we are using abbreviation based on the type of question.
col_dic = {'EXT1'	:"I am the life of the party.",
           'EXT2'	:"I don't talk a lot.",
           'EXT3'	:"I feel comfortable around people.",
           'EXT4'	:"I keep in the background.",
           'EXT5'	:"I start conversations.",
           'EXT6'	:"I have little to say.",
           'EXT7'	:"I talk to a lot of different people at parties.",
           'EXT8'	:"I don't like to draw attention to myself.",
           'EXT9'	:"I don't mind being the center of attention.",
           'EXT10'	:"I am quiet around strangers.",
           'EST1'	:"I get stressed out easily.",
           'EST2'	:"I am relaxed most of the time.",
           'EST3'	:"I worry about things.",
           'EST4'	:"I seldom feel blue.",
           'EST5'	:"I am easily disturbed.",
           'EST6'	:"I get upset easily.",
           'EST7'	:"I change my mood a lot.",
           'EST8'	:"I have frequent mood swings.",
           'EST9'	:"I get irritated easily.",
           'EST10'	:"I often feel blue.",
           'AGR1'	:"I feel little concern for others.",
           'AGR2'	:"I am interested in people.",
           'AGR3'	:"I insult people.",
           'AGR4'	:"I sympathize with others' feelings.",
           'AGR5'	:"I am not interested in other people's problems.",
           'AGR6'	:"I have a soft heart.",
           'AGR7'	:"I am not really interested in others.",
           'AGR8'	:"I take time out for others.",
           'AGR9'	:"I feel others' emotions.",
           'AGR10'	:"I make people feel at ease.",
           'CSN1'	:"I am always prepared.",
           'CSN2'	:"I leave my belongings around.",
           'CSN3'	:"I pay attention to details.",
           'CSN4'	:"I make a mess of things.",
           'CSN5'	:"I get chores done right away.",
           'CSN6'	:"I often forget to put things back in their proper place.",
           'CSN7'	:"I like order.",                                
           'CSN8'	:"I shirk my duties.",                                                   
           'CSN9'	:"I follow a schedule.",                                                   
           'CSN10'	:"I am exacting in my work.",
           'OPN1'	:"I have a rich vocabulary.",                                                
           'OPN2'	:"I have difficulty understanding abstract ideas.",                       
           'OPN3'	:"I have a vivid imagination.",                                             
           'OPN4'	:"I am not interested in abstract ideas.",                                
           'OPN5'	:"I have excellent ideas.",                                                 
           'OPN6'	:"I do not have a good imagination.",                                      
           'OPN7'	:"I am quick to understand things.",                                     
           'OPN8'	:"I use difficult words.",                                                  
           'OPN9'	:"I spend time reflecting on things.",                                 
           'OPN10'	:"I am full of ideas."}          

# Create lists of the sentemint analysis of each question/column then group them 
# based on that sentiment:
def sentimentGrouping(dictionary):
    '''
    From a dictionary where the text are the values related to the keys, and
    perform sentiment analysis on the test using TextBlob library.

    Takes one parameter: dictionary={key:'value'}
    '''
    pos_obj = []
    pos_subj = []
    neg_obj = []
    neg_subj = []
    neutral = []
    for item in dictionary:
        blob = TextBlob(dictionary[item]).sentiment
        # pos_obj:
        if blob[0] > 0.0 and blob[1] <= 0.5:
            pos_obj.append(item)
        # pos_subj:
        elif blob[0] > 0.0 and blob[1] >= 0.5:
            pos_subj.append(item)
        # neg_obj:
        elif blob[0] < 0.0 and blob[1] <= 0.5:
            neg_obj.append(item)
        # neg_subj:
        elif blob[0] < 0.0 and blob[1] >= 0.5:
            neg_subj.append(item)
        # neutral:
        elif blob[0] == 0.0:
            neutral.append(item)     
    return pos_obj,pos_subj,neg_obj,neg_subj,neutral 


# use the sentiment grouping function created earlier in the notebook to create
# lists of columns base on;
'''
   1. po = positive and objective
   2. ps = positive and subjective
   3. no = negative and objective
   4. ns = negative and subjective
   5. neu = neutral 
'''
po,ps,no,ns,neu = sentimentGrouping(col_dic)



# Instantiate lists of the polarity of the sentiment of the question and 
# the subjectivity of the question:
polarity = []
subjectivity = []
# words = []
for text in col_dic:
    blob = TextBlob(col_dic[text])
    polarity.append(blob.sentiment[0])
    subjectivity.append(blob.sentiment[1])
    # words.append(blob.sentiment_assessments[2][0][0:3])


# Sentiment data frame:                                    
sentiment_data = {'pol':polarity,'subj':subjectivity}
sent_df = pd.DataFrame(sentiment_data)
questions = []
sect = ['ext','est','agr','csn','opn']
for s in sect:
    for n in range(1,11):
        questions.append(s+str(n))   
quest_ser = pd.Series(questions)                           
sent_df['types'] = quest_ser.values

---


## Figure 2:

In [189]:
# create a function to display my plot:
def figure_2(plot_bg_color='rgba(0,0,0)',
             fig_bg_color='rgba(0,0,0,.7)',
             txt_color='White'):
    '''
    Function to display and easily edit my desired feature of my plots.
    '''
    import chart_studio.plotly as py
    color_list = [
                'dodgerblue', 'grey', 'aqua', 'darkslateblue', 'darkslategrey',
                    'darkturquoise', 'darkviolet', 'orange', 'deeppink', 'blue',
                    'blueviolet', 'brown', 'burlywood', 'cadetblue',
                    'chartreuse', 'chocolate', 'coral', 'cornflowerblue','powderblue',
                'seagreen'
    ]

    layout = go.Layout(
        paper_bgcolor=fig_bg_color,
        plot_bgcolor=plot_bg_color,
        font=dict(
            family="Courier New, monospace",
            size=16,
            color=txt_color))
    clrs = ['dodgerblue','green']
    fig = go.Figure(layout=layout,data=[
        go.Bar(marker={'color':n},name=i,x=sent_df['types'], y=sent_df[i]) for i in sent_df[['pol','subj']]
        ])
    # Change the bar mode
    fig.update_layout(barmode='group',
                      title="Sentiment Analysis Per Question",
                      xaxis_title="Question Variables",
                      yaxis_title="Value Range")
    # py.plot(fig, filename = 'sentiment', auto_open=True)
    return fig.show()# ,py.plot(fig, filename = 'sentiment', auto_open=True)

# display plot of result means by country:
figure_2()



---





## Figure 3:

In [192]:
a = big5[['country','testelapse']]
emp_list = []
for i in top_20_test_numbers['country']:
    b = a[a['country']==i]
    c = b.testelapse.mean()
    emp_list.append(c)
d = pd.DataFrame(index=top_20_test_numbers.country)
d['Avg_Test_Time'] = emp_list
e = d.reset_index()
f = e.drop('Avg_Test_Time',axis=1)


# create a function to display my plot:
def figure_3(plot_bg_color='rgba(0,0,0)',
             fig_bg_color='rgba(0,0,0,.7)',
             txt_color='White'):
    '''
    Function to display and easily edit my desired feature of my plots.
    '''
    import chart_studio.plotly as py
    color_list = [
                'dodgerblue', 'grey', 'aqua', 'darkslateblue', 'darkslategrey',
                    'darkturquoise', 'darkviolet', 'orange', 'deeppink', 'blue',
                    'blueviolet', 'brown', 'burlywood', 'cadetblue',
                    'chartreuse', 'chocolate', 'coral', 'cornflowerblue','powderblue',
                'seagreen'
    ]

    layout = go.Layout(
        paper_bgcolor=fig_bg_color,
        plot_bgcolor=plot_bg_color,
        font=dict(
            family="Courier New, monospace",
            size=16,
            color=txt_color))
    
    fig = go.Figure(layout=layout,data=[
        go.Bar(marker={'color':color_list},x=e.T.rename(columns=e.country).columns,
               y=e['Avg_Test_Time'])
    ])
    # Change the bar mode
    fig.update_layout(barmode='relative',
                      title="Average Test Times",
                      xaxis_title="Country",
                      yaxis_title="Test times /sec")
    return fig.show(),py.plot(fig, filename = 'test_times', auto_open=True)
# display plot of result means by country:
figure_3()

(None, 'https://plotly.com/~dustinstri92/26/')