In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
data_url = 'https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/ESS_practice_data/ESSdata_Thinkful.csv'
df = pd.read_csv(data_url)

df.head()

Unnamed: 0,cntry,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner
0,CH,5.0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,1.0
1,CH,25.0,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,1.0
2,CH,26.0,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,1.0,24.0,2.0
3,CH,28.0,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,2.0,64.0,1.0
4,CH,29.0,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,2.0,55.0,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8594 entries, 0 to 8593
Data columns (total 13 columns):
cntry      8594 non-null object
idno       8594 non-null float64
year       8594 non-null int64
tvtot      8586 non-null float64
ppltrst    8580 non-null float64
pplfair    8555 non-null float64
pplhlp     8569 non-null float64
happy      8563 non-null float64
sclmeet    8579 non-null float64
sclact     8500 non-null float64
gndr       8584 non-null float64
agea       8355 non-null float64
partner    8577 non-null float64
dtypes: float64(11), int64(1), object(1)
memory usage: 872.9+ KB


In [4]:
# year 6 is 2012 and year 7 is 2014
df['year'] = np.where(df.year == 6, 2012, 2014)
# partner 2 is no partner, partner 1 is has partner
df['partner'] = np.where(df.partner == 2, 0, 1)
# create new feature gender; 1 = male, 2 = female
df['gender'] = np.where(df.gndr == 2, 'F', 'M')
df = df.drop('gndr',1)

df.head()

Unnamed: 0,cntry,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,agea,partner,gender
0,CH,5.0,2012,3.0,3.0,10.0,5.0,8.0,5.0,4.0,60.0,1,F
1,CH,25.0,2012,6.0,5.0,7.0,5.0,9.0,3.0,2.0,59.0,1,F
2,CH,26.0,2012,1.0,8.0,8.0,8.0,7.0,6.0,3.0,24.0,0,M
3,CH,28.0,2012,4.0,6.0,6.0,7.0,10.0,6.0,2.0,64.0,1,F
4,CH,29.0,2012,5.0,6.0,7.0,5.0,8.0,7.0,2.0,55.0,1,F


In [5]:
df = pd.concat([df, pd.get_dummies(df.year)], 1)

df.head()

Unnamed: 0,cntry,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,agea,partner,gender,2012,2014
0,CH,5.0,2012,3.0,3.0,10.0,5.0,8.0,5.0,4.0,60.0,1,F,1,0
1,CH,25.0,2012,6.0,5.0,7.0,5.0,9.0,3.0,2.0,59.0,1,F,1,0
2,CH,26.0,2012,1.0,8.0,8.0,8.0,7.0,6.0,3.0,24.0,0,M,1,0
3,CH,28.0,2012,4.0,6.0,6.0,7.0,10.0,6.0,2.0,64.0,1,F,1,0
4,CH,29.0,2012,5.0,6.0,7.0,5.0,8.0,7.0,2.0,55.0,1,F,1,0


## 1. Did people become less trusting from 2012 to 2014? Compute results for each country in the sample.

In [6]:
for country in df.cntry.unique():
    trst2012 = df[(df['year'] == 2012) & (df['cntry'] == country)].ppltrst.mean()
    trst2014 = df[(df['year'] == 2014) & (df['cntry'] == country)].ppltrst.mean()
    print('People in',country,'are LESS trusting in 2014') if trst2012 > trst2014 else print('People in',country,'are MORE trusting in 2014')

People in CH are MORE trusting in 2014
People in CZ are MORE trusting in 2014
People in DE are MORE trusting in 2014
People in ES are LESS trusting in 2014
People in NO are LESS trusting in 2014
People in SE are MORE trusting in 2014


## 2. Did people become happier from 2012 to 2014? Compute results for each country in the sample.

In [7]:
for country in df.cntry.unique():
    happy2012 = df[(df['year'] == 2012) & (df['cntry'] == country)].happy.mean()
    happy2014 = df[(df['year'] == 2014) & (df['cntry'] == country)].happy.mean()
    print('People in',country,'are not as happy in 2014') if happy2012 > happy2014 else print('People in',country,'are happier in 2014')

People in CH are happier in 2014
People in CZ are happier in 2014
People in DE are happier in 2014
People in ES are not as happy in 2014
People in NO are not as happy in 2014
People in SE are happier in 2014


## 3. Who reported watching more TV in 2012, men or women?

In [8]:
maletv = df[(df['year'] == 2012) & (df['gender'] == 'M')].tvtot.mean()
femaletv = df[(df['year'] == 2012) & (df['gender'] == 'F')].tvtot.mean()
print('Males watched more tv in 2012') if maletv > femaletv else print('Females watched more tv in 2012')

Females watched more tv in 2012


## 4. Who was more likely to believe people were fair in 2012, people living with a partner or people living alone?

In [9]:
alone = df[(df['year'] == 2012) & (df['partner'] == 0)].pplfair.mean()
not_alone = df[(df['year'] == 2012) & (df['partner'] == 1)].pplfair.mean()
print('Alone') if alone > not_alone else print('People living with a partner')

People living with a partner


## 5. Pick three or four of the countries in the sample and compare how often people met socially in 2014. Are there differences, and if so, which countries stand out?

In [10]:
for i, country in enumerate(df.cntry.unique()):
    print(country, len(df[df['cntry'] == country]))

CH 1546
CZ 1316
DE 28
ES 2426
NO 1462
SE 1816


In [11]:
cntry_list = 'CH', 'CZ', 'ES', 'NO', 'SE'
max_len = 658

In [12]:
sclmeet2014 = pd.DataFrame()
nation = []
for i, country in enumerate(cntry_list):
    nation.append(df[(df['year'] == 2014) & (df['cntry'] == country)].sclmeet.head(max_len).tolist())
    sclmeet2014[country] = nation[i]
    
print(sclmeet2014.mean())
F, p = stats.f_oneway(sclmeet2014['CH'],
                     sclmeet2014['CZ'],
                     sclmeet2014['ES'],
                     sclmeet2014['NO'],
                     sclmeet2014['SE'])
print(F)
print(p)


CH    5.179604
CZ    4.445802
ES    5.338415
NO    5.300912
SE    5.276596
dtype: float64
nan
nan


## 6. Pick three or four of the countries in the sample and compare how often people took part in social activities, relative to others their age, in 2014. Are there differences, and if so, which countries stand out?

In [13]:
sclact2014 = pd.DataFrame()
nation = []
for i, country in enumerate(cntry_list):
    nation.append(df[(df['year'] == 2014) & (df['cntry'] == country)].sclact.head(max_len).tolist())
    sclact2014[country] = nation[i]
    
print(sclact2014.mean())
F, p = stats.f_oneway(sclact2014['CH'],
                     sclact2014['CZ'],
                     sclact2014['ES'],
                     sclact2014['NO'],
                     sclact2014['SE'])
print(F)
print(p)


CH    2.777266
CZ    2.703077
ES    2.609302
NO    2.881459
SE    2.865649
dtype: float64
nan
nan
