# Data distributions and visualizations: PICAPS

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
pd.plotting.register_matplotlib_converters()
%matplotlib inline

# Set the pandas display options below to None so rows, columns and cell text
# are not truncated when frames are shown (for 'width', None lets pandas
# detect the width itself).
for display_option in ('max_rows', 'max_columns', 'width', 'max_colwidth'):
    pd.set_option('display.' + display_option, None)

In [None]:
# Import the clean dataset: read 'data_clean.csv' (path relative to the
# working directory) into the `data` DataFrame used by the cells below.
data=pd.read_csv('data_clean.csv')

In [None]:
# Show the (rows, columns) dimensions of the loaded dataset.
data.shape

## Data of beneficiaries interviewed

### Distribution of age 

In [None]:
# Number of non-null ids recorded for each distinct age value.
age = data.groupby('age')[['id']].count()

fig, ax = plt.subplots(figsize=(16, 9))

# One bar per age value, drawn on the axes created above.
sns_plot = sns.barplot(x=age.index, y=age['id'], ax=ax)

# Labels are set after plotting so they replace seaborn's automatic ones.
sns_plot.set_title('Beneficiaries age distribution', fontsize=24)
sns_plot.set_ylabel('Counts', fontsize=20)
sns_plot.set_xlabel('Age', fontsize=20)

# Export the chart as a PDF.
fig.savefig('output_figures/beneficiaries_age_dist.pdf')

In [None]:
# Mean and standard deviation of the 'age' column, both printed rounded to
# whole numbers (zero decimals, minimum width 2).
age_mean = data['age'].mean()
age_std = data['age'].std()
print(f"Mean age with standard deviation: {age_mean:2.0f} \u00B1 {age_std:2.0f}")

### Distribution of literacy (alphabetization) languages

In [None]:
# Language counts are read from a CSV, indexed by its 'language' column.
lang = pd.read_csv('output_files/languages.csv', index_col='language')

fig, ax = plt.subplots(figsize=(11, 4))
ax.set_title('Alphabetization language of beneficiaries', fontsize=18)

sns_plot = sns.barplot(x=lang.index, y=lang['counts'], ax=ax)

# Replace the tick labels with readable names.
# NOTE(review): labels are assigned by position, so this assumes the CSV rows
# come in exactly this order -- confirm against output_files/languages.csv.
# Disabled seven-label variant:
#plt.xticks(np.arange(7), ('French-Moore-Lyele', 'Moore-French', 'French-Lyele',  'Lyele-Nuni', 'Moore', 'French', 'None'))
sns_plot.set_xticks(np.arange(4))
sns_plot.set_xticklabels(('Lyele-Nuni', 'Moore', 'French', 'None'))

sns_plot.set_xlabel('Language', fontsize=16)
sns_plot.set_ylabel('Counts', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/beneficiaries_lang_dist.pdf')

### Distribution of schooling

In [None]:
# Number of non-null ids recorded for each 'school_level' value.
schooling = data.groupby('school_level')[['id']].count()

fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title('Schooling level of beneficiaries', fontsize=18)

sns_plot = sns.barplot(x=schooling.index, y=schooling['id'], ax=ax)

sns_plot.set_ylabel('Counts', fontsize=16)

# NOTE(review): tick labels are assigned by position, so this assumes exactly
# four school_level groups sorted in this order -- confirm against the data.
sns_plot.set_xticks(np.arange(4))
sns_plot.set_xticklabels(('Never', 'Pre-school', 'Primary', 'Secondary'))
sns_plot.set_xlabel('School level', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/beneficiaries_school_dist.pdf')

## Data about kids of beneficiaries interviewed

### Distribution of number of kids per family

In [None]:
# Number of non-null ids recorded for each 'num_kids' value.
kids = data.groupby('num_kids')[['id']].count()

fig, ax = plt.subplots(figsize=(11, 4))

sns_plot = sns.barplot(x=kids.index, y=kids['id'], ax=ax)

# Labels are set after plotting so they replace seaborn's automatic ones.
sns_plot.set_title('Number of kids per family', fontsize=18)
sns_plot.set_ylabel('Number of families', fontsize=16)
sns_plot.set_xlabel('Number of kids', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/kids_number_dist.pdf')

In [None]:
# Mean and standard deviation of the 'num_kids' column, both printed rounded
# to whole numbers (zero decimals, minimum width 2).
kids_mean = data['num_kids'].mean()
kids_std = data['num_kids'].std()
print(f"Mean number of kids with standard deviation: {kids_mean:2.0f} \u00B1 {kids_std:2.0f}")

In [None]:
# Number of non-null ids per 'num_kids_noschool' value; the group with value 0
# is removed so it does not appear in the chart.
kids_noschool = data.groupby('num_kids_noschool')[['id']].count().drop(0)

fig, ax = plt.subplots(figsize=(9, 4))

sns_plot = sns.barplot(x=kids_noschool.index, y=kids_noschool['id'], ax=ax)

sns_plot.set_title('Number of kids per family with no schooling', fontsize=18)
sns_plot.set_ylabel('Number of families', fontsize=16)
sns_plot.set_xlabel('Number of kids', fontsize=16)
#plt.xlim(1,4)

# Export the chart as a PDF.
fig.savefig('output_figures/kids_noschool_dist.pdf')

In [None]:
# Number of non-null ids per 'num_kids_primary' value; the group with value 0
# is removed so it does not appear in the chart.
kids_primary = data.groupby('num_kids_primary')[['id']].count().drop(0)

fig, ax = plt.subplots(figsize=(11, 4))

sns_plot = sns.barplot(x=kids_primary.index, y=kids_primary['id'], ax=ax)

sns_plot.set_title('Number of kids per family in primary school', fontsize=18)
sns_plot.set_ylabel('Number of families', fontsize=16)
sns_plot.set_xlabel('Number of kids', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/kids_primary_dist.pdf')

In [None]:
# Number of non-null ids per 'num_kids_secondary' value; the group with value
# 0 is removed so it does not appear in the chart.
kids_secondary = data.groupby('num_kids_secondary')[['id']].count().drop(0)

fig, ax = plt.subplots(figsize=(9, 4))

sns_plot = sns.barplot(x=kids_secondary.index, y=kids_secondary['id'], ax=ax)

sns_plot.set_title('Number of kids per family in secondary school', fontsize=18)
sns_plot.set_ylabel('Number of families', fontsize=16)
sns_plot.set_xlabel('Number of kids', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/kids_secondary_dist.pdf')

In [None]:
# Number of non-null ids per 'num_kids_left_school' value; the group with
# value 0 is removed so it does not appear in the chart.
kids_left_school = data.groupby('num_kids_left_school')[['id']].count().drop(0)

fig, ax = plt.subplots(figsize=(6, 4))

sns_plot = sns.barplot(x=kids_left_school.index, y=kids_left_school['id'], ax=ax)

sns_plot.set_title('Number of kids per family who left school', fontsize=18)
sns_plot.set_ylabel('Number of families', fontsize=16)
sns_plot.set_xlabel('Number of kids', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/kids_leftschool_dist.pdf')

### Distribution of number of girls per family

In [None]:
# Number of non-null ids recorded for each 'num_girls' value.
girls = data.groupby('num_girls')[['id']].count()

fig, ax = plt.subplots(figsize=(11, 4))

sns_plot = sns.barplot(x=girls.index, y=girls['id'], ax=ax)

# Labels are set after plotting so they replace seaborn's automatic ones.
sns_plot.set_title('Number of girls per family', fontsize=18)
sns_plot.set_ylabel('Number of families', fontsize=16)
sns_plot.set_xlabel('Number of girls', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/girls_numbers_dist.pdf')

In [None]:
# Number of non-null ids per 'num_girls_noschool' value; the group with value
# 0 is removed so it does not appear in the chart.
girls_noschool = data.groupby('num_girls_noschool')[['id']].count().drop(0)

fig, ax = plt.subplots(figsize=(6, 4))

sns_plot = sns.barplot(x=girls_noschool.index, y=girls_noschool['id'], ax=ax)

sns_plot.set_title('Number of girls per family with no schooling', fontsize=18)
sns_plot.set_ylabel('Number of families', fontsize=16)
sns_plot.set_xlabel('Number of girls', fontsize=16)

# Export the chart as a PDF.
fig.savefig('output_figures/girls_noschool_dist.pdf')

In [None]:
# Count the non-null ids for each 'num_girls_primary' value.
girls_primary=data[['id','num_girls_primary']].groupby('num_girls_primary').count()
# Remove the group with value 0 so it does not appear in the chart.
girls_primary.drop(0, inplace=True)

plt.figure(figsize=(11,4))

sns_plot=sns.barplot(x=girls_primary.index,y=girls_primary['id'])

plt.title('Number of girls per family in primary school', fontsize=18)
plt.ylabel('Number of families', fontsize=16)
plt.xlabel('Number of girls', fontsize=16)

fig=sns_plot.get_figure()
# Save under a girls-specific file name. The previous target,
# 'kids_primary_dist.pdf', is the file written by the kids-in-primary-school
# cell, so this chart was silently overwriting it.
fig.savefig('output_figures/girls_primary_dist.pdf')

In [None]:
# Count the non-null ids for each 'num_girls_secondary' value.
girls_secondary=data[['id','num_girls_secondary']].groupby('num_girls_secondary').count()
# Remove the group with value 0 so it does not appear in the chart.
girls_secondary.drop(0, inplace=True)

plt.figure(figsize=(6,4))

sns_plot=sns.barplot(x=girls_secondary.index,y=girls_secondary['id'])

plt.title('Number of girls per family in secondary school', fontsize=18)
plt.ylabel('Number of families', fontsize=16)
plt.xlabel('Number of girls', fontsize=16)

fig=sns_plot.get_figure()
# Save under a girls-specific file name. The previous target,
# 'kids_secondary_dist.pdf', is the file written by the
# kids-in-secondary-school cell, so this chart was silently overwriting it.
fig.savefig('output_figures/girls_secondary_dist.pdf')

In [None]:
# Count the non-null ids for each 'num_girls_left_school' value.
girls_left_school=data[['id','num_girls_left_school']].groupby('num_girls_left_school').count()
# Remove the group with value 0 so it does not appear in the chart.
girls_left_school.drop(0, inplace=True)

plt.figure(figsize=(6,4))

sns_plot=sns.barplot(x=girls_left_school.index,y=girls_left_school['id'])

plt.title('Number of girls per family who left school', fontsize=18)
plt.ylabel('Number of families', fontsize=16)
# The x axis counts girls (column 'num_girls_left_school'); the label used to
# read 'Number of kids', copied from the kids charts.
plt.xlabel('Number of girls', fontsize=16)

fig=sns_plot.get_figure()
fig.savefig('output_figures/girls_leftschool_dist.pdf')

## Beneficiaries' life situation: expenses, revenues, etc.

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Overlay the two histograms (KDE curve disabled) on the same axes.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the output stays the same -- consider sns.histplot when upgrading.
for column, legend_label in (('total_revenue', "Total revenues"),
                             ('total_expenses', "Total expenses")):
    sns_plot = sns.distplot(a=data[column], label=legend_label, kde=False, ax=ax)

sns_plot.legend(fontsize=18)
sns_plot.set_title("Distribution of revenues/expenses per family", fontsize=20)
sns_plot.set_ylabel('Number of families', fontsize=18)
sns_plot.set_xlabel('CFA', fontsize=18)
# Only amounts between 0 and 2,000,000 are shown.
sns_plot.set_xlim(0, 2000000)

# Export the chart as a PDF.
fig.savefig('output_figures/total_expenses_revenues_dist.pdf')

In [None]:
plt.figure(figsize=(8,10))

#f, ax = plt.subplots(figsize=(10, 8))
# One point per row of `data`: total revenue on x, total expenses on y.
sns_plot=sns.scatterplot(x=data['total_revenue'], y=data['total_expenses'])

# Reference diagonal (expenses == revenues) spanning the visible range.
a=[0,500000]
b=a
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns_plot=sns.lineplot(x=a, y=b)

# Restrict the view to 0-500,000 on both axes.
plt.xlim(0,500000)
plt.ylim(0,500000)
#plt.title('Correlation total revenues-expenses per family', fontsize=20)
# Currency abbreviation corrected from 'FCA' to 'CFA', matching the label used
# on the revenues/expenses histogram.
plt.ylabel('Total expenses (CFA)', fontsize=18)
plt.xlabel('Total revenues (CFA)', fontsize=18)

fig=sns_plot.get_figure()
fig.savefig('output_figures/total_expenses_vs_revenues.pdf')

In [None]:
# Joint view of revenues vs expenses: regression fit in the centre, marginal
# distributions on the sides.
sns_plot_joint=sns.jointplot(x=data['total_revenue'], y=data['total_expenses'], kind='reg', truncate=True)
#plt.xlim(0,1000000)
#plt.ylim(0,500000)
#plt.title('Correlation total revenues-expenses per family', fontsize=20)
# Label the joint axes through the grid object. plt.xlabel/plt.ylabel act on
# whichever axes is "current", which is not guaranteed to be the joint axes of
# a JointGrid. The currency abbreviation is also corrected from 'FCA' to 'CFA'.
sns_plot_joint.set_axis_labels('Total revenues (CFA)', 'Total expenses (CFA)', fontsize=18)

In [None]:
# Hard-coded average revenue shares (percent); the source names below are
# matched to the values by position.
revenue_sources = ('Cereals', 'Small Business', 'Other', 'Pork', 'Remittance',
                   'Vegetables', 'Small ruminants', 'Chickens', 'Cows')
revenues = pd.DataFrame(
    {'Revenues_percent': [42.1, 21.4, 10.5, 8.0, 5.2, 4.3, 4.1, 4.0, 0.2]}
)

fig, ax = plt.subplots(figsize=(14, 6))
sns_plot = sns.barplot(x=revenues.index, y=revenues['Revenues_percent'], ax=ax)

# Replace the numeric positions 0..8 with the source names.
sns_plot.set_xticks(np.arange(len(revenue_sources)))
sns_plot.set_xticklabels(revenue_sources, fontsize=12)
sns_plot.set_ylabel('Percentage %', fontsize=18)
sns_plot.set_title('Average of percentage of revenues', fontsize=20)

# Export the chart as a PDF.
fig.savefig('output_figures/average_percent_revenues_dist.pdf')

In [None]:
# Hard-coded average expense shares (percent); the category names below are
# matched to the values by position.
expense_categories = ('Food', 'School fees', 'Agro-pastoral', 'Other',
                      'Medicine', 'Community life', 'Construction')
expenses = pd.DataFrame(
    {'Expenses_percent': [38.1, 14.2, 12.9, 12.5, 11.3, 7.9, 3.0]}
)

fig, ax = plt.subplots(figsize=(14, 6))
sns_plot = sns.barplot(x=expenses.index, y=expenses['Expenses_percent'], ax=ax)

# Replace the numeric positions 0..6 with the category names.
sns_plot.set_xticks(np.arange(len(expense_categories)))
sns_plot.set_xticklabels(expense_categories, fontsize=12)
sns_plot.set_ylabel('Percentage %', fontsize=18)
sns_plot.set_title('Average of percentage of expenses', fontsize=20)

# Export the chart as a PDF.
fig.savefig('output_figures/average_percent_expenses_dist.pdf')