# Campaign Data

Data analysis for a national environmental campaign. We wanted to understand:
* Number of events
* Number of school events
* Number of participating schools
* Number of volunteers from schools
* Dates of events
* Social media outreach
* Specific participation from diving teams

## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime
import scipy.stats

import matplotlib.pyplot as plt
import seaborn as sns

# Show up to 500 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 500

In [2]:
# Load the two semicolon-separated source tables.
# dfm: events dataset, dfp: participants dataset.
dfm = pd.read_csv('Files/activities.csv', sep=';')
dfp = pd.read_csv('Files/organization_participations.csv', sep=';')

In [5]:
# Write a reduced copy of the events table (only the columns listed in
# `to_keep`) to CSV so it can be edited further outside the notebook.
# The DataFrame index is written as the first, unnamed column.
to_keep = [
    'id',
    'activity_category',
    'meetup_date',
    'longitude',
    'latitude',
]
dfm.loc[:, to_keep].to_csv('activities_anonymized.csv')

# Preliminary Analysis

In [4]:
# Total number of events = number of rows in the events table.
print('Δράσεις Συνολικά:', len(dfm))

Δράσεις Συνολικά: 1595


In [5]:
# Number of events organised by schools (rows whose activity_category_id is 2).
school_event_mask = dfm['activity_category_id'] == 2
print('Δράσεις σχολείων:', len(dfm[school_event_mask]))

Δράσεις σχολείων: 862


# Feedback Analysis

In [3]:
# Load the inputs for the feedback analysis (all semicolon-separated):
#   df_map - the activities table (same file as the events dataset)
#   df     - the activity survey answers
#   dfs    - the activity survey answers from schools
df_map = pd.read_csv('Files/activities.csv', sep=';')
df = pd.read_csv('Files/activity_surveys.csv', sep=';')
dfs = pd.read_csv('Files/activity_surveys_schools.csv', sep=';')

In [5]:
# Export the survey answers for manual review of the open-ended comments.
# Rows are ordered by 'special_moment' in descending order before export.
df = df.sort_values(by='special_moment', ascending=False)
df.to_excel('Files/sunday_open.xlsx')

  force_unicode(url))


In [20]:
# Export the distinct e-mail addresses of survey respondents so the
# participation note can be sent to them (one file per survey).
#
# `header` is passed explicitly: the default of Series.to_csv changed between
# pandas versions (False in older releases, True in newer ones), so relying on
# it makes the file layout version-dependent. The stray warning text recorded
# under this cell appears to be the FutureWarning announcing that change.
# header=False keeps the older layout (no header row).
# NOTE(review): these files contain personal data (e-mail addresses); keep
# them out of version control.
sunday_emails = pd.Series(df['email'].unique())
sunday_emails.to_csv('email_epainos_sunday.csv', header=False)

school_emails = pd.Series(dfs['email'].unique())
school_emails.to_csv('email_epainos_schools.csv', header=False)

  """Entry point for launching an IPython kernel.
  


# School Participations

In [2]:
# Reload the organisation participations table (semicolon-separated).
dfp = pd.read_csv('Files/organization_participations.csv', sep=';')

In [3]:
# (rows, columns) of the participations table.
dfp.shape

(4770, 20)

In [4]:
# Keep only the participations whose sector is 'Σχολείο' (school).
is_school_sector = dfp['sector'] == 'Σχολείο'
df = dfp[is_school_sector]

# Activity analysis

In [15]:
# Reload the activities table; note that this rebinds the shared name `df`.
df = pd.read_csv('Files/activities.csv', sep=';')

In [16]:
# Reduce meetup_date to a timezone-naive, day-level string:
#   1. parse the raw values as datetimes,
#   2. drop any timezone info while keeping the wall-clock time,
#   3. format with '%x' (the locale's date representation).
# Values are handled one at a time, so a column holding mixed UTC offsets
# still works.
meetup_parsed = pd.to_datetime(df['meetup_date'])
df['meetup_date'] = [
    stamp.replace(tzinfo=None).strftime('%x') for stamp in meetup_parsed
]

In [17]:
# Keep only the activities with activity_category_id == 2.
# .copy() makes the result an independent frame, so the column assignment in
# the following cell writes to it directly instead of to a slice of the
# original (which raises SettingWithCopyWarning).
df = df[df['activity_category_id'] == 2].copy()

In [21]:
# Lower-case the activity names.
lowercase_names = df['name'].str.lower()
df['name'] = lowercase_names

# Participant estimate - schools

In [29]:
# Load the school survey answers; note that this rebinds the shared name `df`.
df = pd.read_csv('Files/activity_surveys_schools.csv', sep=';')

In [35]:
# Keep a single answer per e-mail address (the first occurrence is retained).
df = df.drop_duplicates(subset='email')

In [37]:
# Order the answers by the reported participant estimate, ascending.
df = df.sort_values(by='participation_estimate_number')

In [45]:
# Keep the answers whose participant estimate lies strictly between 10 and 600.
estimate = df['participation_estimate_number']
d = df[(estimate > 10) & (estimate < 600)]

In [46]:
# Summary statistics of the numeric columns of the range-filtered answers.
d.describe()

Unnamed: 0,id,participation_estimate_number
count,465.0,465.0
mean,450.275269,66.703226
std,195.960932,63.857314
min,1.0,11.0
25%,283.0,27.0
50%,428.0,45.0
75%,593.0,80.0
max,822.0,500.0


In [20]:
def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and a two-sided Student-t confidence interval.

    Parameters
    ----------
    data : array-like of numbers
        The sample; it is converted to a float NumPy array.
    confidence : float, default 0.95
        Confidence level of the interval.

    Returns
    -------
    tuple
        ``(mean, lower, upper)`` where ``lower``/``upper`` are the mean minus
        and plus the half-width ``sem * t.ppf((1 + confidence) / 2, n - 1)``.
    """
    values = np.array(data) * 1.0
    sample_size = len(values)
    center = np.mean(values)
    std_error = scipy.stats.sem(values)
    # Two-sided interval: take the upper-tail quantile with n - 1 degrees of freedom.
    upper_quantile = (1 + confidence) / 2.
    half_width = std_error * scipy.stats.t.ppf(upper_quantile, sample_size - 1)
    return center, center - half_width, center + half_width

In [73]:
# 95% confidence interval for the mean reported participant estimate.
# The last row of `df` is left out; `df` was sorted ascending by this column
# in an earlier cell, so that row is presumably the largest value (or a NaN)
# - TODO confirm.
# NOTE(review): this uses `df`, not the range-filtered frame `d` built above.
estimates_without_last = df['participation_estimate_number'].iloc[:-1]
mean_confidence_interval(estimates_without_last, confidence=0.95)

(57.451398135818906, 52.37634942383048, 62.52644684780733)

In [74]:
# 52.37 matches (truncated) the lower bound of the confidence interval shown
# in the previous output.
# NOTE(review): 856 is presumably the number of school events, but an earlier
# cell reports 862 - confirm which is right. The product looks like an
# estimate of total participants, so the 'Total events' label is probably
# misleading.
lower_bound_per_event = 52.37
event_count = 856
print('Total events', lower_bound_per_event * event_count)

Total events 44828.72


# Activity Analysis

In [216]:
# Reload the full activities table; note that this rebinds the shared name `df`.
df = pd.read_csv('Files/activities.csv', sep=';')

In [217]:
# Reduce meetup_date to a timezone-naive, day-level string:
# parse each value, drop any timezone info (keeping the wall-clock time),
# then format with '%x' (the locale's date representation).
# Values are handled one at a time, so a column holding mixed UTC offsets
# still works.
meetup_parsed = pd.to_datetime(df['meetup_date'])
df['meetup_date'] = [
    stamp.replace(tzinfo=None).strftime('%x') for stamp in meetup_parsed
]

In [219]:
# Number of events per day. meetup_date holds formatted strings at this point,
# so sort_index() orders them lexicographically rather than chronologically
# (the two coincide for mm/dd/yy strings only within a single year).
df['meetup_date'].value_counts().sort_index()

02/28/19      1
03/20/19      1
03/21/19      5
03/23/19      1
03/26/19      1
03/27/19      2
03/28/19      1
03/29/19      2
03/30/19      2
03/31/19      8
04/01/19     99
04/02/19     74
04/03/19     97
04/04/19    161
04/05/19    221
04/06/19     40
04/07/19    756
04/08/19     10
04/09/19      6
04/10/19     13
04/11/19      8
04/12/19     12
04/13/19     12
04/14/19     35
04/15/19      2
04/17/19      3
04/18/19      4
04/19/19      4
04/20/19      4
04/21/19      1
04/22/19      2
04/23/19      1
04/27/19      1
05/01/19      1
05/05/19      1
05/09/19      1
05/16/19      1
05/17/19      1
Name: meetup_date, dtype: int64

Dates:

1/4: 99
2/4: 74
3/4: 97
4/4: 161
5/4: 221
6/4: 40
7/4: 756
rest: 147

# Social media

In [222]:
# Load the Facebook Insights exports: two page-level files (dfs, dfs1) and
# two video-post files (dfv, dfv1).
dfs = pd.read_csv(
    "Files/Facebook Insights Data Export - Let's do it Greece - 2019-05-26.csv"
)
dfs1 = pd.read_csv(
    "Files/Facebook Insights Data Export - Let's do it Greece - 2019-05-26(1).csv"
)
dfv = pd.read_csv(
    "Files/Facebook Insights Data Export (Video Posts) - Let's do it Greece - 2019-05-26.csv"
)
dfv1 = pd.read_csv(
    "Files/Facebook Insights Data Export (Video Posts) - Let's do it Greece - 2019-05-26(1).csv"
)

In [223]:
# Pair every column name with the value found in the first data row, giving a
# two-level column header, then remove that row from the data.
# Not idempotent: run once per load (label 0 no longer exists afterwards).
second_header_level = dfs.iloc[0, :]
dfs.columns = [dfs.columns, second_header_level]
dfs = dfs.drop(index=0)

In [224]:
# Same header promotion for the remaining three exports: pair each column name
# with the value in the first data row, then drop that row in place.
# Not idempotent: run once per load.
for insights_frame in (dfs1, dfv1, dfv):
    insights_frame.columns = [insights_frame.columns, insights_frame.iloc[0, :]]
    insights_frame.drop(0, axis=0, inplace=True)

In [68]:
# Total lifetime video views, summed over both video-post exports.
views_column = 'Lifetime Total Video Views'
views_first_export = dfv[views_column].astype('int64').sum(axis=0)
views_second_export = dfv1[views_column].astype('int64').sum(axis=0)
views_first_export + views_second_export

Lifetime Total Video Views    125470
dtype: int64

In [69]:
# Sum of the daily total reach figures over both page-level exports.
reach_column = 'Daily Total Reach'
reach_first_export = dfs[reach_column].astype('int64').sum(axis=0)
reach_second_export = dfs1[reach_column].astype('int64').sum(axis=0)
reach_first_export + reach_second_export

Daily Total Reach    1425444
dtype: int64

In [58]:
# Sum of the daily viral impressions over both page-level exports.
viral_column = 'Daily Viral impressions'
viral_first_export = dfs[viral_column].astype('int64').sum(axis=0)
viral_second_export = dfs1[viral_column].astype('int64').sum(axis=0)
viral_first_export + viral_second_export

Daily Viral impressions    627993
dtype: int64

In [60]:
# Sum of the daily organic impressions over both page-level exports.
col = "Daily Organic impressions"
organic_first_export = dfs[col].astype('int64').sum(axis=0)
organic_second_export = dfs1[col].astype('int64').sum(axis=0)
organic_first_export + organic_second_export

Daily Organic impressions    1056052
dtype: int64

In [64]:
# Sum of the daily paid impressions over both page-level exports.
# Missing values are dropped before the integer conversion.
col = "Daily Paid Impressions"
paid_first_export = dfs[col].dropna().astype('int64').sum(axis=0)
paid_second_export = dfs1[col].dropna().astype('int64').sum(axis=0)
paid_first_export + paid_second_export

Daily Paid Impressions    827651
dtype: int64

In [175]:
# For every column of `df` whose name contains 'Impressions', sum its non-null
# values (cast to int64) and report one {column name: total} dict per column.
# NOTE(review): `df` is last bound in the visible cells to the activities
# table, which cannot produce the recorded output. This cell presumably relied
# on a Facebook insights frame with flat column names built in a cell that is
# no longer in the notebook - TODO restore or rebuild it before re-running.
[{s: df.loc[:,s].dropna().astype('int64').sum(axis=0)} for s in df.loc[:,df.columns.str.contains('Impressions')].columns]

[{'Daily Paid Impressions': 827651},
 {'Daily Total Impressions': 1898071},
 {'Daily Total Impressions of your posts': 1850922},
 {'Daily Viral Impressions Of Your Posts': 595313}]

# Divers groups

We were asked to extract data on the diving groups that took part in the national campaign.

In [4]:
# Reload the events (dfm) and participations (dfp) tables for this section.
dfm = pd.read_csv('Files/activities.csv', sep=';')
dfp = pd.read_csv('Files/organization_participations.csv', sep=';')

In [23]:
# Find organisations in the events table whose name contains any of the
# fragments below (case-insensitive; the fragments are joined into one regex
# alternation). Substring matching is broad, so false positives are expected.
divers_phrase = ['δυτ', 'div', 'υποθ']
divers = '|'.join(divers_phrase)
# na=False treats missing names as "no match" and yields a proper boolean
# mask, instead of patching an object-dtype mask afterwards with fillna.
is_divers_org = dfm["organization_name"].str.contains(divers, case=False, na=False)
d_map = dfm.loc[is_divers_org, 'organization_name']

In [26]:
# Find participations whose name contains any of the fragments below
# (case-insensitive; the fragments are joined into one regex alternation).
# Substring matching is broad, so false positives are expected.
divers_phrase = ['δυτ', 'div', 'υποθ']
divers = '|'.join(divers_phrase)
# na=False treats missing names as "no match" and yields a proper boolean
# mask, instead of patching an object-dtype mask afterwards with fillna.
is_divers_participant = dfp["name"].str.contains(divers, case=False, na=False)
d_part = dfp.loc[is_divers_participant, 'name']

In [42]:
# Build one table of candidate diver organisations from both sources.
# Each side is de-duplicated and tagged with where it came from; the
# participations column is named 'organization_name' so the two line up.
dm = d_map.to_frame().drop_duplicates().assign(source='activities')
dp = (
    d_part
    .to_frame(name='organization_name')
    .drop_duplicates()
    .assign(source='participations')
)

# Stack the two tables; the original row labels of both frames are kept.
df = pd.concat([dm, dp], axis=0)

In [None]:
# Remove manually identified false positives by index label.
# NOTE(review): `df` was built by concatenating two frames without resetting
# the index, so a label can occur once per source; drop() removes every row
# carrying a listed label - confirm that no wanted row shares a label.
# drop() raises KeyError if any listed label is absent from `df`.
id_drop = [579,1167,328,1832,3716,3772,3773,3774,3775,3776,3777,3899,3940,4391,4594,4140,1367,2103,3178] # false positives
df.drop(index = id_drop,inplace=True)

In [59]:
# Collapse names that appear in both sources (the last occurrence is kept),
# then discard the helper 'source' column.
df = (
    df
    .drop_duplicates(subset='organization_name', keep='last')
    .drop(labels=['source'], axis=1)
)