# This notebook applies ML techiques to the Eurobarometer
**The goal is to predict the probability of acceptance of asylum seekers based on survey results**


![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
Version for Colab at: https://drive.google.com/file/d/15ly8CpYJeu28jI_7ATWioIf7SLxGy1ly/view?usp=sharing


Data at: https://www.dropbox.com/sh/svwq65agr22e3wa/AAAMvhfwke_4pi5zmnZHZoKWa?dl=0

We import the libraries we need: Pandas, Numpy and Scikit-Learn
If we do not have any of them, the correct way to install them in a Jupyter Notebook is:

import sys

!{sys.execultable} -m pip install LIBRARY

In [1]:
import pandas as pd
import numpy as np
import pickle

We load our data from a CSV file

In [2]:
data_nov_2015 = pd.read_csv('data/GESIS/ZA6643_v3-1-0.csv', header=0, sep= ';', low_memory= False)
type(data_nov_2015)

pandas.core.frame.DataFrame

In [4]:
data_nov_2015

Unnamed: 0,studyno1,studyno2,doi,version,edition,survey,caseid,uniqid,serialid,tnscntry,...,w89,w90,w92,w94,w95,w97,w98,w99,w100,wex
0,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),2,710000002,31820,SHQIPERIA,...,0,0,",209874",0,0,1088357,0,0,",22022",3189500977
1,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),3,710000003,31821,SHQIPERIA,...,0,0,",135586",0,0,",703118",0,0,",14227",206053418
2,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),4,710000004,31822,SHQIPERIA,...,0,0,",176903",0,0,",917381",0,0,",185625",2688443848
3,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),5,710000005,31823,SHQIPERIA,...,0,0,",209874",0,0,1088357,0,0,",22022",3189500977
4,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),6,710000006,31824,SHQIPERIA,...,0,0,",135586",0,0,",703118",0,0,",14227",206053418
5,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),7,710000007,31825,SHQIPERIA,...,0,0,",209874",0,0,1088357,0,0,",22022",3189500977
6,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),8,710000008,31826,SHQIPERIA,...,0,0,",092294",0,0,",478614",0,0,",096844",1402609253
7,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),9,710000009,31827,SHQIPERIA,...,0,0,",135586",0,0,",703118",0,0,",14227",206053418
8,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),10,710000010,31828,SHQIPERIA,...,0,0,",209917",0,0,1088581,0,0,",220266",3190158203
9,GESIS STUDY ID ZA6643,GESIS STUDY ID ZA6643,doi:10.4232/1.12799,3.1.0 (2017-06-01),Archive pre-release (update),Eurobarometer 84.3 (November 2015),11,710000011,31829,SHQIPERIA,...,0,0,",091602",0,0,",475025",0,0,",096118",1392092896


In [5]:
#We have a DF of 32833 rows × 733 columns
#So we get a list of variables
list(data_nov_2015.columns.values)

['studyno1',
 'studyno2',
 'doi',
 'version',
 'edition',
 'survey',
 'caseid',
 'uniqid',
 'serialid',
 'tnscntry',
 'country',
 'isocntry',
 'split',
 'q1.1',
 'q1.2',
 'q1.3',
 'q1.4',
 'q1.5',
 'q1.6',
 'q1.7',
 'q1.8',
 'q1.9',
 'q1.10',
 'q1.11',
 'q1.12',
 'q1.13',
 'q1.14',
 'q1.15',
 'q1.16',
 'q1.17',
 'q1.18',
 'q1.19',
 'q1.20',
 'q1.21',
 'q1.22',
 'q1.23',
 'q1.24',
 'q1.25',
 'q1.26',
 'q1.27',
 'q1.28',
 'q1.29',
 'q1.30',
 'q1.31',
 'q1.32',
 'q1.33',
 'q1.34',
 'q1.35',
 'd70',
 'd71a_1',
 'd71a_2',
 'd71a_3',
 'd71b_1',
 'd71b_2',
 'd71b_3',
 'polintr',
 'qa1a_1',
 'qa1a_2',
 'qa1a_3',
 'qa1a_4',
 'qa1a_5',
 'qa1a_6',
 'qa1a_7',
 'qa1b_1',
 'qa1b_2',
 'qa1b_3',
 'qa1b_4',
 'qa1b_5',
 'qa1b_6',
 'qa1b_7',
 'qa2a_1',
 'qa2a_2',
 'qa2a_3',
 'qa2a_4',
 'qa2a_5',
 'qa2a_6',
 'qa2a_7',
 'qa2a_8',
 'qa2b_1',
 'qa2b_2',
 'qa2b_3',
 'qa2b_4',
 'qa2b_5',
 'qa2b_6',
 'qa2b_7',
 'qa3a.1',
 'qa3a.2',
 'qa3a.3',
 'qa3a.4',
 'qa3a.5',
 'qa3a.6',
 'qa3a.7',
 'qa3a.8',
 'qa3a.9',
 'q

## We load other data sets

In [3]:
data_may_2016 = pd.read_csv('data/GESIS/ZA6694_v1-1-0.csv', header=0, sep= ';', low_memory= False)
data_nov_2016 = pd.read_csv('data/GESIS/ZA6788_v1-3-0.csv', header=0, sep= ';', low_memory= False)
data_may_2017 = pd.read_csv('data/GESIS/ZA6863_v1-0-0.csv', header=0, sep= ';', low_memory= False)
data_nov_2017 = pd.read_csv('data/GESIS/ZA6928_v1-0-0.csv', header=0, sep= ';', low_memory= False)

In [7]:
#data_nov_2015.shape
#data_may_2016.shape
#data_nov_2016.shape
#data_may_2017.shape
#data_nov_2017.shape

In [4]:
#Filter and unify variables and labels
#Manually check that some variables may not exixt or exist with different label


#list(data_nov_2015.columns)
data_nov_2015 = data_nov_2015[['survey', 'uniqid', 'p1', 'tnscntry', 'd7', 'd8', 'd10', 'd11', 'd15a', 'd25', 'd40a', 'qd11_6', 'qd11_3', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']]
data_nov_2015.columns = ['survey', 'uniqid', 'date', 'country', 'marital_status', 'educational', 'gender', 'age', 'occupation', 'type_community', 'household_composition', 'support_refugees', 'support_migrants', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']
print('Shape data_nov_2015 =', data_nov_2015.shape)

Shape data_nov_2015 = (32833, 48)


In [5]:
#list(data_may_2016.columns)
#data_may_2016 does not have p1 (date of interview), but we can create it
data_may_2016['p1']='Saturday 21th May 2016'
data_may_2016 = data_may_2016[['survey', 'uniqid', 'p1', 'tnscntry', 'd7', 'd8', 'd10', 'd11', 'd15a', 'd25', 'd40a', 'qd4_5', 'qd4_2', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']]
data_may_2016.columns = ['survey', 'uniqid', 'date', 'country', 'marital_status', 'educational', 'gender', 'age', 'occupation', 'type_community', 'household_composition', 'support_refugees', 'support_migrants', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']
print('Shape data_may_2016 =', data_may_2016.shape)

Shape data_may_2016 = (32987, 48)


In [6]:
#list(data_nov_2016.columns)
data_nov_2016 = data_nov_2016[['survey', 'uniqid', 'p1', 'tnscntry', 'd7', 'd8', 'd10', 'd11', 'd15a', 'd25', 'd40a', 'qd9_5', 'qd9_2', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']]
data_nov_2016.columns = ['survey', 'uniqid', 'date', 'country', 'marital_status', 'educational', 'gender', 'age', 'occupation', 'type_community', 'household_composition', 'support_refugees', 'support_migrants', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']
print('Shape data_nov_2016 =', data_nov_2016.shape)

Shape data_nov_2016 = (32896, 48)


In [7]:
#list(data_may_2017.columns)
data_may_2017 = data_may_2017[['survey', 'uniqid', 'p1', 'tnscntry', 'd7', 'd8', 'd10', 'd11', 'd15a', 'd25', 'd40a', 'qd11_6', 'qd11_3', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']]
data_may_2017.columns = ['survey', 'uniqid', 'date', 'country', 'marital_status', 'educational', 'gender', 'age', 'occupation', 'type_community', 'household_composition', 'support_refugees', 'support_migrants', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']
print('Shape data_may_2017 =', data_may_2017.shape)

Shape data_may_2017 = (33180, 48)


In [8]:
#list(data_may_2017.columns)
data_nov_2017 = data_nov_2017[['survey', 'uniqid', 'p1', 'tnscntry', 'd7', 'd8', 'd10', 'd11', 'd15a', 'd25', 'd40a', 'qd9_4', 'qd9_1', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']]
data_nov_2017.columns = ['survey', 'uniqid', 'date', 'country', 'marital_status', 'educational', 'gender', 'age', 'occupation', 'type_community', 'household_composition', 'support_refugees', 'support_migrants', 'q1.1', 'q1.2', 'q1.3', 'q1.4', 'q1.5', 'q1.6', 'q1.7', 'q1.8', 'q1.9', 'q1.10', 'q1.11', 'q1.12', 'q1.13', 'q1.14', 'q1.15', 'q1.16', 'q1.17', 'q1.18', 'q1.19', 'q1.20', 'q1.21', 'q1.22', 'q1.23', 'q1.24', 'q1.25', 'q1.26', 'q1.27', 'q1.28', 'q1.29', 'q1.30', 'q1.31', 'q1.32', 'q1.33', 'q1.34', 'q1.35']
print('Shape data_nov_2017 =', data_nov_2017.shape)

Shape data_nov_2017 = (33193, 48)


## And join all datasets

In [9]:
#We have 5 datasets (13 features each) with a total number of records of:
len(data_nov_2015)+ len(data_may_2016) + len(data_nov_2016) + len(data_may_2017) + len(data_nov_2017)

165089

In [10]:
#we can joint the 5 datasets in one big dataset:
data = pd.concat([data_nov_2015,data_may_2016,data_nov_2016,data_may_2017,data_nov_2017])
data.shape

(165089, 48)

## Create the variable "native"

In [11]:
#0= Foreigner, 1= Native
data['native'] = np.where((data['country'] == 'BELGIQUE') & (data['q1.1'] == "Mentioned")|
                                   (data['country'] == 'DANMARK') & (data['q1.2'] == "Mentioned")|
                                   (data['country'] == 'DEUTSCHLAND OST') & (data['q1.3'] == "Mentioned")|
                                   (data['country'] == 'DEUTSCHLAND WEST') & (data['q1.3'] == "Mentioned")|
                                   (data['country'] == 'ELLADA') & (data['q1.4'] == "Mentioned")|
                                   (data['country'] == 'ESPANA') & (data['q1.5'] == "Mentioned")|
                                   (data['country'] == 'FRANCE') & (data['q1.6'] == "Mentioned")|
                                   (data['country'] == 'IRELAND') & (data['q1.7'] == "Mentioned")|
                                   (data['country'] == 'ITALIA') & (data['q1.8'] == "Mentioned")|
                                   (data['country'] == 'LUXEMBOURG') & (data['q1.9'] == "Mentioned")|
                                   (data['country'] == 'NEDERLAND') & (data['q1.10'] == "Mentioned")|
                                   (data['country'] == 'PORTUGAL') & (data['q1.11'] == "Mentioned") |
                                   (data['country'] == 'GREAT BRITAIN') & (data['q1.12'] == "Mentioned")|
                                   (data['country'] == 'ÖSTERREICH') & (data['q1.13'] == "Mentioned")|
                                   (data['country'] == 'SVERIGE') & (data['q1.14'] == "Mentioned")|
                                   (data['country'] == 'SUOMI') & (data['q1.15'] == "Mentioned")|
                                   (data['country'] == 'KYPROS') & (data['q1.16'] == "Mentioned")|
                                   (data['country'] == 'CESKA REPUBLIKA') & (data['q1.17'] == "Mentioned")|
                                   (data['country'] == 'EESTI') & (data['q1.18'] == "Mentioned")|
                                   (data['country'] == 'MAGYARORSZAG') & (data['q1.19'] == "Mentioned")|
                                   (data['country'] == 'LATVIA') & (data['q1.20'] == "Mentioned")|
                                   (data['country'] == 'LIETUVA') & (data['q1.21'] == "Mentioned")|
                                   (data['country'] == 'MALTA') & (data['q1.22'] == "Mentioned")|
                                   (data['country'] == 'POLSKA') & (data['q1.23'] == "Mentioned")|
                                   (data['country'] == 'SLOVENSKA REPUBLIC') & (data['q1.24'] == "Mentioned")|
                                   (data['country'] == 'SLOVENIJA') & (data['q1.25'] == "Mentioned")|
                                   (data['country'] == 'BALGARIJA') & (data['q1.26'] == "Mentioned")|
                                   (data['country'] == 'ROMANIA') & (data['q1.27'] == "Mentioned")|
                                   (data['country'] == 'HRVATSKA') & (data['q1.28'] == "Mentioned")|
                                   (data['country'] == 'TURKIYE') & (data['q1.29'] == "Mentioned")|
                                   (data['country'] == 'REPUBLIKA MAKEDONIJA') & (data['q1.30'] == "Mentioned")|
                                   (data['country'] == 'CRNA GORA') & (data['q1.31'] == "Mentioned")|
                                   (data['country'] == 'SRPSKI') & (data['q1.32'] == "Mentioned")|
                                   (data['country'] == 'SHQIPERIA') & (data['q1.33'] == "Mentioned"), 
                                   1, data['country'])

In [12]:
data['native'] = np.where((data['native'] == 'BELGIQUE') | 
                                   (data['native'] == 'DANMARK') | 
                                   (data['native'] == 'DEUTSCHLAND OST') | 
                                   (data['native'] == 'DEUTSCHLAND WEST') | 
                                   (data['native'] == 'ELLADA') | 
                                   (data['native'] == 'ESPANA') | 
                                   (data['native'] == 'FRANCE') | 
                                   (data['native'] == 'IRELAND') | 
                                   (data['native'] == 'ITALIA') | 
                                   (data['native'] == 'LUXEMBOURG') | 
                                   (data['native'] == 'NEDERLAND') | 
                                   (data['native'] == 'PORTUGAL') | 
                                   (data['native'] == 'GREAT BRITAIN') | 
                                   (data['native'] == 'ÖSTERREICH') | 
                                   (data['native'] == 'SVERIGE') | 
                                   (data['native'] == 'SUOMI') | 
                                   (data['native'] == 'KYPROS') | 
                                   (data['native'] == 'CESKA REPUBLIKA') | 
                                   (data['native'] == 'EESTI') | 
                                   (data['native'] == 'MAGYARORSZAG') | 
                                   (data['native'] == 'LATVIA') | 
                                   (data['native'] == 'LIETUVA') | 
                                   (data['native'] == 'MALTA') | 
                                   (data['native'] == 'POLSKA') | 
                                   (data['native'] == 'SLOVENSKA REPUBLIC') | 
                                   (data['native'] == 'SLOVENIJA') | 
                                   (data['native'] == 'BALGARIJA') | 
                                   (data['native'] == 'ROMANIA') | 
                                   (data['native'] == 'HRVATSKA') | 
                                   (data['native'] == 'TURKIYE') | 
                                   (data['native'] == 'REPUBLIKA MAKEDONIJA') | 
                                   (data['native'] == 'CRNA GORA') | 
                                   (data['native'] == 'SRPSKI') | 
                                   (data['native'] == 'SHQIPERIA'), 
                                   0, data['native'])

In [13]:
#No data enough for 'KUZEY KIBRIS TÜRK CUMHURIYETI', 'NORTHERN IRELAND'
data['native'] = np.where((data['native'] == 'KUZEY KIBRIS TÜRK CUMHURIYETI') | 
                                   (data['native'] == 'NORTHERN IRELAND'), 
                                   np.NaN, data['native'])

data['native'] = pd.to_numeric(data['native'], errors='coerce')
#data['native'].unique()
data['native'].describe()

count    161056.000000
mean          0.882414
std           0.322119
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: native, dtype: float64

### In order to get a smaller df we filter with the variables we need:

* survey = Eurobarometer Number [survey]
* uniqid = Unique ID [uniqid]
* p1 = Date of interview [date]
* tnscntry = Country [country]
* native = 0= Foreigner, 1= Native
* d7 = Marital Status [marital_status]
* d8 = Age education (Number of years of education) [educational]
* d10 = Gender [gender]
* d11 = Age [age]
* d15a = Occupation [occupation]
* d25 = Type of community [type_community]
* d40a = Household composition [household_compostion]
* qd11_6 = COUNTRY SHOULD HELP REFUGEES [support_refugees]
* qd11_3 = IMMIGRANTS CONTRIBUTE A LOT [support_migrants]

In [14]:
data = data[['survey', 'uniqid', 'date', 'country', 'native', 'marital_status', 'educational', 'gender', 'age', 'occupation', 'type_community', 'household_composition', 'support_refugees', 'support_migrants']]
data

Unnamed: 0,survey,uniqid,date,country,native,marital_status,educational,gender,age,occupation,type_community,household_composition,support_refugees,support_migrants
0,Eurobarometer 84.3 (November 2015),710000002,Saturday 7th November 2015,SHQIPERIA,1.0,(Re-)Married: children this marriage,14,Woman,47,"Responsible for ordinary shopping, etc.",Small/middle town,Six,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
1,Eurobarometer 84.3 (November 2015),710000003,Saturday 7th November 2015,SHQIPERIA,1.0,(Re-)Married: children this marriage,14,Man,52,Farmer,Rural area or village,Five,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
2,Eurobarometer 84.3 (November 2015),710000004,Saturday 7th November 2015,SHQIPERIA,1.0,Single liv w partner: childr this union,Still studying,Woman,23,Student,Small/middle town,One,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
3,Eurobarometer 84.3 (November 2015),710000005,Saturday 7th November 2015,SHQIPERIA,1.0,(Re-)Married: children this marriage,22,Woman,42,"Employed professional (employed doctor, etc.)",Small/middle town,Four,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
4,Eurobarometer 84.3 (November 2015),710000006,Saturday 7th November 2015,SHQIPERIA,1.0,Single liv w partner: childr this union,23,Man,43,"Employed professional (employed doctor, etc.)",Small/middle town,Three,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33188,Eurobarometer 88.3 (November 2017),710001485,"Thursday, 9th November 2017",SHQIPERIA,0.0,(Re-)Married: children this marriage,18,Man,62,Farmer,Rural area or village,Four,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
33189,Eurobarometer 88.3 (November 2017),710001486,"Thursday, 9th November 2017",SHQIPERIA,0.0,(Re-)Married: children this marriage,18,Man,46,"Employed position, travelling",Rural area or village,Four,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
33190,Eurobarometer 88.3 (November 2017),710001488,"Thursday, 9th November 2017",SHQIPERIA,0.0,(Re-)Married: children this marriage,18,Woman,37,"Responsible for ordinary shopping, etc.",Rural area or village,Four,Inap. (not 1 in eu28),Inap. (not 1 in eu28)
33191,Eurobarometer 88.3 (November 2017),710001490,"Thursday, 9th November 2017",SHQIPERIA,0.0,(Re-)Married: children this marriage,18,Man,58,Farmer,Rural area or village,Five,Inap. (not 1 in eu28),Inap. (not 1 in eu28)


In [15]:
data.shape

(165089, 14)

In [16]:
#make a copy of the df
data = data.copy()

## We check each variable

In [17]:
data.columns

Index(['survey', 'uniqid', 'date', 'country', 'native', 'marital_status',
       'educational', 'gender', 'age', 'occupation', 'type_community',
       'household_composition', 'support_refugees', 'support_migrants'],
      dtype='object')

In [18]:
data['survey'].unique()

array(['Eurobarometer 84.3 (November 2015)',
       'Eurobarometer 85.2 (May 2016)',
       'Eurobarometer 86.2 (November 2016)',
       'Eurobarometer 87.3 (May 2017)',
       'Eurobarometer 88.3 (November 2017)'], dtype=object)

In [19]:
data['uniqid'].describe()

count    1.650890e+05
mean     3.513673e+08
std      2.837433e+08
min      1.579478e+07
25%      1.000008e+08
50%      3.400031e+08
75%      4.300007e+08
max      1.000000e+09
Name: uniqid, dtype: float64

### Date

In [20]:
# Variable Date
data['date'].unique()

array(['Saturday 7th November 2015', 'Sunday 8th November 2015',
       'Monday 9th November 2015', 'Tuesday 10th November 2015',
       'Wednesday 11th November 2015', 'Thursday 12th November 2015',
       'Friday 13th November 2015', 'Saturday 14th November 2015',
       'Sunday 15th November 2015', 'Monday 16th November 2015',
       'Tuesday 17th November 2015', 'Saturday 21th May 2016',
       'Sunday, 13th November 2016', 'Monday, 14th November 2016',
       'Sunday, 6th November 2016', 'Tuesday, 8th November 2016',
       'Thursday, 10th November 2016', 'Friday, 11th November 2016',
       'Saturday, 12th November 2016', 'Saturday, 5th November 2016',
       'Wednesday, 9th November 2016', 'Monday, 7th November 2016',
       'Thursday, 3rd November 2016', 'Friday, 4th November 2016',
       'Monday, 22nd May 2017', 'Friday, 26th May 2017',
       'Sunday, 21th May 2017', 'Tuesday, 23rd May 2017',
       'Wednesday, 24th May 2017', 'Saturday, 20th May 2017',
       'Saturday, 27t

In [21]:
#Remove ordinals from the string
data.loc[:,'date'] = data.loc[:,'date'].replace('th', '', regex=True)
data.loc[:,'date'] = data.loc[:,'date'].replace(',', '', regex=True)
data.loc[:,'date'] = data.loc[:,'date'].replace('1st', '1', regex=True)
data.loc[:,'date'] = data.loc[:,'date'].replace('2nd', '2', regex=True)
data.loc[:,'date'] = data.loc[:,'date'].replace('3rd', '3', regex=True)
data['date'].unique()

array(['Saturday 7 November 2015', 'Sunday 8 November 2015',
       'Monday 9 November 2015', 'Tuesday 10 November 2015',
       'Wednesday 11 November 2015', 'Thursday 12 November 2015',
       'Friday 13 November 2015', 'Saturday 14 November 2015',
       'Sunday 15 November 2015', 'Monday 16 November 2015',
       'Tuesday 17 November 2015', 'Saturday 21 May 2016',
       'Sunday 13 November 2016', 'Monday 14 November 2016',
       'Sunday 6 November 2016', 'Tuesday 8 November 2016',
       'Thursday 10 November 2016', 'Friday 11 November 2016',
       'Saturday 12 November 2016', 'Saturday 5 November 2016',
       'Wednesday 9 November 2016', 'Monday 7 November 2016',
       'Thursday 3 November 2016', 'Friday 4 November 2016',
       'Monday 22 May 2017', 'Friday 26 May 2017', 'Sunday 21 May 2017',
       'Tuesday 23 May 2017', 'Wednesday 24 May 2017',
       'Saturday 20 May 2017', 'Saturday 27 May 2017',
       'Monday 29 May 2017', 'Thursday 25 May 2017', 'Sunday 28 May 2017',


In [22]:
#Transform the string to datetime format
from datetime import datetime
import locale
locale.setlocale(locale.LC_ALL, str('en_US.UTF-8'))
data['date'] = data['date'].apply(lambda x: datetime.strptime(x, '%A %d %B %Y'))

In [23]:
data['date'].unique()

array(['2015-11-07T00:00:00.000000000', '2015-11-08T00:00:00.000000000',
       '2015-11-09T00:00:00.000000000', '2015-11-10T00:00:00.000000000',
       '2015-11-11T00:00:00.000000000', '2015-11-12T00:00:00.000000000',
       '2015-11-13T00:00:00.000000000', '2015-11-14T00:00:00.000000000',
       '2015-11-15T00:00:00.000000000', '2015-11-16T00:00:00.000000000',
       '2015-11-17T00:00:00.000000000', '2016-05-21T00:00:00.000000000',
       '2016-11-13T00:00:00.000000000', '2016-11-14T00:00:00.000000000',
       '2016-11-06T00:00:00.000000000', '2016-11-08T00:00:00.000000000',
       '2016-11-10T00:00:00.000000000', '2016-11-11T00:00:00.000000000',
       '2016-11-12T00:00:00.000000000', '2016-11-05T00:00:00.000000000',
       '2016-11-09T00:00:00.000000000', '2016-11-07T00:00:00.000000000',
       '2016-11-03T00:00:00.000000000', '2016-11-04T00:00:00.000000000',
       '2017-05-22T00:00:00.000000000', '2017-05-26T00:00:00.000000000',
       '2017-05-21T00:00:00.000000000', '2017-05-23

In [24]:
data['date'].describe()

count                  165089
unique                     50
top       2016-05-21 00:00:00
freq                    32987
first     2015-11-07 00:00:00
last      2017-11-19 00:00:00
Name: date, dtype: object

### Country

In [25]:
#We have 36 countries
data['country'].unique()
#len(data_nov_2015['country'].unique())

array(['SHQIPERIA', 'ÖSTERREICH', 'BELGIQUE', 'BALGARIJA', 'KYPROS',
       'KUZEY KIBRIS TÜRK CUMHURIYETI', 'CESKA REPUBLIKA',
       'DEUTSCHLAND OST', 'DEUTSCHLAND WEST', 'DANMARK', 'EESTI',
       'ESPANA', 'SUOMI', 'FRANCE', 'GREAT BRITAIN', 'NORTHERN IRELAND',
       'ELLADA', 'HRVATSKA', 'MAGYARORSZAG', 'IRELAND', 'ITALIA',
       'LIETUVA', 'LUXEMBOURG', 'LATVIA', 'CRNA GORA',
       'REPUBLIKA MAKEDONIJA', 'MALTA', 'NEDERLAND', 'POLSKA', 'PORTUGAL',
       'ROMANIA', 'SRPSKI', 'SVERIGE', 'SLOVENIJA', 'SLOVENSKA REPUBLIC',
       'TURKIYE'], dtype=object)

In [26]:
data['native'].unique()

array([ 1.,  0., nan])

In [27]:
data['native'].describe()

count    161056.000000
mean          0.882414
std           0.322119
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: native, dtype: float64

### Marital Status

In [28]:
data['marital_status'].unique()

array(['(Re-)Married: children this marriage',
       'Single liv w partner: childr this union',
       'Single: without children', '(Re-)Married: without children',
       'Refusal (SPONT.)', 'Single: with children',
       'Single liv w partner: without children',
       '(Re-)Married: children prev marriage',
       'Divorced/Separated: without children', 'Widow: with children',
       '(Re-)Married: children this/prev marriage',
       'Divorced/Separated: with children',
       'Single liv w partner: childr prev union',
       'Widow: without children', 'Other (SPONT.)',
       'Single liv w partner: childr this/prev union'], dtype=object)

In [29]:
#We simplifly this variable
data['marital_status'] = data['marital_status'].replace('Single: without children', 'Single')
data['marital_status'] = data['marital_status'].replace('(Re-)Married: children this marriage', 'Married')
data['marital_status'] = data['marital_status'].replace('Widow: with children', 'Widow')
data['marital_status'] = data['marital_status'].replace('Single liv w partner: childr this union', 'Partnership')
data['marital_status'] = data['marital_status'].replace('Single liv w partner: without children', 'Partnership')
data['marital_status'] = data['marital_status'].replace('Divorced/Separated: without children', 'Divorced/Separated')
data['marital_status'] = data['marital_status'].replace('(Re-)Married: without children', 'Married')
data['marital_status'] = data['marital_status'].replace('Widow: without children', 'Widow')
data['marital_status'] = data['marital_status'].replace('Divorced/Separated: with children', 'Divorced/Separated')
data['marital_status'] = data['marital_status'].replace('(Re-)Married: children prev marriage', 'Married')
data['marital_status'] = data['marital_status'].replace('Single liv w partner: childr this/prev union', 'Partnership')
data['marital_status'] = data['marital_status'].replace('Single: with children', 'Single')
data['marital_status'] = data['marital_status'].replace('Single liv w partner: childr prev union', 'Single')
data['marital_status'] = data['marital_status'].replace('(Re-)Married: children this/prev marriage', 'Married')
data['marital_status'] = data['marital_status'].replace('Other (SPONT.)', np.NaN)
data['marital_status'] = data['marital_status'].replace('Refusal (SPONT.)', np.NaN)
data['marital_status'].unique()

array(['Married', 'Partnership', 'Single', nan, 'Divorced/Separated',
       'Widow'], dtype=object)

### Age

In [30]:
data['age'].unique()

array(['47', '52', '23', '42', '43', '41', '38', '48', '26', '28', '27',
       '22', '50', '18', '51', '45', '19', '61', '34', '54', '40', '31',
       '63', '37', '59', '44', '55', '62', '68', '58', '65', '20', '56',
       '35', '53', '60', '17', '49', '36', '32', '25', '24', '30', '29',
       '33', '69', '73', '39', '67', '46', '57', '21', '64', '70', '66',
       '15 years', '16', '71', '72', '76', '74', '88', '80', '77', '92',
       '78', '75', '79', '82', '85', '83', '89', '90', '81', '84', '86',
       '95', '93', '87', '91', '94', '96',
       '99 years (and older) [NOT DOCUMENTED]', '97', '98 years',
       '99 years (and older)'], dtype=object)

In [31]:
data['age'] = data['age'].replace('15 years', '15')
data['age'] = data['age'].replace(15, '15')
data['age'] = data['age'].replace(99, '99')
data['age'] = data['age'].replace(98, '98')
data['age'] = data['age'].replace('99 years (and older) [NOT DOCUMENTED]', np.nan)
data['age'] = data['age'].replace('99 years (and older)', '99')
data['age'] = data['age'].replace('98 years', '98')
#Transform this variable tu numbers
data['age'] = pd.to_numeric(data['age'], errors='coerce')
#data['age'] = data['age'].convert_objects(convert_numeric=True)

data['age'].describe()

count    165066.000000
mean         49.608732
std          18.242549
min          15.000000
25%          35.000000
50%          50.000000
75%          64.000000
max          99.000000
Name: age, dtype: float64

In [32]:
#Recode Age to levels (similar to Census)
# 0 = under 15 years
# 1 = 15 to 29 years
# 2 = 30 to 49 years
# 3 = 50 to 64 years
# 4 = 65 to 84 years
# 5 = 85 years and over

data['age'] = np.where(data['age'].between(0,14), 0, data['age'])
data['age'] = np.where(data['age'].between(15,29), 1, data['age'])
data['age'] = np.where(data['age'].between(30,49), 2, data['age'])
data['age'] = np.where(data['age'].between(50,64), 3, data['age'])
data['age'] = np.where(data['age'].between(65,84), 4, data['age'])
data['age'] = np.where(data['age'].between(85,120), 5, data['age'])


data['age'].unique()

array([ 2.,  3.,  1.,  4.,  5., nan])

### Educational

In [33]:
data['educational'].unique()

array(['14', 'Still studying', '22', '23', '24', '18', 'DK', '25', '19',
       '21', 'Refusal', 'No full-time education', '16', '20', '28', '26',
       '30', '5', '15', '13', '11', '12', '10', '29', '17', '27', '35',
       '31', '48', '50', '38', '2 years', '37', '36', '34', '49', '33',
       '40', '32', '52', '39', '3', '41', '8', '9', '54', '45', '64',
       '42', '60', '58', '57', '67', '43', '55', '47', '51', '44', '56',
       '61', '7', '46', '63', '62', '53', '77 years', '59', '4', '6',
       '66', '78 years', '65', '68', '70', '69', '87 years', '79',
       '80 years', '73', '71', '75 years'], dtype=object)

In [34]:
data['educational'] = data['educational'].replace('0', np.NaN)
data['educational'] = data['educational'].replace('97', np.NaN)
data['educational'] = data['educational'].replace('98', np.NaN)
data['educational'] = data['educational'].replace('99', np.NaN)
data['educational'] = data['educational'].replace('Refusal', np.NaN)
data['educational'] = data['educational'].replace('No full-time education', np.NaN)
data['educational'] = data['educational'].replace('2 years', 2)
data['educational'] = data['educational'].replace('80 years', 80)
data['educational'] = data['educational'].replace('78 years', 78)
data['educational'] = data['educational'].replace('87 years', 87)
data['educational'] = data['educational'].replace('77 years', 77)
data['educational'] = data['educational'].replace('75 years', 75)
data['educational'] = data['educational'].replace('DK', np.NaN)
data['educational'] = data['educational'].replace('Still studying', np.NaN)
#Transform this variable tu numbers
data['educational'] = pd.to_numeric(data['educational'], errors='coerce')
#data['educational'].unique()
data['educational'].describe()

count    147880.000000
mean         19.309758
std           5.021245
min           2.000000
25%          17.000000
50%          18.000000
75%          22.000000
max          87.000000
Name: educational, dtype: float64

In [35]:
#Recode educational to levels (similar to Census)
#0. No formal education	<10
#1. ISCED Level 1. Primary education	10-12
#2. ISCED Level 2. Lower secondary education	13-15
#3. ISCED Level 3. Upper secondary education	16-18
#4. ISCED Level 4. Post secondary non-tertiary education, ISCED Level 5. First stage of tertiary education, ISCED Level 6. Second Stage of tertiary education	>18 
data['educational'] = np.where(data['educational'].between(0,9), 0, data['educational'])
data['educational'] = np.where(data['educational'].between(10,12), 1, data['educational'])
data['educational'] = np.where(data['educational'].between(13,15), 2, data['educational'])
data['educational'] = np.where(data['educational'].between(16,18), 3, data['educational'])
data['educational'] = np.where(data['educational'].between(18,100), 4, data['educational'])
data['educational'].unique()

array([ 2., nan,  4.,  3.,  0.,  1.])

In [36]:
data['educational'].describe()

count    147880.000000
mean          3.244306
std           0.856819
min           0.000000
25%           3.000000
50%           3.000000
75%           4.000000
max           4.000000
Name: educational, dtype: float64

### Gender

In [37]:
data['gender'].unique()

array(['Woman', 'Man'], dtype=object)

In [38]:
data['gender'].value_counts()

Woman    88557
Man      76532
Name: gender, dtype: int64

### Occupation

In [39]:
data['occupation'].unique()


array(['Responsible for ordinary shopping, etc.', 'Farmer', 'Student',
       'Employed professional (employed doctor, etc.)',
       'Middle management, etc.', 'Employed position, travelling',
       'Employed position, at desk',
       'Unemployed, temporarily not working', 'Skilled manual worker',
       'General management, etc.', 'Owner of a shop, craftsmen, etc.',
       'Business proprietors, etc.', 'Retired, unable to work',
       'Unskilled manual worker, etc.', 'Professional (lawyer, etc.)',
       'Employed position, service job', 'Supervisor', 'Fisherman'],
      dtype=object)

In [40]:
data['occupation'] = data['occupation'].replace('Student', 'Not active')
data['occupation'] = data['occupation'].replace('Employed position, at desk', 'Employed')
data['occupation'] = data['occupation'].replace('Unskilled manual worker, etc.', 'Employed')
data['occupation'] = data['occupation'].replace('Retired, unable to work', 'Not active')
data['occupation'] = data['occupation'].replace('Employed position, service job', 'Employed')
data['occupation'] = data['occupation'].replace('Unemployed, temporarily not working', 'Unemployed')
data['occupation'] = data['occupation'].replace('Owner of a shop, craftsmen, etc.', 'Employed')
data['occupation'] = data['occupation'].replace('Middle management, etc.', 'Employed')
data['occupation'] = data['occupation'].replace('Skilled manual worker', 'Employed')
data['occupation'] = data['occupation'].replace('Employed professional (employed doctor, etc.)', 'Employed')
data['occupation'] = data['occupation'].replace('General management, etc.', 'Employed')
data['occupation'] = data['occupation'].replace('Employed position, travelling', 'Employed')
data['occupation'] = data['occupation'].replace('Business proprietors, etc.', 'Employed')
data['occupation'] = data['occupation'].replace('Supervisor', 'Employed')
data['occupation'] = data['occupation'].replace('Professional (lawyer, etc.)', 'Employed')
data['occupation'] = data['occupation'].replace('Responsible for ordinary shopping, etc.', 'Employed')
data['occupation'] = data['occupation'].replace('Farmer', 'Employed')
data['occupation'] = data['occupation'].replace('Fisherman', 'Employed')
data['occupation'].unique()

array(['Employed', 'Not active', 'Unemployed'], dtype=object)

In [41]:
data['occupation'].value_counts()

Employed      91303
Not active    60081
Unemployed    13705
Name: occupation, dtype: int64

### Type of community

In [42]:
data['type_community'].unique()

array(['Small/middle town', 'Rural area or village', 'Large town', 'DK'],
      dtype=object)

In [43]:
data['type_community'] = data['type_community'].replace('DK', np.NaN)
data['type_community'].unique()

array(['Small/middle town', 'Rural area or village', 'Large town', nan],
      dtype=object)

### Household composition

In [44]:
data['household_composition'].unique()

array(['Six', 'Five', 'One', 'Four', 'Three', 'Two', 'Seven', 'Eight',
       'Nine', 'Eighteen', 'DK/NA', 'Twenty', 'Ten', 'Sixteen', 'Twelve',
       'Thirteen', 'Fifteen', 'Fourteen', 'Nineteen', 'Eleven'],
      dtype=object)

In [45]:
data['household_composition'] = data['household_composition'].replace('DK/NA', np.NaN)
data['household_composition'] = data['household_composition'].replace('One', 1)
data['household_composition'] = data['household_composition'].replace('Two', 2)
data['household_composition'] = data['household_composition'].replace('Three', 3)
data['household_composition'] = data['household_composition'].replace('Four', 4)
data['household_composition'] = data['household_composition'].replace('Five', 5)
data['household_composition'] = data['household_composition'].replace('Six', 6)
data['household_composition'] = data['household_composition'].replace('Seven', 7)
data['household_composition'] = data['household_composition'].replace('Eight', 8)
data['household_composition'] = data['household_composition'].replace('Nine', 9)
data['household_composition'] = data['household_composition'].replace('Ten', 10)
data['household_composition'] = data['household_composition'].replace('Eleven', 11)
data['household_composition'] = data['household_composition'].replace('Twelve', 12)
data['household_composition'] = data['household_composition'].replace('Thirteen', 13)
data['household_composition'] = data['household_composition'].replace('Fourteen', 14)
data['household_composition'] = data['household_composition'].replace('Fifteen', 15)
data['household_composition'] = data['household_composition'].replace('Sixteen', 16)
data['household_composition'] = data['household_composition'].replace('Seventeen', 17)
data['household_composition'] = data['household_composition'].replace('Eighteen', 18)
data['household_composition'] = data['household_composition'].replace('Nineteen', 19)
data['household_composition'] = data['household_composition'].replace('Twenty', 20)
data['household_composition'] = pd.to_numeric(data['household_composition'], errors='coerce')
data['household_composition'].describe()

count    165051.000000
mean          2.330528
std           1.161110
min           1.000000
25%           2.000000
50%           2.000000
75%           3.000000
max          20.000000
Name: household_composition, dtype: float64

In [46]:
#Recode household_composition to levels (similar to Census)
# 0 = 1 person
# 1 = 2 persons
# 2 = 3 to 5 persons
# 3 = 6 and more persons

data['household_composition'] = np.where(data['household_composition'].between(0,1), 0, data['household_composition'])
data['household_composition'] = np.where(data['household_composition'].between(2,2), 1, data['household_composition'])
data['household_composition'] = np.where(data['household_composition'].between(3,5), 2, data['household_composition'])
data['household_composition'] = np.where(data['household_composition'].between(6,30), 3, data['household_composition'])
data['household_composition'].unique()



array([ 3.,  2.,  0.,  1., nan])

### Support Refugees
STATEMENTS: COUNTRY SHOULD HELP REFUGEES {1, Totally agree}...{4, Totally disagree}

#### Level of rejection 
#### 1 = Not Support, 0 = Support

In [47]:
data['support_refugees'].unique()

array(['Inap. (not 1 in eu28)', 'Tend to agree', 'Totally agree',
       'Tend to disagree', 'Totally disagree', 'DK',
       'Inap. (MK ME RS AL TR in isocntry)'], dtype=object)

In [48]:
data['support_refugees'] = data['support_refugees'].replace('Totally agree', 0)
data['support_refugees'] = data['support_refugees'].replace('Tend to agree', 0)
data['support_refugees'] = data['support_refugees'].replace('Tend to disagree', 1)
data['support_refugees'] = data['support_refugees'].replace('Totally disagree', 1)
data['support_refugees'] = data['support_refugees'].replace('Inap. (not 1 in eu28)', np.NaN)
data['support_refugees'] = data['support_refugees'].replace('Inap. (MK ME RS AL TR in isocntry)', np.NaN)
data['support_refugees'] = data['support_refugees'].replace('DK', np.NaN)
data['support_refugees'] = pd.to_numeric(data['support_refugees'], errors='coerce')
data['support_refugees'].unique()

array([nan,  0.,  1.])

### Support Migrants
STATEMENTS: IMMIGRANTS CONTRIBUTE A LOT {1, Totally agree}...{4, Totally disagree}
#### Level of rejection 
#### 1 = Not Support, 0 = Support

In [49]:
data['support_migrants'].unique()

array(['Inap. (not 1 in eu28)', 'Tend to disagree', 'Totally agree',
       'Tend to agree', 'Totally disagree', 'DK',
       'Inap. (MK ME RS AL TR in isocntry)'], dtype=object)

In [50]:
data['support_migrants'] = data['support_migrants'].replace('Totally agree', 0)
data['support_migrants'] = data['support_migrants'].replace('Tend to agree', 0)
data['support_migrants'] = data['support_migrants'].replace('Tend to disagree', 1)
data['support_migrants'] = data['support_migrants'].replace('Totally disagree', 1)
data['support_migrants'] = data['support_migrants'].replace('Inap. (not 1 in eu28)', np.NaN)
data['support_migrants'] = data['support_migrants'].replace('Inap. (MK ME RS AL TR in isocntry)', np.NaN)
data['support_migrants'] = data['support_migrants'].replace('DK', np.NaN)
data['support_migrants'] = pd.to_numeric(data['support_migrants'], errors='coerce')
data['support_migrants'].unique()

array([nan,  1.,  0.])

In [51]:
data.describe()

Unnamed: 0,uniqid,native,educational,age,household_composition,support_refugees,support_migrants
count,165089.0,161056.0,147880.0,165066.0,165051.0,131577.0,130376.0
mean,351367300.0,0.882414,3.244306,2.598349,1.117527,0.34655,0.589388
std,283743300.0,0.322119,0.856819,1.069912,0.752643,0.475873,0.491947
min,15794780.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,100000800.0,1.0,3.0,2.0,1.0,0.0,0.0
50%,340003100.0,1.0,3.0,3.0,1.0,0.0,1.0
75%,430000700.0,1.0,4.0,3.0,2.0,1.0,1.0
max,1000000000.0,1.0,4.0,5.0,3.0,1.0,1.0


## Drop missing values

In [52]:
#Check missing values in each variable
print('Missing values:\n', data.isnull().sum())

Missing values:
 survey                       0
uniqid                       0
date                         0
country                      0
native                    4033
marital_status            1972
educational              17209
gender                       0
age                         23
occupation                   0
type_community              92
household_composition       38
support_refugees         33512
support_migrants         34713
dtype: int64


In [53]:
wonan=len(data)
print('Total records: ', wonan)

Total records:  165089


In [54]:
#Drop all missing valued
data = data.dropna()

In [55]:
print('Total records without missing values', len(data))

Total records without missing values 112837


In [56]:
print('Total deteled records:', wonan-len(data))

Total deteled records: 52252


In [57]:
#There were 6 countries that did not have data on our target variables, so we remain with less countries:
len(data['country'].unique())

29

In [58]:
data.describe()

Unnamed: 0,uniqid,native,educational,age,household_composition,support_refugees,support_migrants
count,112837.0,112837.0,112837.0,112837.0,112837.0,112837.0,112837.0
mean,296642900.0,0.975815,3.280183,2.787233,1.010139,0.359253,0.590196
std,271407100.0,0.153625,0.829606,1.001647,0.714565,0.479784,0.491799
min,15794780.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,80002730.0,1.0,3.0,2.0,1.0,0.0,0.0
50%,310002300.0,1.0,3.0,3.0,1.0,0.0,1.0
75%,390002400.0,1.0,4.0,4.0,1.0,1.0,1.0
max,1000000000.0,1.0,4.0,5.0,3.0,1.0,1.0


### Matrix
Most of the algortithms will work with numbers instead of strings. In fact, they will need matrix of numbers instead of dataframes.

In [59]:
#We can get an index of features:
features = [data.columns]
features = pd.DataFrame(features).T
features

Unnamed: 0,0
0,survey
1,uniqid
2,date
3,country
4,native
5,marital_status
6,educational
7,gender
8,age
9,occupation


In [60]:
#And confirm many of them are not simple integers
data.dtypes

survey                           object
uniqid                            int64
date                     datetime64[ns]
country                          object
native                          float64
marital_status                   object
educational                     float64
gender                           object
age                             float64
occupation                       object
type_community                   object
household_composition           float64
support_refugees                float64
support_migrants                float64
dtype: object

In [61]:
#First, we transform categorical values into dummies
data_matrix = pd.get_dummies(data, columns=['country', 'marital_status', 'gender', 'occupation', 'type_community'])

In [62]:
#And drop non-relevant columns for the matrix
del data_matrix['survey']
del data_matrix['uniqid']
del data_matrix['date']
data_matrix.columns

Index(['native', 'educational', 'age', 'household_composition',
       'support_refugees', 'support_migrants', 'country_BALGARIJA',
       'country_BELGIQUE', 'country_CESKA REPUBLIKA', 'country_DANMARK',
       'country_DEUTSCHLAND OST', 'country_DEUTSCHLAND WEST', 'country_EESTI',
       'country_ELLADA', 'country_ESPANA', 'country_FRANCE',
       'country_GREAT BRITAIN', 'country_HRVATSKA', 'country_IRELAND',
       'country_ITALIA', 'country_KYPROS', 'country_LATVIA', 'country_LIETUVA',
       'country_LUXEMBOURG', 'country_MAGYARORSZAG', 'country_MALTA',
       'country_NEDERLAND', 'country_POLSKA', 'country_PORTUGAL',
       'country_ROMANIA', 'country_SLOVENIJA', 'country_SLOVENSKA REPUBLIC',
       'country_SUOMI', 'country_SVERIGE', 'country_ÖSTERREICH',
       'marital_status_Divorced/Separated', 'marital_status_Married',
       'marital_status_Partnership', 'marital_status_Single',
       'marital_status_Widow', 'gender_Man', 'gender_Woman',
       'occupation_Employed', 'oc

In [63]:
data_matrix.dtypes

native                                  float64
educational                             float64
age                                     float64
household_composition                   float64
support_refugees                        float64
support_migrants                        float64
country_BALGARIJA                         uint8
country_BELGIQUE                          uint8
country_CESKA REPUBLIKA                   uint8
country_DANMARK                           uint8
country_DEUTSCHLAND OST                   uint8
country_DEUTSCHLAND WEST                  uint8
country_EESTI                             uint8
country_ELLADA                            uint8
country_ESPANA                            uint8
country_FRANCE                            uint8
country_GREAT BRITAIN                     uint8
country_HRVATSKA                          uint8
country_IRELAND                           uint8
country_ITALIA                            uint8
country_KYPROS                          

In [64]:
#Get an index of features in the matrix:
features_matrix = [data_matrix.columns]
features_matrix = pd.DataFrame(features_matrix).T
features_matrix

Unnamed: 0,0
0,native
1,educational
2,age
3,household_composition
4,support_refugees
5,support_migrants
6,country_BALGARIJA
7,country_BELGIQUE
8,country_CESKA REPUBLIKA
9,country_DANMARK


In [65]:
#Transform to matrix: ndarray
data_matrix = data_matrix.as_matrix()
type(data_matrix)

  


numpy.ndarray

In [66]:
#First record:
data_matrix[0]

array([1., 4., 3., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.])

In [67]:
data_matrix.shape

(112837, 48)

In [68]:
data_matrix.dtype

dtype('float64')

## Modeling data with ML
Once we have a matrix, we want to use machine learning to model the relationship between our target feature (Y) and our descriptive features (X). We can use different ML algorithms to model this relationship. We will use this model to predict the value of Y based on X.

#### Predicting support for refugees

In [69]:
#We create vectors for Y and X:
target_feature = data_matrix[:, 4]
descriptive_features = data_matrix[:, [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]]
Y = target_feature
X = descriptive_features

In [70]:
#In average the level of rejection towards refugges was 35,8%
Y.mean()

0.3592527273855207

In [71]:
#We estimate the average for each of the descriptive features (97,6% native, 3.3 on level of eduation (0-4), 53 years old, 2 family members, 3.5% from Bulgaria, etc.)
X.mean(axis=0)

array([0.97581467, 3.28018292, 2.78723291, 1.01013852, 0.03561775,
       0.03788651, 0.03840939, 0.03598997, 0.02026817, 0.03739908,
       0.0353696 , 0.03951718, 0.03410229, 0.03651285, 0.03817897,
       0.03742567, 0.03730159, 0.0342175 , 0.01880589, 0.03641536,
       0.03632674, 0.01838936, 0.04079336, 0.01885906, 0.03896771,
       0.0319576 , 0.03676099, 0.03497966, 0.03685848, 0.03739022,
       0.03750543, 0.04129851, 0.03649512, 0.08638124, 0.56677331,
       0.10550617, 0.1371979 , 0.10414137, 0.4570398 , 0.5429602 ,
       0.58370924, 0.33948084, 0.07680991, 0.26725276, 0.31342556,
       0.41932168])

### Training and testing datasets
We randomly devide the data into two sets: 1 training dataset (data to train the model) and 1 testing dataset (data to test the predictions).

In [72]:
#We create 4 new ndarray objects to work with
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
X_train[2] #Third record as example

array([1., 3., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.])

### We standarize the descriptive features

In [73]:
#We create 2 new re-scaled objects from 0 to 1 (the reference of the scaler is X_train in order to compare)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [74]:
#Save the sc
pickle.dump(sc, open( "models/sc.pickle", "wb" ) )

In [75]:
X_train_std[2] #Third record as example

array([ 0.15503879, -0.3379366 , -0.78703149,  1.38128147, -0.19163879,
       -0.19990123, -0.20085825, -0.19453137, -0.14443175, -0.19766445,
       -0.19117677, -0.20417798, -0.1879875 , -0.19354782, -0.19897469,
       -0.1987683 , -0.19613805, -0.1894266 , -0.1394657 , -0.19330127,
       -0.19439112, -0.13718416,  4.87205547, -0.13823419, -0.2006535 ,
       -0.18157578, -0.19404013, -0.18748157, -0.19467153, -0.19776816,
       -0.1969372 , -0.20811604, -0.1947766 ,  3.25185942, -1.14174235,
       -0.34347625, -0.39918013, -0.34204634, -0.91950369,  0.91950369,
        0.846136  , -0.71780333, -0.28922117, -0.603771  , -0.67682707,
        1.17834237])

### Logistic regression

In [78]:
#Create a model based on data
from sklearn.linear_model import LogisticRegression
#El parámetro C es inverso a la regularización del modelo: Subir C es bajar la fuerza de la regularización y viceversa.
#Entre más se regularice (baje C) más disminuyen los pesos en el modelo
#Con C=1000.0 me da una precisión de 98%, con C=1.0 baja a 80%, con C=10000 sigue dando 98%...
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, Y_train)

LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [79]:
#Based on the model we make predictions with the testset
Y_pred = lr.predict(X_test_std)
Y_pred

array([ 0.,  1.,  0., ...,  0.,  0.,  0.])

In [80]:
#We can estimate the accuracy of these predictions
from sklearn.metrics import accuracy_score
print('Number of predictions: %d' % len(Y_test))
print('Number of misclassifications: %d' % (Y_test != Y_pred).sum())
print('Accuracy: %.2f' % accuracy_score(Y_test, Y_pred))

Number of predictions: 33852
Number of misclassifications: 9397
Accuracy: 0.72


In [81]:
#Get the coeficients of the regression for interpretation
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)
print('Coefficients: \n', lr.coef_)
coef = lr.coef_
print(type(coef))

Coefficients: 
 [[ 0.03 -0.25 -0.11 -0.02  0.33 -0.    0.37 -0.21 -0.09 -0.26  0.17 -0.09
  -0.26  0.04 -0.15 -0.05 -0.18  0.14 -0.13  0.22  0.13 -0.12  0.31 -0.12
  -0.26  0.07 -0.11  0.15  0.09  0.28 -0.05 -0.34 -0.01  0.04 -0.03  0.
  -0.01  0.02  0.01 -0.01 -0.03 -0.01  0.07 -0.03  0.04 -0.02]]
<class 'numpy.ndarray'>


In [82]:
#Compare predited valued with real values
compare_lg = pd.DataFrame({'Predicted value':Y_pred, 'Real value':Y_test})
compare_lg

Unnamed: 0,Predicted value,Real value
0,0.0,0.0
1,1.0,0.0
2,0.0,0.0
3,0.0,0.0
4,1.0,1.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,1.0,1.0


In [83]:
#We can predict probabilities of cases
#Record 4 has 62% probability of supporting refugees. We chech that the prediction was correct
case_4 = np.reshape(X_test_std[5], (1, -1)) #Transform to 2d array
lr.predict_proba(case_4)

array([[ 0.63,  0.37]])

In [84]:
#Save the model
pickle.dump(lr, open( "models/lr.pickle", "wb" ) )

### Support Vector Machines (SVM)

In [82]:
#Create a model based on data
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, probability=True, random_state=0) #Activamos este parámetro si queremos probabilidades probability= True

In [104]:
svm.fit(X_train_std, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False)

In [105]:
Y_pred = svm.predict(X_test_std)
Y_pred

array([ 0.,  1.,  0., ...,  0.,  0.,  0.])

In [106]:
#Save the model
pickle.dump(svm, open( "models/svm.pickle", "wb" ) )

In [107]:
#We can estimate the accuracy of these predictions
print('Number of predictions: %d' % len(Y_test))
print('Number of misclassifications: %d' % (Y_test != Y_pred) .sum())
print('Accuracy: %.2f' % accuracy_score(Y_test, Y_pred))

Number of predictions: 33852
Number of misclassifications: 9504
Accuracy: 0.72


In [108]:
#Compare predited valued with real values
compare_svm = pd.DataFrame({'Predicted value':Y_pred, 'Real value':Y_test})
compare_svm

Unnamed: 0,Predicted value,Real value
0,0.0,0.0
1,1.0,0.0
2,0.0,0.0
3,0.0,0.0
4,1.0,1.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,1.0,1.0


In [77]:
#ML SVM with Kernel rbf and gamma=100')
from sklearn.svm import SVC
svm2 = SVC(kernel='rbf', random_state=0, gamma=100, C=1.0, probability=True)

In [78]:
svm2.fit(X_train_std, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=100, kernel='rbf',
    max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [79]:
Y_pred = svm2.predict(X_test_std)
Y_pred

array([0., 1., 0., ..., 0., 0., 0.])

In [81]:
#We can estimate the accuracy of these predictions
print('Number of predictions: %d' % len(Y_test))
print('Number of misclassifications: %d' % (Y_test != Y_pred) .sum())
print('Accuracy: %.2f' % accuracy_score(Y_test, Y_pred))

Number of predictions: 33852
Number of misclassifications: 10506
Accuracy: 0.69


In [82]:
#We can also estimate precision, recall, F-score and AUC-ROC
print("precision, recall, F-score:", precision_recall_fscore_support(Y_test, Y_pred, average='macro'))
print("AUC_ROC", roc_auc_score(Y_test, Y_pred))

precision, recall, F-score: (0.660856738050883, 0.6271883458705778, 0.6308748063709599, None)
AUC_ROC 0.6271883458705779


In [88]:
#Save the model
pickle.dump(svm2, open( "models/svm2.pickle", "wb" ) )

In [114]:
#Compare predited valued with real values
compare_svm_k = pd.DataFrame({'Predicted value':Y_pred, 'Real value':Y_test})
compare_svm_k

Unnamed: 0,Predicted value,Real value
0,0.0,0.0
1,1.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,1.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,0.0,1.0


### Decision Trees

Standarization of features is not requiered

In [85]:
#Create a model based on data
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

In [86]:
tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [87]:
Y_pred = tree.predict(X_test)
Y_pred

array([ 0.,  1.,  0., ...,  0.,  0.,  0.])

In [88]:
#We can estimate the accuracy of these predictions
from sklearn.metrics import accuracy_score
print('Number of predictions: %d' % len(Y_test))
print('Number of misclassifications: %d' % (Y_test != Y_pred) .sum())
print('Accuracy: %.2f' % accuracy_score(Y_test, Y_pred))

Number of predictions: 33852
Number of misclassifications: 10478
Accuracy: 0.69


In [89]:
pickle.dump(tree, open( "models/tree.pickle", "wb" ) )

In [90]:
#We can generate a figure for interpretation
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file='arbol_eurobarometro.dot')

In [91]:
#And locally export it to .png
!dot -Tpng arbol_eurobarometro.dot -o arbol_eurobarometro.png

In [333]:
#Visualize it within the notebook
#Check https://stackoverflow.com/questions/43372723/how-to-open-dot-on-mac
#Find the picture at: https://www.dropbox.com/s/v2v2qil6lssune1/arbol_eurobarometro_EXPLICADO.png?dl=0
from IPython.core.display import Image, display
PATH = "/Users/carlosarcila/Dropbox/CARLOS ARCILA DOCS/CURSO COMPLETO ESTADISTICA/Machine Learning/"
Image(filename = PATH + "arbol_eurobarometro_EXPLICADO.png", width=800, height=100)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/carlosarcila/Dropbox/CARLOS ARCILA DOCS/CURSO COMPLETO ESTADISTICA/Machine Learning/arbol_eurobarometro_EXPLICADO.png'

### Random Forest

Standarization of features is not requiered

In [92]:
#Create a model based on data
#Random forests with 10 trees
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1, n_jobs=2)

In [93]:
forest.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [94]:
Y_pred = forest.predict(X_test)
Y_pred

array([ 0.,  1.,  0., ...,  0.,  0.,  0.])

In [95]:
print('Number of predictions: %d' % len(Y_test))
print('Number of misclassifications: %d' % (Y_test != Y_pred) .sum())
print('Accuracy: %.2f' % accuracy_score(Y_test, Y_pred))

Number of predictions: 33852
Number of misclassifications: 10296
Accuracy: 0.70


In [96]:
pickle.dump(forest, open( "models/forest.pickle", "wb" ) )

In [97]:
#We can get the most informative values
#Age:0.41 ; Educational:0.18
importances = forest.feature_importances_
importances

array([ 0.01,  0.1 ,  0.15,  0.13,  0.03,  0.01,  0.05,  0.01,  0.01,
        0.02,  0.01,  0.01,  0.01,  0.01,  0.01,  0.01,  0.01,  0.01,
        0.  ,  0.02,  0.01,  0.  ,  0.04,  0.01,  0.01,  0.  ,  0.  ,
        0.01,  0.01,  0.03,  0.01,  0.02,  0.01,  0.02,  0.02,  0.02,
        0.02,  0.01,  0.02,  0.02,  0.02,  0.01,  0.01,  0.02,  0.02,  0.02])

### K Nearest Neighbor
We use standarized features again

In [98]:
#Create a model based on data
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')

In [99]:
knn.fit(X_train_std, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [100]:
Y_pred = knn.predict(X_test)

In [101]:
print('Number of predictions: %d' % len(Y_test))
print('Number of misclassifications: %d' % (Y_test != Y_pred) .sum())
print('Accuracy: %.2f' % accuracy_score(Y_test, Y_pred))

Number of predictions: 33852
Number of misclassifications: 10626
Accuracy: 0.69


In [102]:
pickle.dump(knn, open( "models/knn.pickle", "wb" ) )

### Predictions for individuals

In [166]:
#Based on the model produced by decision trees
#We estimate the probabilities of supporting refugees for different individuals:

#Czech with 10 years of education, 20% chance for support
individual_1 = np.array([10, 40, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_1 = np.reshape(individual_1, (1, -1)) #Transform to 2d array
individual_1.shape
print(tree.predict_proba(individual_1))

[[ 0.2  0.8]]


In [167]:
#Greek with 30 years of education, 69% chance for support
individual_2 = np.array([30, 40, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_2 = np.reshape(individual_2, (1, -1)) #Transform to 2d array
print(tree.predict_proba(individual_2))

[[ 0.69  0.31]]


In [180]:
#Based on the model produced by KNN
#k-NN classifiers do not output probabilities. It is discriminative classification
#In k-NN classification, the output is a class membership.
#In k-NN regression, the output is the property value for the object. 

#Czech with 10 years of education
individual_1 = np.array([10, 40, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_1 = individual_1.astype(float) #Convert to float for computation
individual_1 = np.reshape(individual_1, (1, -1)) #Transform to 2d array
individual_1_std = sc.transform(individual_1)
print(knn.predict_proba(individual_1_std))

[[ 0.  1.]]


In [181]:
#Greek with 30 years of education
individual_2 = np.array([30, 40, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_2 = individual_2.astype(float) #Convert to float for computation
individual_2 = np.reshape(individual_2, (1, -1)) #Transform to 2d array
individual_2__std = sc.transform(individual_2)
print(knn.predict_proba(individual_2__std))

[[ 1.  0.]]


In [83]:
#Based on the model produced by LG
#We estimate the probabilities of supporting refugees for different individuals:

#Czech with 10 years of education, 13% chance for support
individual_1 = np.array([10, 40, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_1 = individual_1.astype(float) #Convert to float for computation
individual_1 = np.reshape(individual_1, (1, -1)) #Transform to 2d array
individual_1_std = sc.transform(individual_1)
print(lr.predict_proba(individual_1_std))

[[ 1.  0.]]


In [186]:
#Greek with 30 years of education, 84% chance for support
individual_2 = np.array([30, 40, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_2 = individual_2.astype(float) #Convert to float for computation
individual_2 = np.reshape(individual_2, (1, -1)) #Transform to 2d array
individual_2__std = sc.transform(individual_2)
print(lr.predict_proba(individual_2__std))

[[ 0.84  0.16]]


In [235]:
#Based on the model produced by SVM
#Czech with 10 years of education, 60% chance for support

individual_1 = np.array([10, 40, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_1 = individual_1.astype(float) #Convert to float for computation
individual_1 = np.reshape(individual_1, (1, -1)) #Transform to 2d array
individual_1_std = sc.transform(individual_1)
P_pred = svm.predict_proba(individual_1)
P_pred

array([[ 0.6,  0.4]])

In [232]:
#Greek with 30 years of education, 68% chance for support
individual_2 = np.array([30, 40, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])
individual_2 = individual_2.astype(float) #Convert to float for computation
individual_2 = np.reshape(individual_2, (1, -1)) #Transform to 2d array
individual_2_std = sc.transform(individual_2)
P_pred = svm.predict_proba(individual_2)
P_pred


array([[ 0.68,  0.32]])