In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Raw poll responses, one file per collection mode (online panel and phone).
# Comma is read_csv's default separator, so it is not spelled out.
df_online = pd.read_csv("./NatRep_Online_Upload.csv")
df_phone = pd.read_csv("./NatRep_Phone_upload.csv")

In [3]:
# Reference table for the 2015 general election vote (comma-separated, the read_csv default).
df_elections = pd.read_csv("./2015_general_elections/2015_voting_gen_election.csv")

In [4]:
# Reference table of education qualification levels (comma-separated, the read_csv default).
df_qualifications = pd.read_csv("./Education_qualifications/UK_Qualifications.csv")

In [5]:
# Reference table of male/female head counts by age (comma-separated, the read_csv default).
df_sex_to_age = pd.read_csv("./Gender_demographics_by_age/UK_M_to_F_ratio_by_age.csv")

In [6]:
# Reference table of newspaper readerships (comma-separated, the read_csv default).
df_newspaper = pd.read_csv("./Newspaper_readability/Newspaper_readerships_uk.csv")

In [7]:
# Reference table of approximated social grades (comma-separated, the read_csv default).
df_social_grade = pd.read_csv("./Social_grade/Aproximated_social_grade.csv")

In [8]:
# Reference table of TV news station ratings (comma-separated, the read_csv default).
df_station_ratings = pd.read_csv("./Station_ratings/Station_Ratings_UK.csv")

In [9]:
# Start by removing the ages that cannot legally vote: the first 18 rows of the table.
# NOTE(review): assumes one row per year of age, ordered from age 0 upwards -- confirm
# against the CSV.
# A single positional slice replaces 18 successive single-row drops (each of which copied
# the whole frame).
# NOTE: this cell is not idempotent -- re-running it removes another 18 rows.
df_sex_to_age = df_sex_to_age.iloc[18:]

In [10]:
# To avoid overfitting, aggregate the per-age rows into groups of four consecutive rows.
# Grouping is positional: rows 0-3 form group 0, rows 4-7 form group 1, and so on; the
# values of each group are summed.
df_sex_to_agegroup = df_sex_to_age.groupby(np.arange(len(df_sex_to_age))//4).sum()
# Label every group with the 'Age' value of each fourth row of df_sex_to_age.
# NOTE(review): the slice start `1` in .loc is a label, not a position; this presumably
# relies on the remaining index labels all being greater than 1 so that the slice begins
# at the first remaining row -- confirm.
df_sex_to_agegroup.index = df_sex_to_age.loc[1::4,'Age']

In [11]:
# Column totals of the voting-age table; used as denominators for the male and female
# head counts.
totals = df_sex_to_age.sum(axis="index")

In [12]:
# Rebuild the age-group table so that it holds shares instead of head counts: every
# group's count is divided by the matching column total from `totals`.
# The division is done on whole columns rather than by writing into the rows yielded by
# iterrows(): those rows may be copies, in which case the assignment never reaches the
# frame, and positional row[0]/row[1] access depends on the column order.
# NOTE: this cell is not idempotent -- re-running it divides the shares again.
for column in ("Number of males", "Number of females"):
    df_sex_to_agegroup[column] = df_sex_to_agegroup[column].astype(float) / totals[column]

In [13]:
# Create a dataframe holding, for every qualification level, the mean of the matching
# percentage column of df_qualifications scaled by 0.01 (i.e. as a fraction).
# The column order is pinned explicitly because conditions() reads the share by position
# (df_mean_qualifications.iat[i, 0]); '%' must therefore be the first column no matter
# how the installed pandas orders dict keys.
# Plain Python lists replace np.array(..., dtype='string'), which current numpy rejects.
qualification_levels = ["No Qualification", "Level 1", "Level 2", "Level 3", "Level 4", "Other"]
qualification_columns = ["% No Qualifications", "% Level 1", "% Level 2", "% Level 3", "% Level 4", "% Other"]
df_mean_qualifications = pd.DataFrame(
    {
        '%': [float(df_qualifications[column].mean() * 0.01) for column in qualification_columns],
        'Qualification': qualification_levels,
    },
    columns=['%', 'Qualification'],
)

In [14]:
# Express every newspaper's combined readership as a share of the total readership.
combined_readers = df_newspaper["Combined"].values
news_total = combined_readers.sum()
# Shares are kept as text formatted to 5 significant digits.
pn = ["{0:.5g}".format((float(readers)/float(news_total))) for readers in combined_readers]
cn = list(df_newspaper["Title"].values)
# After the transpose, column 0 holds the share and column 1 the newspaper title.
df_percent_newspapers = pd.DataFrame([pn,cn]).T

In [15]:
#find the respective percentages in df_online and df_phone in order to perform the bootstrap
def find_precentages_in_data(data):
    """Compute the share of respondents in each category of a poll sample.

    Parameters
    ----------
    data : pandas.DataFrame
        Poll responses. The columns read are 'Age', 'gender', 'Socgrade_matrix_w8',
        'voted2015', 'pastvote_2015', 'profile_education_level', 'Newspaper_Read'
        and 'TV_News'.

    Returns
    -------
    numpy.ndarray
        Object array of shape (7, 1). Element ``[k][0]`` is a float array of shares,
        each rounded to three decimals, in this order:

        0. men (gender == 1) per age band, 19 entries: 18-21, 22-25, ..., 86-89, 90+
        1. women (gender == 2) per age band, same 19 bands
        2. social grade codes 1-4
        3. 2015 party codes 1-9, among rows with voted2015 == 1
        4. education levels (6 entries, built from profile_education_level codes)
        5. newspapers (15 entries; codes 15, 16 and 17 share the last entry)
        6. TV news station codes 1-6

    Notes
    -----
    Every group is normalised by its own total, so a group without any matching row
    yields NaN shares.
    """
    def _shares(counts):
        # Normalise counts to shares, rounded exactly as "{0:.3f}" formatting rounds.
        total = counts.sum()
        return np.array([float("{0:.3f}".format(count / total)) for count in counts], dtype=float)

    def _count_codes(column, codes):
        # Number of rows whose `column` equals any of the given codes.
        values = data[column]
        return sum(int((values == code).sum()) for code in codes)

    ######### AGES ##########
    age = data['Age']
    is_male = data['gender'] == 1
    is_female = data['gender'] == 2
    p_ages_male = np.zeros((19,), dtype=float)
    p_ages_female = np.zeros((19,), dtype=float)
    for step, lower in enumerate(range(18, 90, 4)):
        in_band = (age >= lower) & (age <= lower + 3)
        p_ages_male[step] = (in_band & is_male).sum()
        p_ages_female[step] = (in_band & is_female).sum()
    #90+ : the bound is inclusive so that respondents aged exactly 90 are counted;
    #a strict "90 < Age" would leave them out of every band
    oldest = age >= 90
    p_ages_male[18] = (oldest & is_male).sum()
    p_ages_female[18] = (oldest & is_female).sum()

    #percentages per age group
    p_ages_male = _shares(p_ages_male)
    p_ages_female = _shares(p_ages_female)

    ########## SOCIAL GRADE ##########
    p_sgrade = _shares(np.array(
        [_count_codes('Socgrade_matrix_w8', [code]) for code in range(1, 5)], dtype=float))

    ########## 2015 ELECTIONS VOTING ##########
    p_party = np.zeros((9,), dtype=float)
    voted_data = data.loc[data['voted2015'] == 1, 'pastvote_2015']
    for vote in voted_data:
        party = int(vote)
        # Only codes 1-9 are tallied; the lower bound stops a code of 0 or below from
        # wrapping around to the end of the array through negative indexing.
        if 1 <= party <= 9:
            p_party[party - 1] += 1
    p_party = _shares(p_party)

    ########## EDUCATION QUALIFICATIONS ##########
    education_codes = [
        [1],                     #No qualifications
        [2, 5, 8],               #Level 1 qualifications
        [6, 9, 10],              #Level 2 qualifications
        [11, 7, 12, 13, 14],     #Level 3 qualifications
        [15, 16, 17],            #Level 4 qualifications
        [18],                    #Other qualifications
    ]
    p_levels = _shares(np.array(
        [_count_codes('profile_education_level', codes) for codes in education_codes], dtype=float))

    ########## NEWSPAPERS PREFERRED ##########
    # Codes 1-14 get one entry each; codes 15, 16 and 17 are pooled into the last entry.
    newspaper_codes = [[code] for code in range(1, 15)] + [[15, 16, 17]]
    p_newsp = _shares(np.array(
        [_count_codes('Newspaper_Read', codes) for codes in newspaper_codes], dtype=float))

    ########## NEWS STATION PREFERRED ##########
    p_newschannel = _shares(np.array(
        [_count_codes('TV_News', [code]) for code in range(1, 7)], dtype=float))

    ########## FINAL ARRAY ##########
    # The groups have different lengths, so the (7, 1) object array is filled element by
    # element instead of leaving numpy to infer the shape of a ragged nested list.
    groups = [p_ages_male, p_ages_female, p_sgrade, p_party, p_levels, p_newsp, p_newschannel]
    t1 = np.empty((len(groups), 1), dtype=object)
    for row, shares in enumerate(groups):
        t1[row, 0] = shares
    ########## ########### ##########
    return t1
    
    

In [16]:
# Category shares observed in the online poll sample, before any rows are added.
online_precentages = find_precentages_in_data(df_online)

In [17]:
#use bootstrapping to create new examples in order for the data percentages to match the population percentages
def bootstrap(data, sample_of_interest, condition):
    """Build one synthetic row for ``data`` and return it as a list of values.

    The first value is a fresh id: the last value of the first column plus one. Every
    other column is filled with one value drawn at random from that column, each column
    being drawn on its own, so the result is not a copy of any single existing row.
    For the column named ``sample_of_interest`` the draw is restricted to the rows
    matching the query ``sample_of_interest + condition`` (for instance ``"Age" + " == 30"``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from; its first column holds the row id.
    sample_of_interest : str
        Name of the column whose draw is restricted. If it matches no column, no
        restriction is applied.
    condition : str
        Query fragment appended to the column name.

    Returns
    -------
    list
        One value per column of ``data``, in column order.
    """
    names = list(data.columns.values)
    id_name, other_names = names[0], names[1:]
    row = [(data[id_name].iloc[-1]) + 1]
    for name in other_names:
        if name == sample_of_interest:
            # Under-represented value: draw only from the rows satisfying the condition.
            candidates = data.query(sample_of_interest + condition)[name]
        else:
            candidates = data[name]
        row.append((np.random.choice(candidates, 1)).item(0))
    return row
    
        

In [18]:
#use multiple conditions to avoid confusion in the code
def _share_relation(sample_share, population_share):
    """Return '<', '=' or '>' for a sample share against a population share.

    Both values are rounded to three decimals first and then compared as numbers.
    Comparing the formatted text instead would order values lexicographically
    (e.g. "9.000" sorts after "10.000").
    """
    sample_value = float("{0:.3f}".format(float(sample_share)))
    population_value = float("{0:.3f}".format(float(population_share)))
    if sample_value < population_value:
        return '<'
    if sample_value == population_value:
        return '='
    return '>'


def conditions(online_precentages, df_sex_to_agegroup, df_social_grade, df_elections, df_mean_qualifications, df_percent_newspapers, df_station_ratings):
    """Compare every sample share with its population counterpart.

    Parameters
    ----------
    online_precentages : array-like
        Seven groups of sample shares; group ``k`` is read as ``online_precentages[k][0]``
        (the layout returned by find_precentages_in_data).
    df_sex_to_agegroup : pandas.DataFrame
        Population share per age group; column 0 is read for men, column 1 for women.
    df_social_grade : pandas.DataFrame
        Row 0 is read, starting at column 5, for the social grades.
    df_elections : pandas.DataFrame
        Column 2 is read for the 2015 party shares.
    df_mean_qualifications : pandas.DataFrame
        Column 0 is read for the qualification shares.
    df_percent_newspapers : pandas.DataFrame
        Column 0 is read for the newspaper shares (numbers stored as text are accepted).
    df_station_ratings : pandas.DataFrame
        Column 1 is read for the TV news station shares.

    Returns
    -------
    list of list of str
        Seven lists, one per group in the order men by age, women by age, social grade,
        2015 vote, qualifications, newspapers, TV news. Each entry is '<' when the sample
        share is below the population share, '=' when both agree to three decimals and
        '>' otherwise.
    """
    # One positional lookup per group, in the same order as the groups of online_precentages.
    population_lookups = [
        lambda i: df_sex_to_agegroup.iat[i, 0],       # ages of men
        lambda i: df_sex_to_agegroup.iat[i, 1],       # ages of women
        lambda i: df_social_grade.iat[0, (i + 5)],    # social grades
        lambda i: df_elections.iat[i, 2],             # 2015 elections voting
        lambda i: df_mean_qualifications.iat[i, 0],   # population qualifications
        lambda i: df_percent_newspapers.iat[i, 0],    # newspapers preference
        lambda i: df_station_ratings.iat[i, 1],       # news stations
    ]

    ########## FINAL CONDITION CONSTRUCTION ##########
    condition = []
    for group, lookup in enumerate(population_lookups):
        sample = online_precentages[group][0]
        condition.append([_share_relation(sample[i], lookup(i)) for i in range(0, sample.size)])

    return condition

In [19]:
# Initial '<' / '=' / '>' markers comparing the online sample with every population table.
loop_condition = conditions(online_precentages, df_sex_to_agegroup, df_social_grade, df_elections, df_mean_qualifications, df_percent_newspapers, df_station_ratings)

In [20]:
def check(condition_elements):
    """Return True when every marker in every condition list equals '='.

    Parameters
    ----------
    condition_elements : iterable of iterables of str
        Condition lists as produced by conditions().

    Returns
    -------
    bool
        True if no marker differs from '=' (also for empty input), False otherwise.
    """
    # Walk the markers inside each list (not the outer list itself) and report the result
    # to the caller; all() stops at the first marker that differs.
    return all(element == '=' for ar in condition_elements for element in ar)

In [21]:
def bootstrap_men(data0, loop_con, online_perc, data1, data2, data3, data4, data5, data6):
    """Add synthetic male rows until the male age shares match the population.

    For every age band marked '<' in ``loop_con[0]`` one row at a time is generated with
    bootstrap(), appended to the data, and the shares and markers are recomputed. Passes
    over the bands repeat while more than one band still differs from '='.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Poll data to extend; the caller's frame is not modified.
    loop_con : list of list of str
        Current markers from conditions(); group 0 (men by age) drives this function.
    online_perc :
        Not used; kept so that all bootstrap_* functions share one signature.
    data1, data2, data3, data4, data5, data6 :
        Population tables, forwarded unchanged to conditions().

    Returns
    -------
    pandas.DataFrame
        ``data0`` with the generated rows appended.

    Notes
    -----
    Generated rows are addressed by position: 19 is overwritten with the chosen age and
    23 is compared with the gender code. NOTE(review): positions 17 and 18 are presumably
    the voted-in-2015 flag and the 2015 party -- confirm against the CSV column order.
    There is no iteration cap: the loops end only once the markers allow it.
    """
    def count_mismatches(markers):
        # Number of male age bands whose share differs from the population share.
        return sum(1 for el in markers[0] if el != '=')

    flag = count_mismatches(loop_con)

    ########## MEN BY AGE ##########
    while (flag > 1):
        i = 18
        for c in range(0, len(loop_con[0])):
            while (loop_con[0][c] == '<'):
                age_chosen = random.randint(i, i+3)
                string = " == " + str(age_chosen)
                # From the 86-89 band upwards the donor pool is taken from rows with Age > 90.
                if (i >= 86):
                    string = " > 90"
                # Redraw until the row is male AND does not claim to have voted while
                # naming no party. Both requirements are tested on every draw, so a
                # redraw made for one reason cannot break the other.
                while True:
                    temp = bootstrap(data0, "Age", string)
                    is_male = (int(temp[23]) == 1)
                    voted_without_party = ((temp[17] == 1) and (temp[18] == ' '))
                    if is_male and not voted_without_party:
                        break
                #TO AVOID NOT BOOTSTRAPPING THE AGE CHOSEN
                #(applied to the accepted row, after all redraws)
                temp[19] = age_chosen
                df_additional = pd.DataFrame([temp], columns=list(data0.columns.values))
                # pd.concat is used because DataFrame.append no longer exists in pandas 2.
                data0 = pd.concat([data0, df_additional], ignore_index=True)
                online_prec = find_precentages_in_data(data0)
                loop_con = conditions(online_prec, data1, data2, data3, data4, data5, data6)

            i = i+4

        flag = count_mismatches(loop_con)

    return data0

In [22]:
def bootstrap_women(data0, loop_con, online_perc, data1, data2, data3, data4, data5, data6):
    """Add synthetic female rows until the female age shares match the population.

    For every age band marked '<' in ``loop_con[1]`` one row at a time is generated with
    bootstrap(), appended to the data, and the shares and markers are recomputed. Passes
    over the bands repeat while more than one band still differs from '='.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Poll data to extend; the caller's frame is not modified.
    loop_con : list of list of str
        Current markers from conditions(); group 1 (women by age) drives this function.
    online_perc :
        Not used; kept so that all bootstrap_* functions share one signature.
    data1, data2, data3, data4, data5, data6 :
        Population tables, forwarded unchanged to conditions().

    Returns
    -------
    pandas.DataFrame
        ``data0`` with the generated rows appended.

    Notes
    -----
    Generated rows are addressed by position: 19 is overwritten with the chosen age and
    23 is compared with the gender code. NOTE(review): positions 17 and 18 are presumably
    the voted-in-2015 flag and the 2015 party -- confirm against the CSV column order.
    There is no iteration cap: the loops end only once the markers allow it.
    """
    def count_mismatches(markers):
        # Number of female age bands whose share differs from the population share.
        return sum(1 for el in markers[1] if el != '=')

    flag = count_mismatches(loop_con)

    ########## WOMEN BY AGE ##########
    while (flag > 1):
        i = 18
        for c in range(0, len(loop_con[1])):
            while (loop_con[1][c] == '<'):
                age_chosen = random.randint(i, i+3)
                string = " == " + str(age_chosen)
                # From the 86-89 band upwards the donor pool is taken from rows with Age > 90.
                if (i >= 86):
                    string = " > 90"
                # Redraw until the row is female AND does not claim to have voted while
                # naming no party. Both requirements are tested on every draw, so a
                # redraw made for one reason cannot break the other.
                while True:
                    temp = bootstrap(data0, "Age", string)
                    is_female = (int(temp[23]) == 2)
                    voted_without_party = ((temp[17] == 1) and (temp[18] == ' '))
                    if is_female and not voted_without_party:
                        break
                #TO AVOID NOT BOOTSTRAPPING THE AGE CHOSEN
                #(applied to the accepted row, after all redraws)
                temp[19] = age_chosen
                df_additional = pd.DataFrame([temp], columns=list(data0.columns.values))
                # pd.concat is used because DataFrame.append no longer exists in pandas 2.
                data0 = pd.concat([data0, df_additional], ignore_index=True)
                online_prec = find_precentages_in_data(data0)
                loop_con = conditions(online_prec, data1, data2, data3, data4, data5, data6)

            i = i+4

        flag = count_mismatches(loop_con)

    return data0

In [23]:
def bootstrap_social_grades(data0, loop_con, online_perc, data1, data2, data3, data4, data5, data6):
    """Add synthetic rows until the social grade shares match the population.

    For every social grade marked '<' in ``loop_con[2]`` one row at a time is generated
    with bootstrap() (restricted to ``Socgrade_matrix_w8 == c + 1``), appended to the
    data, and the shares and markers are recomputed. Passes repeat while more than one
    grade still differs from '='.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Poll data to extend; the caller's frame is not modified.
    loop_con : list of list of str
        Current markers from conditions(); group 2 (social grades) drives this function.
    online_perc :
        Not used; kept so that all bootstrap_* functions share one signature.
    data1, data2, data3, data4, data5, data6 :
        Population tables, forwarded unchanged to conditions().

    Returns
    -------
    pandas.DataFrame
        ``data0`` with the generated rows appended.

    Notes
    -----
    NOTE(review): positions 17 and 18 of a generated row are presumably the
    voted-in-2015 flag and the 2015 party -- confirm against the CSV column order.
    There is no iteration cap: the loops end only once the markers allow it.
    """
    def count_mismatches(markers):
        # Number of social grades whose share differs from the population share.
        return sum(1 for el in markers[2] if el != '=')

    flag = count_mismatches(loop_con)

    ########## SOCIAL GRADES DISTRIBUTION ##########
    while (flag > 1):
        for c in range(0, len(loop_con[2])):
            while (loop_con[2][c] == '<'):
                string = "== " + str(c+1)
                temp = bootstrap(data0, "Socgrade_matrix_w8", string)
                #Make sure if the people voted, they voted for someone!
                #(Data with values that show that the person did not vote,
                #but have a voted party will be discarded by the percentages function)
                # The redraw names the same column as the first draw: with a misspelled
                # column name bootstrap() applies no restriction at all and the row could
                # carry any social grade.
                while ((temp[17] == 1) and (temp[18] == ' ')):
                    temp = bootstrap(data0, "Socgrade_matrix_w8", string)
                df_additional = pd.DataFrame([temp], columns=list(data0.columns.values))
                # pd.concat is used because DataFrame.append no longer exists in pandas 2.
                data0 = pd.concat([data0, df_additional], ignore_index=True)
                online_prec = find_precentages_in_data(data0)
                loop_con = conditions(online_prec, data1, data2, data3, data4, data5, data6)

        flag = count_mismatches(loop_con)

    return data0

In [24]:
def bootstrap_past_votings(data0, loop_con, online_perc, data1, data2, data3, data4, data5, data6):
    """Add synthetic voter rows until the 2015 party shares match the population.

    For every party marked '<' in ``loop_con[3]`` one row at a time is generated with
    bootstrap(), its party (position 18) is set to the party code ``c + 1``, the row is
    appended, and the shares and markers are recomputed. Passes repeat while more than
    three parties still differ from '='.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Poll data to extend; the caller's frame is not modified.
    loop_con : list of list of str
        Current markers from conditions(); group 3 (2015 vote) drives this function.
    online_perc :
        Not used; kept so that all bootstrap_* functions share one signature.
    data1, data2, data3, data4, data5, data6 :
        Population tables, forwarded unchanged to conditions().

    Returns
    -------
    pandas.DataFrame
        ``data0`` with the generated rows appended.

    Notes
    -----
    There is no iteration cap: the loops end only once the markers allow it.
    """
    def count_mismatches(markers):
        # Number of parties whose share differs from the population share.
        return sum(1 for el in markers[3] if el != '=')

    flag = count_mismatches(loop_con)

    ########## PAST VOTINGS BY PARTIES DISTRIBUTION ##########
    while (flag > 3):
        for c in range(0, len(loop_con[3])):
            while (loop_con[3][c] == '<'):
                # Restrict the draw to respondents who voted. The column is spelled
                # 'voted2015', as in find_precentages_in_data; a name that matches no
                # column makes bootstrap() skip the restriction, so the new row could be
                # a non-voter that the party shares never count.
                # NOTE(review): assumes the frame has no separate 'voted_2015' column -- confirm.
                temp = bootstrap(data0, "voted2015", "== 1")
                # The party code is stored as text.
                temp[18] = str(c+1)
                df_additional = pd.DataFrame([temp], columns=list(data0.columns.values))
                # pd.concat is used because DataFrame.append no longer exists in pandas 2.
                data0 = pd.concat([data0, df_additional], ignore_index=True)
                online_prec = find_precentages_in_data(data0)
                loop_con = conditions(online_prec, data1, data2, data3, data4, data5, data6)

        flag = count_mismatches(loop_con)

    return data0

In [25]:
def bootstrap_education(data0, loop_con, online_perc, data1, data2, data3, data4, data5, data6):
    """Add synthetic rows until the education level shares match the population.

    For every level marked '<' in ``loop_con[4]`` one row at a time is generated with
    bootstrap() (restricted to one ``profile_education_level`` code of that level),
    appended to the data, and the shares and markers are recomputed. Passes repeat while
    more than two levels still differ from '='.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Poll data to extend; the caller's frame is not modified.
    loop_con : list of list of str
        Current markers from conditions(); group 4 (qualifications) drives this function.
    online_perc :
        Not used; kept so that all bootstrap_* functions share one signature.
    data1, data2, data3, data4, data5, data6 :
        Population tables, forwarded unchanged to conditions().

    Returns
    -------
    pandas.DataFrame
        ``data0`` with the generated rows appended.

    Notes
    -----
    NOTE(review): positions 17 and 18 of a generated row are presumably the
    voted-in-2015 flag and the 2015 party -- confirm against the CSV column order.
    There is no iteration cap: the loops end only once the markers allow it.
    """
    # profile_education_level codes that make up levels 1 to 4; one of them is picked at
    # random. Level index 0 (no qualification) is code 1 and any other index is code 18.
    level_codes = {
        1: [2, 5, 8],               #Level 1 qualifications
        2: [6, 9, 10],              #Level 2 qualifications
        3: [11, 7, 12, 13, 14],     #Level 3 qualifications
        4: [15, 16, 17],            #Level 4+ qualifications
    }

    def count_mismatches(markers):
        # Number of education levels whose share differs from the population share.
        return sum(1 for el in markers[4] if el != '=')

    flag = count_mismatches(loop_con)

    ########## EDUCATION QUALIFICATIONS DISTRIBUTION ##########
    while (flag > 2):
        for c in range(0, len(loop_con[4])):
            while (loop_con[4][c] == '<'):
                if (c == 0):
                    #No qualification
                    i = 1
                elif (c in level_codes):
                    i = random.choice(level_codes[c])
                else:
                    #Other qualifications
                    i = 18

                string = "== " + str(i)
                temp = bootstrap(data0, "profile_education_level", string)
                #Make sure if the people voted, they voted for someone!
                #(Data with values that show that the person did not vote,
                #but have a voted party will be discarded by the percentages function)
                while ((temp[17] == 1) and (temp[18] == ' ')):
                    temp = bootstrap(data0, "profile_education_level", string)
                df_additional = pd.DataFrame([temp], columns=list(data0.columns.values))
                # pd.concat is used because DataFrame.append no longer exists in pandas 2.
                data0 = pd.concat([data0, df_additional], ignore_index=True)
                online_prec = find_precentages_in_data(data0)
                loop_con = conditions(online_prec, data1, data2, data3, data4, data5, data6)

        flag = count_mismatches(loop_con)

    return data0

In [26]:
def bootstrap_newspapers(data0, loop_con, online_perc, data1, data2, data3, data4, data5, data6):
    """Add synthetic rows until the newspaper shares match the population.

    For every newspaper entry marked '<' in ``loop_con[5]`` one row at a time is
    generated with bootstrap() (restricted to ``Newspaper_Read == c + 1``), appended to
    the data, and the shares and markers are recomputed. Passes repeat while more than
    one entry still differs from '='.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Poll data to extend; the caller's frame is not modified.
    loop_con : list of list of str
        Current markers from conditions(); group 5 (newspapers) drives this function.
    online_perc :
        Not used; kept so that all bootstrap_* functions share one signature.
    data1, data2, data3, data4, data5, data6 :
        Population tables, forwarded unchanged to conditions().

    Returns
    -------
    pandas.DataFrame
        ``data0`` with the generated rows appended.

    Notes
    -----
    NOTE(review): positions 17 and 18 of a generated row are presumably the
    voted-in-2015 flag and the 2015 party -- confirm against the CSV column order.
    There is no iteration cap: the loops end only once the markers allow it.
    """
    def count_mismatches(markers):
        # Number of newspaper entries whose share differs from the population share.
        return sum(1 for el in markers[5] if el != '=')

    flag = count_mismatches(loop_con)

    ########## NEWSPAPER PREFERENCE DISTRIBUTION ##########
    while (flag > 1):
        for c in range(0, len(loop_con[5])):
            while (loop_con[5][c] == '<'):
                string = "== " + str(c+1)
                temp = bootstrap(data0, "Newspaper_Read", string)
                #Make sure if the people voted, they voted for someone!
                #(Data with values that show that the person did not vote,
                #but have a voted party will be discarded by the percentages function)
                while ((temp[17] == 1) and (temp[18] == ' ')):
                    temp = bootstrap(data0, "Newspaper_Read", string)
                df_additional = pd.DataFrame([temp], columns=list(data0.columns.values))
                # pd.concat is used because DataFrame.append no longer exists in pandas 2.
                data0 = pd.concat([data0, df_additional], ignore_index=True)
                online_prec = find_precentages_in_data(data0)
                loop_con = conditions(online_prec, data1, data2, data3, data4, data5, data6)

        flag = count_mismatches(loop_con)

    return data0

In [27]:
def bootstrap_tvnews(data0, loop_con, online_perc, data1, data2, data3, data4, data5, data6):
    """Add synthetic rows until the TV news station shares match the population.

    For every station marked '<' in ``loop_con[6]`` one row at a time is generated with
    bootstrap(), appended to the data, and the shares and markers are recomputed. Passes
    repeat while more than one station still differs from '='.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Poll data to extend; the caller's frame is not modified.
    loop_con : list of list of str
        Current markers from conditions(); group 6 (TV news) drives this function.
    online_perc :
        Not used; kept so that all bootstrap_* functions share one signature.
    data1, data2, data3, data4, data5, data6 :
        Population tables, forwarded unchanged to conditions().

    Returns
    -------
    pandas.DataFrame
        ``data0`` with the generated rows appended.

    Notes
    -----
    Station code 2 is special-cased: the row is drawn with ``TV_News == 1`` and position
    10 of the row is then overwritten with 2. NOTE(review): position 10 is presumably the
    TV_News column, and positions 17 and 18 the voted-in-2015 flag and the 2015 party --
    confirm against the CSV column order.
    There is no iteration cap: the loops end only once the markers allow it.
    """
    def count_mismatches(markers):
        # Number of TV news stations whose share differs from the population share.
        return sum(1 for el in markers[6] if el != '=')

    flag = count_mismatches(loop_con)

    ########## NEWS STATION PREFERENCE DISTRIBUTION ##########
    while (flag > 1):
        for c in range(0, len(loop_con[6])):
            while (loop_con[6][c] == '<'):
                i = c+1
                if (i == 2):
                    string = "== " + str(i-1)
                else:
                    string = "== " + str(i)
                #Make sure if the people voted, they voted for someone!
                #(Data with values that show that the person did not vote,
                #but have a voted party will be discarded by the percentages function)
                while True:
                    temp = bootstrap(data0, "TV_News", string)
                    if not ((temp[17] == 1) and (temp[18] == ' ')):
                        break
                # Applied to the accepted row, after all redraws.
                if (i == 2):
                    temp[10] = 2
                df_additional = pd.DataFrame([temp], columns=list(data0.columns.values))
                # pd.concat is used because DataFrame.append no longer exists in pandas 2.
                data0 = pd.concat([data0, df_additional], ignore_index=True)
                online_prec = find_precentages_in_data(data0)
                loop_con = conditions(online_prec, data1, data2, data3, data4, data5, data6)

        flag = count_mismatches(loop_con)

    return data0

In [28]:
# Show every population reference table that the bootstrap stages compare against.
# Single-argument print(...) calls are used so the cell runs on Python 2 and Python 3.
print("########### MEN POPULATION % ##########")
print(df_sex_to_agegroup["Number of males"])
print("")
print("########### WOMEN POPULATION % ##########")
print(df_sex_to_agegroup["Number of females"])
print("")
print("########### SOCIAL GRADE POPULATION % ##########")
print(df_social_grade.iloc[[0]])
print("")
print("########### VOTING PARTY POPULATION % ##########")
print(df_elections)
print("")
print("########### QUALIFICATIONS POPULATION % ##########")
print(df_mean_qualifications)
print("")
print("########### NEWSPAPERS POPULATION % ##########")
print(df_percent_newspapers)
print("")
print("########### TX NEWS POPULATION % ##########")
print(df_station_ratings)
print("")

########### MEN POPULATION % ##########
Age
18     0.066878
22     0.072109
26     0.070812
30     0.069682
34     0.065900
38     0.065440
42     0.072479
46     0.074537
50     0.072203
54     0.064226
58     0.057229
62     0.055368
66     0.055595
70     0.041469
74     0.034191
78     0.026345
82     0.018620
86     0.010553
90+    0.006366
Name: Number of males, dtype: float64

########### WOMEN POPULATION % ##########
Age
18     0.060318
22     0.066678
26     0.067333
30     0.067036
34     0.062987
38     0.062975
42     0.070616
46     0.072751
50     0.070203
54     0.062309
58     0.056035
62     0.055059
66     0.055897
70     0.043284
74     0.037489
78     0.031315
82     0.025433
86     0.017238
90+    0.015044
Name: Number of females, dtype: float64

########### SOCIAL GRADE POPULATION % ##########
   Area code          Area name  Unnamed: 2  Unnamed: 3  \
0  K04000001  ENGLAND AND WALES         NaN         NaN   

  All categories: Approximated social grade  Approxima

In [29]:
# Run the seven bootstrap stages in turn and repeat the whole pass until every stage
# returns df_online unchanged (con reaches the number of stages).
# NOTE: there is no cap on the number of passes, and each stage loops internally until its
# own markers allow it to stop, so this cell can run for a very long time.
# Single-argument print(...) calls are used so the cell runs on Python 2 and Python 3.

# Population tables, in the order expected by conditions() and the bootstrap_* functions.
population_tables = (df_sex_to_agegroup, df_social_grade, df_elections, df_mean_qualifications, df_percent_newspapers, df_station_ratings)

# (banner, stage function); the position in this list is also the index of the stage's
# group inside online_precentages.
bootstrap_stages = [
    ("########## BOOTSTRAPPING MEN ##########", bootstrap_men),
    ("########## BOOTSTRAPPING WOMEN ##########", bootstrap_women),
    ("########## BOOTSTRAPPING SOCIAL GRADE #########", bootstrap_social_grades),
    ("########## BOOTSTRAPPING PAST VOTES ##########", bootstrap_past_votings),
    ("########## BOOTSTRAPPING EDUCATION QUALIFICATIONS ##########", bootstrap_education),
    ("########## BOOTSTRAPPING NEWSPAPERS ##########", bootstrap_newspapers),
    ("########## BOOTSTRAPPING TV NEWS ##########", bootstrap_tvnews),
]

con = 0
while (con < len(bootstrap_stages)):
    # con counts the stages of this pass that left the data unchanged.
    con = 0
    for stage, (banner, run_stage) in enumerate(bootstrap_stages):
        if stage > 0:
            print(" ")
        print(banner)
        # Shares of this stage's group before the stage runs.
        print(online_precentages[stage][0])
        boot = run_stage(df_online, loop_condition, online_precentages, *population_tables)
        if (df_online.equals(boot)):
            con = con + 1
        else:
            # The stage added rows: adopt them and refresh the shares and markers so the
            # next stage starts from the current state.
            df_online = boot
            online_precentages = find_precentages_in_data(df_online)
            loop_condition = conditions(online_precentages, *population_tables)
            print(online_precentages[stage][0])
            print("New number of examples: "+str(df_online.tail(1).iat[0,0]))
            print(df_online["EUREF_Int"].value_counts())
    print(" ")
    print(" ")
    print("##########")
    print("New number of examples: "+str(df_online.tail(1).iat[0,0]))
    print("##########")
    print(" ")
    print(" ")
    

########## BOOTSTRAPPING MEN ##########
[ 0.074  0.048  0.058  0.061  0.069  0.073  0.081  0.073  0.058  0.06
  0.072  0.073  0.084  0.068  0.031  0.013  0.004  0.     0.001]
[ 0.067  0.072  0.071  0.07   0.066  0.065  0.072  0.075  0.072  0.064
  0.057  0.055  0.056  0.046  0.034  0.026  0.019  0.011  0.006]
New number of examples: 2503
1    980
2    974
3    405
4    120
Name: EUREF_Int, dtype: int64
 
########## BOOTSTRAPPING WOMEN ##########
[ 0.059  0.068  0.07   0.077  0.075  0.06   0.062  0.058  0.071  0.048
  0.064  0.064  0.119  0.066  0.019  0.013  0.004  0.     0.003]
[ 0.06   0.067  0.067  0.067  0.063  0.063  0.071  0.073  0.07   0.062
  0.056  0.055  0.064  0.043  0.037  0.031  0.025  0.017  0.015]
New number of examples: 3505
2    1372
1    1368
3     577
4     164
Name: EUREF_Int, dtype: int64
 
########## BOOTSTRAPPING SOCIAL GRADE #########
[ 0.299  0.297  0.196  0.208]
[ 0.228  0.308  0.208  0.257]
New number of examples: 4736
2    1887
1    1847
3     766
4     212


KeyboardInterrupt: 