In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the two poll exports (online and phone) from the notebook's working directory.
df_online = pd.read_csv("./NatRep_Online_Upload.csv", sep=",")
df_phone = pd.read_csv("./NatRep_Phone_upload.csv", sep=",")

In [3]:
# Reference table read from the 2015_general_elections folder; conditions() reads its third column.
df_elections = pd.read_csv("./2015_general_elections/2015_voting_gen_election.csv", sep=",")

In [4]:
# Reference table with the "% No Qualifications" ... "% Other" columns averaged further down.
df_qualifications = pd.read_csv("./Education_qualifications/UK_Qualifications.csv", sep=",")

In [5]:
# One row per age with "Age", "Number of males" and "Number of females" columns.
df_sex_to_age = pd.read_csv("./Gender_demographics_by_age/UK_M_to_F_ratio_by_age.csv", sep=",")

In [6]:
# Newspaper readership table; the "Title" and "Combined" columns are used further down.
df_newspaper = pd.read_csv("./Newspaper_readability/Newspaper_readerships_uk.csv", sep=",")

In [7]:
# Social grade reference table; conditions() reads the first row from column position 5 onwards.
df_social_grade = pd.read_csv("./Social_grade/Aproximated_social_grade.csv", sep=",")

In [8]:
# TV station ratings table; conditions() reads its second column.
df_station_ratings = pd.read_csv("./Station_ratings/Station_Ratings_UK.csv", sep=",")

In [9]:
#start by deleting the ages that cannot legally vote
# read_csv gave this frame a default 0-based integer index, and the loop this
# replaces removed the first 18 rows one at a time, i.e. row labels 0-17.
# Selecting by label keeps exactly the same rows and makes the cell safe to
# re-run: a second run finds labels 18+ again instead of dropping 18 more rows.
df_sex_to_age = df_sex_to_age.loc[18:]

In [10]:
#to avoid overfitting, create age groups every 4 years
# Sum only the two count columns. 'Age' holds text such as '90+', so it is
# left out explicitly instead of relying on pandas to drop it during sum().
df_sex_to_agegroup = df_sex_to_age[["Number of males", "Number of females"]].groupby(np.arange(len(df_sex_to_age))//4).sum()
# Label every group with the 'Age' value of its first row (every 4th row, by position).
df_sex_to_agegroup.index = df_sex_to_age['Age'].iloc[::4]

In [11]:
# Column-wise totals of the voting-age rows; the male and female totals
# are the denominators used when the group counts are turned into shares.
totals = df_sex_to_age.sum()

In [12]:
#reconstruct the dataframe to only include the percentages
# Divide whole columns at once. Assigning into the rows yielded by iterrows()
# is not guaranteed to write back to the frame, so the result must not depend on it.
# Note: this overwrites the counts in place; re-run the grouping cell before
# re-running this one, otherwise the shares are divided a second time.
df_sex_to_agegroup["Number of males"] = df_sex_to_agegroup["Number of males"].astype(float) / totals["Number of males"]
df_sex_to_agegroup["Number of females"] = df_sex_to_agegroup["Number of females"].astype(float) / totals["Number of females"]
print(df_sex_to_agegroup)

     Number of males  Number of females
Age                                    
18          0.066878           0.060318
22          0.072109           0.066678
26          0.070812           0.067333
30          0.069682           0.067036
34          0.065900           0.062987
38          0.065440           0.062975
42          0.072479           0.070616
46          0.074537           0.072751
50          0.072203           0.070203
54          0.064226           0.062309
58          0.057229           0.056035
62          0.055368           0.055059
66          0.055595           0.055897
70          0.041469           0.043284
74          0.034191           0.037489
78          0.026345           0.031315
82          0.018620           0.025433
86          0.010553           0.017238
90+         0.006366           0.015044


In [13]:
#create a dataframe to hold the means for every qualification level
# The source columns are percentages; *0.01 turns each mean into a fraction.
# Column order is fixed explicitly because conditions() reads the share by
# position (column 0) and must not depend on how a dict happens to be ordered.
df_mean_qualifications = pd.DataFrame({ 'Qualification' : np.array(["No Qualification","Level 1","Level 2","Level 3","Level 4","Other"],dtype=str),
                     '%' : np.array([df_qualifications["% No Qualifications"].mean()*0.01,
                                     df_qualifications["% Level 1"].mean()*0.01,
                                     df_qualifications["% Level 2"].mean()*0.01,
                                     df_qualifications["% Level 3"].mean()*0.01,
                                     df_qualifications["% Level 4"].mean()*0.01,
                                     df_qualifications["% Other"].mean()*0.01],dtype='float')},
                     columns=['%', 'Qualification'])

In [14]:
# Express every title's combined readership as a fraction of all combined readers.
news_total = df_newspaper["Combined"].values.sum()
# Fractions are stored as text with 5 significant digits.
pn = ["{0:.5g}".format(float(readers) / float(news_total))
      for readers in df_newspaper["Combined"].values]
cn = list(df_newspaper["Title"].values)
# Two rows (fractions, titles) transposed into two columns: 0 = fraction, 1 = title.
df_percent_newspapers = pd.DataFrame([pn, cn]).T

In [15]:
#find the respective percentages in df_online and df_phone in order to perform the bootstrap
def _rounded_shares(counts):
    """Return counts / counts.sum() with every share rounded to 3 decimals."""
    total = counts.sum()
    shares = np.zeros(counts.shape, dtype=float)
    for position in range(len(counts)):
        # Rounding goes through the same "{0:.3f}" formatting the comparison code uses.
        shares[position] = float("{0:.3f}".format(counts[position] / total))
    return shares


def _count_codes(data, column, codes):
    """Count the rows of `data` whose `column` equals any of the given codes."""
    return sum(int((data[column] == code).sum()) for code in codes)


def find_precentages_in_data(data):
    """Compute the share of every category in a poll sample.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Age', 'gender', 'Socgrade_matrix_w8',
        'voted2015', 'pastvote_2015', 'profile_education_level',
        'Newspaper_Read' and 'TV_News'.

    Returns
    -------
    numpy.ndarray
        Object array of shape (7, 1). ``result[k][0]`` is a float array of
        shares rounded to 3 decimals, in this order:
        0 male age bands (19), 1 female age bands (19), 2 social grades (4),
        3 2015 party vote (9), 4 education levels (6), 5 newspapers (15),
        6 TV news channels (6). A category with no rows yields NaN shares.
    """
    ######### AGES ##########
    age = data['Age']
    is_male = data['gender'] == 1
    is_female = data['gender'] == 2
    p_ages_male = np.zeros((19,), dtype=float)
    p_ages_female = np.zeros((19,), dtype=float)
    step = 0
    # 18 four-year bands: 18-21, 22-25, ..., 86-89
    for low in range(18, 90, 4):
        in_band = (age >= low) & (age <= low + 3)
        p_ages_male[step] = (in_band & is_male).sum()
        p_ages_female[step] = (in_band & is_female).sum()
        step = step + 1
    #90+ (inclusive: a respondent aged exactly 90 belongs to this last band)
    in_band = age >= 90
    p_ages_male[step] = (in_band & is_male).sum()
    p_ages_female[step] = (in_band & is_female).sum()

    #percentages per age group, computed separately for each gender
    p_ages_male = _rounded_shares(p_ages_male)
    p_ages_female = _rounded_shares(p_ages_female)

    ########## SOCIAL GRADE ##########
    p_sgrade = np.zeros((4,), dtype=float)
    for i in range(0, 4):
        p_sgrade[i] = _count_codes(data, 'Socgrade_matrix_w8', [i + 1])
    p_sgrade = _rounded_shares(p_sgrade)

    ########## 2015 ELECTIONS VOTING ##########
    p_party = np.zeros((9,), dtype=float)
    # only respondents flagged as having voted contribute a party
    voted_data = data.loc[data['voted2015'] == 1, 'pastvote_2015']
    for raw_code in voted_data:
        code = int(raw_code)
        # Codes outside 1-9 are ignored; without the lower bound a code of 0
        # would wrap around to index -1 and be counted as party 9.
        if 1 <= code <= 9:
            p_party[code - 1] = p_party[code - 1] + 1
    p_party = _rounded_shares(p_party)

    ########## EDUCATION QUALIFICATIONS ##########
    # survey codes folded into each qualification level (codes 3 and 4 are not counted)
    education_codes = [
        [1],                     #No qualifications
        [2, 5, 8],               #Level 1 qualifications
        [6, 9, 10],              #Level 2 qualifications
        [11, 7, 12, 13, 14],     #Level 3 qualifications
        [15, 16, 17],            #Level 4 qualifications
        [18],                    #Other qualifications
    ]
    p_levels = np.zeros((6,), dtype=float)
    for i in range(0, 6):
        p_levels[i] = _count_codes(data, 'profile_education_level', education_codes[i])
    p_levels = _rounded_shares(p_levels)

    ########## NEWSPAPERS PREFERRED ##########
    p_newspapers = np.zeros((15,), dtype=float)
    for i in range(0, 14):
        p_newspapers[i] = _count_codes(data, 'Newspaper_Read', [i + 1])
    # the last slot merges codes 16 and 17 (code 15 is not counted)
    p_newspapers[14] = _count_codes(data, 'Newspaper_Read', [16, 17])
    p_newsp = _rounded_shares(p_newspapers)

    ########## NEWS STATION PREFERRED ##########
    # slots map to codes 1, 3, 4, 5, 6, 7 (code 2 is not counted)
    p_newschannel = np.zeros((6,), dtype=float)
    for i in range(0, 6):
        if ((i + 1) > 1):
            p_newschannel[i] = _count_codes(data, 'TV_News', [i + 2])
        else:
            p_newschannel[i] = _count_codes(data, 'TV_News', [i + 1])
    p_newschannel = _rounded_shares(p_newschannel)

    ########## FINAL ARRAY ##########
    parts = [p_ages_male, p_ages_female, p_sgrade, p_party, p_levels, p_newsp, p_newschannel]
    # Built slot by slot so the (7, 1) object layout never depends on how
    # NumPy infers the shape of ragged nested input.
    t1 = np.empty((len(parts), 1), dtype=object)
    for position, part in enumerate(parts):
        t1[position, 0] = part
    ########## ########### ##########
    return t1
    
    

In [16]:
# Shares of every category (age band by gender, social grade, 2015 vote,
# education level, newspaper, TV news) observed in the online poll sample.
online_precentages = find_precentages_in_data(df_online)

In [17]:
#use bootstrapping to create new examples in order for the data percentages to match the population percentages
def bootstrap(data, sample_of_interest, condition):
    """Assemble one synthetic row by sampling every column independently.

    The first column is treated as the id: the new row gets the last id plus
    one. Every other column receives one value drawn at random from that
    column; for the column named `sample_of_interest` the draw is restricted
    to the rows matching the query `sample_of_interest + condition`
    (for example "Age" + " == 30").

    Returns the new row as a list ordered like `data.columns`.
    """
    column_names = list(data.columns.values)
    id_column = column_names[0]
    # the id continues from the last existing row
    new_row = [(data[id_column].iloc[-1]) + 1]
    for column_name in column_names[1:]:
        if column_name == sample_of_interest:
            # under-represented value: draw only from the rows that satisfy the condition
            candidates = data.query(sample_of_interest + condition)[column_name]
        else:
            candidates = data[column_name]
        new_row.append((np.random.choice(candidates, 1)).item(0))
    return new_row
    
        

In [18]:
#use multiple conditions to avoid confusion in the code
def _share_signs(sample_shares, population_share_at):
    """Compare sample shares with population shares, position by position.

    Both sides are rounded to 3 decimals and compared as numbers.
    `population_share_at(i)` must return the population value for position i
    (a number or numeric text). Returns one '<', '=' or '>' per sample share,
    describing the sample relative to the population.
    """
    signs = []
    for i in range(sample_shares.size):
        sample = float("{0:.3f}".format(float(sample_shares[i])))
        population = float("{0:.3f}".format(float(population_share_at(i))))
        if sample < population:
            signs.append('<')
        elif sample == population:
            signs.append('=')
        else:
            signs.append('>')
    return signs


def conditions(online_precentages, df_sex_to_agegroup, df_social_grade, df_elections, df_mean_qualifications, df_percent_newspapers, df_station_ratings):
    """Tell, for every category, whether the sample is under- or over-represented.

    `online_precentages` is the structure returned by
    find_precentages_in_data: entry k holds its share array at [k][0].
    The reference tables are read by position:
    df_sex_to_agegroup columns 0 (men) and 1 (women), the first row of
    df_social_grade from column 5 onwards, column 2 of df_elections,
    column 0 of df_mean_qualifications and df_percent_newspapers, and
    column 1 of df_station_ratings.

    Returns a list of 7 lists (men by age, women by age, social grade,
    2015 vote, qualifications, newspapers, TV news). Each inner list has
    exactly one sign per share: '<' sample below population, '=' equal at
    3 decimals, '>' sample above population.
    """
    ##### PERCENTAGE DIFFERENCE IN AGES OF MEN #####
    condition_ages_male = _share_signs(online_precentages[0][0], lambda i: df_sex_to_agegroup.iat[i, 0])

    ##### PERCENTAGE DIFFERENCE IN AGES OF WOMEN #####
    condition_ages_female = _share_signs(online_precentages[1][0], lambda i: df_sex_to_agegroup.iat[i, 1])

    ##### PERCENTAGE DIFFERENCE IN SOCIAL GRADES #####
    condition_social_grades = _share_signs(online_precentages[2][0], lambda i: df_social_grade.iat[0, (i + 5)])

    ##### PERCENTAGE DIFFERENCE IN 2015 ELECTIONS VOTING #####
    condition_voting_2015 = _share_signs(online_precentages[3][0], lambda i: df_elections.iat[i, 2])

    ##### PERCENTAGE DIFFERENCE IN POPULATION QUALIFICATIONS #####
    condition_qualifications = _share_signs(online_precentages[4][0], lambda i: df_mean_qualifications.iat[i, 0])

    ##### PERCENTAGE DIFFERENCE IN NEWSPAPERS PREFERENCE #####
    condition_newspapers = _share_signs(online_precentages[5][0], lambda i: df_percent_newspapers.iat[i, 0])

    ##### PERCENTAGE DIFFERENCE IN NEWS STATIONS #####
    condition_tvnews = _share_signs(online_precentages[6][0], lambda i: df_station_ratings.iat[i, 1])

    ########## FINAL CONDITION CONSTRUCTION ##########
    condition = [condition_ages_male, condition_ages_female, condition_social_grades, condition_voting_2015, condition_qualifications, condition_newspapers, condition_tvnews]

    return condition

In [19]:
# One list of '<' / '=' / '>' signs per category, comparing the online sample's
# shares with the population reference tables.
loop_condition = conditions(online_precentages, df_sex_to_agegroup, df_social_grade, df_elections, df_mean_qualifications, df_percent_newspapers, df_station_ratings)

In [None]:
def check(condition_elements):
    """Return True when every sign in every group is '='.

    `condition_elements` is a list of lists of comparison signs, such as the
    value returned by conditions(). An empty input yields True.
    """
    # Walk the signs inside each group (not the groups themselves) and stop
    # at the first one that is not '='.
    return all(element == '=' for ar in condition_elements for element in ar)

In [None]:
#combine all conditions and iterate until they resemble the population
# flag = number of male age bands whose sample share differs from the population share
flag = 0
for el in loop_condition[0]:
    if (el!='='):
        flag = flag + 1


j=0   
########## MEN BY AGE ##########
# The loop tolerates a single remaining mismatch (flag > 1).
while (flag > 1):
    # Rows are only ever added to under-represented ('<') bands. If none is
    # left, the pass below would add nothing and this loop would never end.
    if '<' not in loop_condition[0]:
        break
    i = 18
    for c in range (0,len(loop_condition[0])):
        # band c covers ages i .. i+3; keep adding men until it is no longer under-represented
        while (loop_condition[0][c] == '<'):
            age_chosen = random.randint(i,i+3)
            string = " == " + str(age_chosen)
            if (i>=86):
                string = " > 90"
            # NOTE(review): positions 17, 18, 19 and 23 are presumably the
            # voted2015, pastvote_2015, Age and gender columns of df_online;
            # confirm against the CSV header.
            # Resample until the row is a man (23) and, if he is flagged as
            # having voted (17), he also has a party (18). Both checks run on
            # every draw so a redraw for one reason cannot break the other.
            #(Data with values that show that the person did not vote,
            #but have a voted party will be discarded by the percentages function)
            temp = bootstrap(df_online, "Age", string)
            while ((int(temp[23])!=1) or ((temp[17] == 1) and (temp[18] == ' '))):
                temp = bootstrap(df_online, "Age", string)
            #TO AVOID NOT BOOTSTRAPPING THE AGE CHOSEN
            # applied to the row that is finally kept
            temp[19] = age_chosen
            df_additional = pd.DataFrame([temp], columns=list(df_online.columns.values))
            # pd.concat is the supported way to add rows; DataFrame.append was removed in pandas 2.0
            df_online = pd.concat([df_online, df_additional], ignore_index=True)
            # every added row shifts all shares, so the comparison is refreshed each time
            online_precentages = find_precentages_in_data(df_online)
            loop_condition = conditions(online_precentages, df_sex_to_agegroup, df_social_grade, df_elections, df_mean_qualifications, df_percent_newspapers, df_station_ratings)

        i = i+4
        
    flag = 0
    for el in loop_condition[0]:
        if (el!='='):
            flag = flag + 1
            
    
print(online_precentages[0][0])
    