In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the two NatRep CSV uploads (online and phone) as comma-separated files.
df_online = pd.read_csv("./NatRep_Online_Upload.csv", sep=",")
df_phone = pd.read_csv("./NatRep_Phone_upload.csv", sep=",")

In [3]:
# Load the 2015 general election voting CSV (comma-separated).
df_elections = pd.read_csv(
    "./2015_general_elections/2015_voting_gen_election.csv",
    sep=",",
)

In [4]:
# Preview the first five rows of the election results.
df_elections.head(n=5)

Unnamed: 0,Party,Votes,Vote Share
0,Conservative,11334576,36.9
1,Labour,9347304,30.4
2,UKIP,3881099,12.6
3,Liberal Democrat,2415862,7.9
4,Scotish National Party,1254436,4.7


In [5]:
# Load the UK qualifications CSV (comma-separated).
df_qualifications = pd.read_csv(
    "./Education_qualifications/UK_Qualifications.csv",
    sep=",",
)

In [6]:
# Load the male/female counts by age CSV (comma-separated).
df_sex_to_age = pd.read_csv(
    "./Gender_demographics_by_age/UK_M_to_F_ratio_by_age.csv",
    sep=",",
)

In [7]:
# Load the UK newspaper readership CSV (comma-separated).
df_newspaper = pd.read_csv(
    "./Newspaper_readability/Newspaper_readerships_uk.csv",
    sep=",",
)

In [8]:
# Load the approximated social grade CSV (comma-separated).
df_social_grade = pd.read_csv(
    "./Social_grade/Aproximated_social_grade.csv",
    sep=",",
)

In [9]:
# Preview the first five rows of the social grade table.
df_social_grade.head(n=5)

Unnamed: 0,Area code,Area name,Unnamed: 2,Unnamed: 3,All categories: Approximated social grade,Approximated social grade AB,Approximated social grade C1,Approximated social grade C2,Approximated social grade DE
0,K04000001,ENGLAND AND WALES,,,17266580,22.7,30.8,20.8,25.7
1,E92000001,ENGLAND,,,16339853,23.0,30.9,20.6,25.5
2,E12000001,NORTH EAST,,,824152,16.6,29.6,21.7,32.0
3,E12000002,NORTH WEST,,,2222061,19.3,30.3,20.7,29.7
4,E12000003,YORKSHIRE AND THE HUMBER,,,1640564,18.8,29.3,22.3,29.6


In [10]:
# Load the UK station ratings CSV (comma-separated).
df_station_ratings = pd.read_csv(
    "./Station_ratings/Station_Ratings_UK.csv",
    sep=",",
)

In [11]:
# Preview the first five rows of the station ratings.
df_station_ratings.head(n=5)

Unnamed: 0,Station,Ratings
0,BBC,31.8
1,ITV,22.6
2,Chanel 4,11.3
3,Five,5.9
4,Sky,6.8


In [12]:
# Start by removing the ages that cannot legally vote.
#
# The frame still carries the default 0..n-1 row labels assigned by read_csv,
# so keeping labels >= VOTING_AGE removes exactly the first 18 rows, as the
# original row-by-row drop did.
# NOTE(review): like the original, this assumes row i holds age i -- confirm
# against the CSV.
#
# Filtering on the row label (rather than dropping "the first 18 rows") makes
# the cell idempotent: re-running it does not remove a further 18 ages.
VOTING_AGE = 18
df_sex_to_age = df_sex_to_age.loc[df_sex_to_age.index >= VOTING_AGE]

In [13]:
# To avoid overfitting, pool consecutive single-year rows into 4-year age groups.
AGE_GROUP_SPAN = 4

# Rows 0-3 get group id 0, rows 4-7 get group id 1, and so on (by position).
age_group_ids = np.arange(len(df_sex_to_age)) // AGE_GROUP_SPAN

# numeric_only=True restricts the sum to numeric columns explicitly instead of
# relying on older pandas silently dropping non-numeric ones.
df_sex_to_agegroup = df_sex_to_age.groupby(age_group_ids).sum(numeric_only=True)

# Label every group with the 'Age' value of its first row. The slice is
# positional (iloc) so it always starts at the first remaining row; the
# previous label slice `.loc[1::4]` only lined up because label 1 was absent
# from a sorted index.
df_sex_to_agegroup.index = df_sex_to_age['Age'].iloc[::AGE_GROUP_SPAN]
df_sex_to_agegroup

Unnamed: 0_level_0,Number of males,Number of females
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
18,1657490,1575830
22,1787140,1741962
26,1755006,1759090
30,1726987,1751330
34,1633251,1645540
38,1621862,1645240
42,1796311,1844864
46,1847323,1900632
50,1789466,1834055
54,1591763,1627838


In [14]:
# Create a dataframe holding the mean of every qualification-level column,
# averaged over all rows of df_qualifications.
#
# Each pair is (label shown in the result, source column in df_qualifications).
qualification_columns = [
    ("No Qualification", "% No Qualifications"),
    ("Level 1", "% Level 1"),
    ("Level 2", "% Level 2"),
    ("Level 3", "% Level 3"),
    ("Level 4", "% Level 4"),
    ("Other", "% Other"),
]

# Plain Python lists replace np.array(..., dtype='string'): 'string' is not a
# valid NumPy dtype name on Python 3 and raises TypeError there.
df_mean_qualifications = pd.DataFrame({
    'Qualification': [label for label, _ in qualification_columns],
    '%': [float(df_qualifications[column].mean())
          for _, column in qualification_columns],
})
df_mean_qualifications

Unnamed: 0,%,Qualification
0,14.502011,No Qualification
1,15.601149,Level 1
2,17.831609,Level 2
3,14.468103,Level 3
4,29.292529,Level 4
5,4.972701,Other


In [15]:
# Find the total number of readers over the first five rows of df_newspaper,
# then use it to compute each newspaper's share of that total.
N_NEWSPAPERS = 5   # number of leading rows of df_newspaper to include
NAME_COL = 0       # positional column holding the newspaper name
READERS_COL = 3    # positional column holding the reader count

newspaper_names = df_newspaper.iloc[:N_NEWSPAPERS, NAME_COL]
newspaper_readers = df_newspaper.iloc[:N_NEWSPAPERS, READERS_COL]
news_total = newspaper_readers.sum()

# NOTE: the '%' column holds fractions of the total (values in 0..1); they are
# not multiplied by 100.
# Plain lists replace np.array(..., dtype='string'): 'string' is not a valid
# NumPy dtype name on Python 3 and raises TypeError there. The explicit float
# conversion keeps true division regardless of the Python version.
df_percent_newspapers = pd.DataFrame({
    'Newspapers': newspaper_names.astype(str).tolist(),
    '%': (newspaper_readers.astype(float) / float(news_total)).tolist(),
})
df_percent_newspapers

Unnamed: 0,%,Newspapers
0,0.290212,The Sun
1,0.25778,Daily Mail
2,0.168989,Metro
3,0.169807,Daily Mirror
4,0.113212,The Guardian
