In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import plotly as py
import pandas as pd
import geopandas as gpd
import optuna
from optuna.integration import LightGBMPruningCallback
from geopy.geocoders import Nominatim
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
import xgboost as xgb
import lightgbm as lgbm
import time

# NOTE(review): presumably the notebook-wide random seed ("random state") —
# confirm, and make sure every stochastic step actually receives it.
r_s = 0

## Load in data

In [None]:
# Load in housing data

# Read the yearly housing variables and normalise the borough naming.
# TODO(Billy): should the pre-2001 data be cut?
house_data = pd.read_csv("housing_in_london/housing_in_london_yearly_variables.csv")
house_data = house_data.rename(columns={"area": "borough_name"})

# Title-case the names, then put the joining words back into lower case
# ("X And Y" -> "X and Y"). The \b anchors restrict the replacement to whole
# words, so a name that merely begins with "Of"/"And" is left untouched.
house_data["borough_name"] = (
    house_data["borough_name"]
    .str.title()
    .replace([r"\bOf\b", r"\bAnd\b"], ["of", "and"], regex=True)
)

# mean_salary and recycling_pct are read in as object columns rather than floats
# (check with `house_data.dtypes`). Coerce each column in one vectorised call;
# entries that cannot be parsed become NaN and are removed by the dropna() below.
house_data["mean_salary"] = pd.to_numeric(house_data["mean_salary"], errors="coerce")
house_data["recycling_pct"] = pd.to_numeric(house_data["recycling_pct"], errors="coerce")
msno.matrix(house_data)

# Drop the entire "life_satisfaction" column first because it is missing lots of
# values; the remaining rows that still contain a NaN are removed individually below.
house_data = house_data.drop(columns=["life_satisfaction"])

# Change dates to just years (the year stays a string: the text before the first "-").
house_data["year"] = house_data["date"].str.split("-").str[0]
house_data = house_data.drop(columns=["date"])

house_data = house_data.dropna()
# Remove the rows labelled "London" and "England", which are not individual boroughs.
house_data = house_data[~house_data["borough_name"].isin(["London", "England"])]

# Do this quickly, optimise later

def add_election(row):
    """Return the election label assigned to the year stored on ``row``.

    ``row.year`` is converted with ``int()`` and bucketed as follows:
    before 2006 -> "2005", 2006-2010 -> "2010", 2011-2015 -> "2015",
    2016-2017 -> "2017", 2018 and later -> "2019".

    Args:
        row: Any object exposing a ``year`` attribute that ``int()`` accepts
            (for example a DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        str: The election label for that year.
    """
    year = int(row.year)
    # (exclusive upper bound for the year, election label), in ascending order.
    cutoffs = (
        (2006, "2005"),
        (2011, "2010"),
        (2016, "2015"),
        (2018, "2017"),
    )
    for upper_bound, label in cutoffs:
        if year < upper_bound:
            return label
    # Everything from 2018 onwards belongs to the last election in the table.
    return "2019"
        
# Tag every housing row with the election label derived from its year.
house_data["election"] = house_data.apply(add_election, axis=1)

# Redraw the missing-value matrix now that the frame has been cleaned.
msno.matrix(house_data)
#Done


In [None]:
# Load in election data

# The encoding must be given explicitly, otherwise the file cannot be read.
election_data = (
    pd.read_csv("1918-2019election_results.csv", encoding="ISO-8859-1")
    # Keep elections after "2001" (string comparison) so the range matches the other data.
    .loc[lambda df: df["election"] > "2001"]
    # Only London.
    .loc[lambda df: df["country/region"] == "London"]
    # Vote shares (percentages) are wanted rather than raw vote counts.
    .loc[:, ["constituency_id", "constituency_name", "con_share", "lib_share", "lab_share", "election"]]
)

# Harmonise the joining words in the constituency names: "&"/"And" -> "and", "Of" -> "of".
election_data["constituency_name"] = (
    election_data["constituency_name"]
    .replace(["&", "And"], "and", regex=True)
    .replace("Of", "of", regex=True)
)

# Load the constituency list used to convert names into codes.
constituencies = pd.read_excel(
    "PCON_DEC_2021_UK_NC.xlsx",
    sheet_name="PCON_DEC_2021_UK_NC",
    names=["constituency_code", "constituency_name"],
)
constituencies["constituency_name"] = constituencies["constituency_name"].replace(",", "", regex=True)
dict_name_code = dict(zip(constituencies["constituency_name"], constituencies["constituency_code"]))

# Remap the ids from the names to account for different or out-of-date codes;
# names that are absent from the list end up as NaN.
election_data["constituency_id"] = election_data["constituency_name"].map(dict_name_code)

msno.matrix(election_data)
# Some NaNs remain; they will be cleaned up properly in future work. For now, drop them.
election_data = election_data.dropna()

#Determine winner by year

def determine_winner(row):
    """Return the party with the largest vote share on ``row``.

    Only the three shares read here are compared: ``con_share``, ``lib_share``
    and ``lab_share``. On a tie, ``np.argmax`` picks the first maximum, so the
    earlier party in the order con, lib, lab wins.

    Args:
        row: Any object exposing ``con_share``, ``lib_share`` and ``lab_share``.

    Returns:
        str: "con", "lib" or "lab".
    """
    parties = ("con", "lib", "lab")
    shares = [getattr(row, f"{party}_share") for party in parties]
    return parties[np.argmax(shares)]
        
# Label every constituency result with the party holding the largest share.
election_data["winner"] = election_data.apply(determine_winner, axis=1)

# Missing-value overview of the finished election frame.
msno.matrix(election_data)

In [None]:
# Lookup table pairing each constituency with a borough.
constituencies_boroughs = pd.read_csv("constituencies_boroughs.csv")
dict_constituencies_boroughs = dict(
    zip(
        constituencies_boroughs["Constituency"],
        constituencies_boroughs["Borough"],
    )
)

# Constituency names that are absent from the lookup receive NaN as their borough.
election_data["borough_name"] = election_data["constituency_name"].map(dict_constituencies_boroughs)

In [None]:
# Load the borough geo/polygon data from the shapefile.
# NOTE: this may not be needed yet; nothing in the cells below references geo_data.
geo_data = gpd.read_file(
    "BoroughFiles/London_Borough_Excluding_MHW.shp"
)

In [None]:
# Outer-join the election and housing frames on borough and election. Rows without
# a partner on the other side come out of the join with NaNs, and the dropna()
# removes them. The surviving rows keep their post-merge index labels (no reset).
combined_data = election_data.merge(
    house_data,
    how="outer",
    on=["borough_name", "election"],
).dropna()

In [None]:
# #No, we do need some kind of dummy data. Think of a smarter way to do this in future

# # Some manual adds of dummy data for the sake of Tableau plotting

# #2019 dummy data
# bexley_2019_dummy = combined_data.iloc[938,:]
# bexley_2019_dummy[["election","winner","year"]] = ["2019","con", "2018"]
# redbridge_2019_dummy = combined_data.iloc[956,:]
# redbridge_2019_dummy[["election","winner","year"]] = ["2019","lab", "2018"]
# kingston_upon_thames_2019_dummy = combined_data.iloc[1017,:]
# kingston_upon_thames_2019_dummy[["election","winner","year"]] = ["2019","lib", "2018"]
# merton_2019_dummy = combined_data.iloc[1027,:]
# merton_2019_dummy[["election","winner","year"]] = ["2019","con", "2018"]

# westminster_2005_dummy = combined_data.iloc[590,:]
# westminster_2005_dummy[["election","winner","year"]] = ["2005","con", "2004"]
# kensington_and_chelsea_2005_dummy = combined_data.iloc[540,:]
# kensington_and_chelsea_2005_dummy[["election","winner","year"]] = ["2005","con", "2004"]
# hammersmith_and_fulham_2005_dummy = combined_data.iloc[496,:]
# hammersmith_and_fulham_2005_dummy[["election","winner","year"]] = ["2005","con", "2004"]

# dummy_df = pd.DataFrame([bexley_2019_dummy, redbridge_2019_dummy, kingston_upon_thames_2019_dummy, merton_2019_dummy, westminster_2005_dummy, kensington_and_chelsea_2005_dummy, hammersmith_and_fulham_2005_dummy])
# combined_data = pd.concat([combined_data,dummy_df])

# #combined_data.reset_index(inplace=True, drop=True) # Do I need this? Yeah maybe, you added a bug somewhere ... noworries, fix tomorrow!

## Potential visualisation section

In [None]:
# TODO: visualise the "final" prepared dataset to support feature selection. Decide whether that belongs here or earlier in the notebook.

## ML section: predict the 2019 election results, then compare them with the real 2019 results to see the effect of Brexit

In [None]:
# Remove the columns that are not used for modelling; what is left still
# includes the "winner" label.
data_for_ml = combined_data.drop(
    columns=[
        "constituency_name",
        "constituency_id",
        "con_share",
        "lib_share",
        "lab_share",
        "election",
        "borough_name",
        "code",
        "borough_flag",
        "year",
    ]
)

# Split on the 2019 election: every row from an earlier election is training
# material. data_for_ml has no "election" column any more, so the boolean masks
# are taken from combined_data, which shares its index.
pre = data_for_ml.loc[combined_data["election"] != "2019"]
post = data_for_ml.loc[combined_data["election"] == "2019"]

### Simple ML

In [None]:
# Features: every column of the pre-2019 frame except the label.
# TODO: unresolved indexing oddity noted here (same one as the skipped reset_index earlier).
X = pre.drop(columns=["winner"])
# Label: the winning party.
y = pre["winner"]

#Step 1. Define an objective function to be maximized.
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated score of an SVC or random forest.

    The trial first picks the classifier family and then its hyperparameters:
    ``svc_c`` for the SVC, or ``rf_n_estimators`` / ``rf_max_depth`` for the
    random forest. The estimator is scored with ``cross_val_score`` on the
    module-level ``X`` and ``y``.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        float: Mean of the five cross-validation scores (to be maximized).
    """
    classifier_name = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])

    # Step 2. Set up values for the hyperparameters:
    if classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        classifier_obj = SVC(C=svc_c, gamma="auto")
    else:
        rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000)
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        # Seed the forest with the module-level r_s so that the same
        # hyperparameters always give the same score; without it, two trials
        # cannot be compared fairly and a re-run does not reproduce the results.
        classifier_obj = RandomForestClassifier(
            max_depth=rf_max_depth,
            n_estimators=rf_n_estimators,
            random_state=r_s,
        )

    # Step 3: Scoring method:
    score = cross_val_score(classifier_obj, X, y, n_jobs=-1, cv=5)
    accuracy = score.mean()
    return accuracy

# Step 4: Running it
# Silence Optuna's per-trial logging; only the best trial is printed at the end.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
# TPE is the sampler Optuna uses by default; it is created explicitly here only so
# that it can be seeded with the module-level r_s, which makes the sequence of
# suggested hyperparameters repeatable across kernel restarts.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=r_s),
)
study.optimize(objective, n_trials=100)
print(study.best_trial)

### More complex ML (add XGBoost later)

In [None]:
# Now for more complex, lightgbm and xgboost

#Start simple, then when you get the hang of it, follow from here

#https://towardsdatascience.com/kagglers-guide-to-lightgbm-hyperparameter-tuning-with-optuna-in-2021-ed048d9838b5  (he does CV within the objective call, which is what I wanted to know! So you CAN do it)
#https://towardsdatascience.com/how-to-beat-the-heck-out-of-xgboost-with-lightgbm-comprehensive-tutorial-5eba5219599

#This is the one you ended up basing yourself on: https://practicaldatascience.co.uk/machine-learning/how-to-tune-a-lightgbmclassifier-model-with-optuna

# Features and label from the pre-2019 rows.
# TODO: unresolved indexing oddity noted here (same one as the skipped reset_index earlier).
X = pre.drop(columns=["winner"])
y = pre["winner"]

# Hold out 30% of the rows for scoring the tuned model; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


def objective(trial):
    """
    Objective function to be maximized: hold-out accuracy of a LightGBM classifier.

    Samples the regularisation, tree-shape and sub-sampling hyperparameters from
    ``trial``, fits an ``LGBMClassifier`` on the module-level ``X_train`` /
    ``y_train`` and scores its predictions on ``X_test`` / ``y_test``.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        float: Accuracy of the fitted model on the hold-out split.
    """
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,  # keeps LightGBM quiet, so fit() needs no verbose flag
        "boosting_type": "gbdt",
        "num_class": 3,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    # The notebook imports LightGBM as `lgbm`; the previous `lgb` alias does not
    # exist and raised a NameError on the first trial.
    gbm = lgbm.LGBMClassifier(**param)
    # No `verbose=` here: that fit() argument was removed in LightGBM 4.0, and
    # logging is already disabled through "verbosity" above.
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_test, verbose=-100)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

# Silence Optuna's per-trial logging; only the best trial is printed at the end.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
# TPE is the sampler Optuna uses by default; it is created explicitly here only so
# that it can be seeded with the module-level r_s, which makes the sequence of
# suggested hyperparameters repeatable across kernel restarts.
study = optuna.create_study(
    study_name="lightgbm",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=r_s),
)
study.optimize(objective, n_trials=100)
print(study.best_trial)

In [None]:
# TODO (Billy): add an XGBoost model in this block for comparison.