In [75]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score

def categorical_to_numbers(df, column):
    """Encode a categorical column as integer class indices.

    Classes are ordered as in ``df[column].value_counts()`` (most frequent
    first), so the most common label receives code 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified by this function.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, numpy.ndarray)
        A copy of ``df`` in which ``column`` holds the integer codes, and the
        array of class labels such that ``classes[code]`` is the original label.

    Notes
    -----
    Missing values are not counted by ``value_counts`` and therefore have no
    code; they are left as NaN in the encoded column.
    """
    classes = df[column].value_counts().index.values

    # Label -> position lookup, built once instead of searching the class
    # array for every row.
    code_of = {label: position for position, label in enumerate(classes)}

    # `pd.DataFrame(df)` shares the underlying data with `df`; take a real copy
    # so that encoding never alters the caller's frame (and the cell that calls
    # this function can be re-run safely).
    copy = df.copy()
    copy[column] = df[column].map(code_of)

    return copy, classes

# Columns removed from the raw table before modelling.
fields_to_filter = [
    "Age",
    "Region/Country Name",
    "Region/Country Code",
    "the_geom",
    "output_area_code",
    "objectid",
    "cartodb_id",
    "lat",
    "lng",
    "Sex",
    "Local Authority Name",
    "Local Authority Code",
    "Group Code",
    "Supergroup Code",
    "Supergroup Name",
    "Subgroup Name",
    "Subgroup Code",
]
# Column holding the class label that is encoded and predicted further down.
class_column = "Group Name"

"""
    LOADING THE DATA
"""

# Read the merged CSV and discard the columns listed above.
# NOTE(review): the rendered table lists an "Unnamed: 0" column, presumably an
# index saved with the CSV -- confirm whether it should be kept as a feature.
households = pd.read_csv("data/merge_with_households.csv")
households_dropped = households.drop(fields_to_filter, axis="columns")
households_dropped

Unnamed: 0,One person household: Aged 65 and over,One person household: Other,One family only: Married or same-sex civil partnership couple: No children,One family only: Married or same-sex civil partnership couple: Dependent children,One family only: Married or same-sex civil partnership couple: All children non-dependent,One family only: Cohabiting couple: No children,One family only: Cohabiting couple: Dependent children,One family only: Cohabiting couple: All children non-dependent,One family only: Lone parent: Dependent children,One family only: Lone parent: All children non-dependent,...,economically_inactive_other,unemployed_age_16_to_24,unemployed_age_50_to_74,unemployed_never_worked,long_term_unemployed,full_time_,part_time_,unemployed,retired_pc,Group Name
0,0,0,0,14,0,0,2,0,2,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
1,0,0,0,2,1,0,0,0,4,3,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
2,0,1,1,2,0,1,1,0,0,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
3,0,4,2,12,1,1,1,0,1,2,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
4,4,5,17,2,8,2,0,1,0,3,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
5,0,0,0,15,0,0,1,0,1,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
6,0,0,0,3,4,1,0,1,0,1,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
7,0,0,1,3,3,0,1,0,0,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
8,0,0,3,12,1,0,1,1,3,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia
9,12,5,16,2,8,3,0,0,0,2,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,Semi-Detached Suburbia


In [76]:
"""
    DATA PRE-PROCESSING
"""

# Replace the class labels with integer codes; `classes` keeps the
# code -> label lookup. The encoded label column is then renamed to "y".
data, classes = categorical_to_numbers(households_dropped, class_column)
data = data.rename({class_column: "y"}, axis="columns")
data

Unnamed: 0,One person household: Aged 65 and over,One person household: Other,One family only: Married or same-sex civil partnership couple: No children,One family only: Married or same-sex civil partnership couple: Dependent children,One family only: Married or same-sex civil partnership couple: All children non-dependent,One family only: Cohabiting couple: No children,One family only: Cohabiting couple: Dependent children,One family only: Cohabiting couple: All children non-dependent,One family only: Lone parent: Dependent children,One family only: Lone parent: All children non-dependent,...,economically_inactive_other,unemployed_age_16_to_24,unemployed_age_50_to_74,unemployed_never_worked,long_term_unemployed,full_time_,part_time_,unemployed,retired_pc,y
0,0,0,0,14,0,0,2,0,2,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
1,0,0,0,2,1,0,0,0,4,3,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
2,0,1,1,2,0,1,1,0,0,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
3,0,4,2,12,1,1,1,0,1,2,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
4,4,5,17,2,8,2,0,1,0,3,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
5,0,0,0,15,0,0,1,0,1,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
6,0,0,0,3,4,1,0,1,0,1,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
7,0,0,1,3,3,0,1,0,0,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
8,0,0,3,12,1,0,1,1,3,0,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3
9,12,5,16,2,8,3,0,0,0,2,...,4,5,0,1,0,42.986425,11.312217,2.262443,27.149321,3


In [77]:
# Split the encoded frame into labels (integer array) and samples (feature matrix).
y = data["y"].to_numpy().astype("int")
X = data.drop(columns="y").to_numpy()

# Min-max normalisation: every feature column is mapped onto the range [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
X_rescaled = min_max_scaler.fit_transform(X)

# Wrap the rescaled matrix in a frame with the feature names for display.
X_rescaled_df = pd.DataFrame(X_rescaled, columns=data.columns.drop("y"))
X_rescaled_df

Unnamed: 0,One person household: Aged 65 and over,One person household: Other,One family only: Married or same-sex civil partnership couple: No children,One family only: Married or same-sex civil partnership couple: Dependent children,One family only: Married or same-sex civil partnership couple: All children non-dependent,One family only: Cohabiting couple: No children,One family only: Cohabiting couple: Dependent children,One family only: Cohabiting couple: All children non-dependent,One family only: Lone parent: Dependent children,One family only: Lone parent: All children non-dependent,...,economically_inactive_long_term_sick_or_disabled,economically_inactive_other,unemployed_age_16_to_24,unemployed_age_50_to_74,unemployed_never_worked,long_term_unemployed,full_time_,part_time_,unemployed,retired_pc
0,0.000000,0.000,0.000000,0.184211,0.000000,0.00000,0.068966,0.0,0.044444,0.000000,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
1,0.000000,0.000,0.000000,0.026316,0.047619,0.00000,0.000000,0.0,0.088889,0.230769,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
2,0.000000,0.025,0.022727,0.026316,0.000000,0.03125,0.034483,0.0,0.000000,0.000000,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
3,0.000000,0.100,0.045455,0.157895,0.047619,0.03125,0.034483,0.0,0.022222,0.153846,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
4,0.035088,0.125,0.386364,0.026316,0.380952,0.06250,0.000000,0.2,0.000000,0.230769,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
5,0.000000,0.000,0.000000,0.197368,0.000000,0.00000,0.034483,0.0,0.022222,0.000000,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
6,0.000000,0.000,0.000000,0.039474,0.190476,0.03125,0.000000,0.2,0.000000,0.076923,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
7,0.000000,0.000,0.022727,0.039474,0.142857,0.00000,0.034483,0.0,0.000000,0.000000,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
8,0.000000,0.000,0.068182,0.157895,0.047619,0.00000,0.034483,0.2,0.066667,0.000000,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957
9,0.105263,0.125,0.363636,0.026316,0.380952,0.09375,0.000000,0.0,0.000000,0.153846,...,0.06,0.014706,0.217391,0.000000,0.076923,0.00,0.525438,0.454678,0.056837,0.339957


In [78]:
"""
    DATA ANALYSIS
    Linear Support Vector Machine Multi-classification
"""

# Splitting in train and test: 20% of the rescaled samples are held out,
# with a fixed seed so the split is reproducible.
# NOTE(review): X_rescaled comes from a MinMaxScaler fitted on the full
# dataset, so the test rows influenced the scaling; consider fitting the
# scaler on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X_rescaled, y, test_size=0.2, random_state=42)
print("Number of examples in the training set: " + str(len(X_train)))
print("Number of examples in the test set: " + str(len(X_test)))

# Training the classifier (linear SVM, regularisation parameter C).
C_param = 50
classifier = LinearSVC(random_state=2, C=C_param)
classifier.fit(X_train, y_train)

# Evaluating performances: micro-averaged precision and recall, combined
# into the F-measure (harmonic mean of the two).
y_score = classifier.predict(X_test)
precision = precision_score(y_test, y_score, average="micro")
recall = recall_score(y_test, y_score, average="micro")
# Guard the harmonic mean: when both scores are 0 the denominator is 0.
if precision + recall > 0:
    fmeasure = (2*precision*recall)/(precision+recall)
else:
    fmeasure = 0.0

print("F-Measure " + str(fmeasure))

Number of examples in the training set: 6304
Number of examples in the test set: 1576
F-Measure 0.5241116751269036
