#### First look at the data

In [13]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

def try_to_numerify(x):
    try:
        return int(x)
    except:
        try:
            return float(x)
        except:
            return x

def loader(filename,separator='\t',nan='-1'):
    with open(filename,'r') as f:
        rows = [x for x in f.read().split('\n') if x]
    rows = [x.split(separator) for x in rows]
    colnames = rows[0]
    data = rows[1:]
    data = [[try_to_numerify(x) for x in row] for row in data]
    return pd.DataFrame(data,columns=colnames)

df = loader("src/static/qviz.tsv")
df.describe()

Unnamed: 0,implementation,R1,R2,R3,R4,R5,R6,R7,R8,I1,...,C4,C5,C6,C7,C8,accuracy,elapse,fromsearch,age,gender
count,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,...,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0
mean,1.759232,2.419537,2.15878,1.76917,2.256014,1.66437,2.321062,1.897798,1.95799,3.431959,...,2.292942,2.478035,2.636364,2.064032,2.132016,242582.0,371.139808,0.424732,-0.396386,6.891587
std,0.427574,1.296048,1.242859,1.174141,1.328759,1.057555,1.322437,1.139695,1.177704,1.313977,...,1.246006,1.367004,1.289875,1.174839,1.194761,22821050.0,5994.367811,0.49433,1.103554,15.468571
min,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0
25%,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,...,1.0,1.0,1.0,1.0,1.0,23.0,84.0,0.0,-1.0,-1.0
50%,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,4.0,...,2.0,2.0,3.0,2.0,2.0,85.0,160.0,0.0,-1.0,-1.0
75%,2.0,3.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,4.0,...,3.0,4.0,4.0,3.0,3.0,95.0,230.0,1.0,-1.0,-1.0
max,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,2147484000.0,509296.0,1.0,3.0,100.0


As we can see, there is an issue with accuracy. Implementation column seems needless. Gender and age seem almost empty. 

Let's create data sets:
* one containing the data with age and gender
* one with aggregated RIASEC measures and countries

In [28]:
dfag = df.query("age != -1 and gender != -1")
riasec = df.drop(["age","gender","fromsearch","accuracy","elapse","implementation"],1)
riasec['Rclass'] = (riasec.R1 + riasec.R2 + riasec.R3 + riasec.R4 + riasec.R5 + riasec.R6 + riasec.R7 + riasec.R8)
riasec['Iclass'] = (riasec.I1 + riasec.I2 + riasec.I3 + riasec.I4 + riasec.I5 + riasec.I6 + riasec.I7 + riasec.I8)
riasec['Aclass'] = (riasec.A1 + riasec.A2 + riasec.A3 + riasec.A4 + riasec.A5 + riasec.A6 + riasec.A7 + riasec.A8)
riasec['Sclass'] = (riasec.S1 + riasec.S2 + riasec.S3 + riasec.S4 + riasec.S5 + riasec.S6 + riasec.S7 + riasec.S8)
riasec['Eclass'] = (riasec.E1 + riasec.E2 + riasec.E3 + riasec.E4 + riasec.E5 + riasec.E6 + riasec.E7 + riasec.E8)
riasec['Cclass'] = (riasec.C1 + riasec.C2 + riasec.C3 + riasec.C4 + riasec.C5 + riasec.C6 + riasec.C7 + riasec.C8)
riasec = riasec.select(lambda x: x in ['Rclass','Iclass','Aclass','Sclass','Eclass','Cclass','country'],1)
riasec.describe()

Unnamed: 0,Rclass,Iclass,Aclass,Sclass,Eclass,Cclass
count,8855.0,8855.0,8855.0,8855.0,8855.0,8855.0
mean,16.44472,23.977301,23.499718,24.581818,19.301186,18.270807
std,7.086629,8.282103,8.092654,7.513248,6.857728,7.616697
min,-8.0,-8.0,-8.0,-8.0,-8.0,-8.0
25%,10.0,18.0,18.0,19.0,14.0,12.0
50%,15.0,24.0,24.0,25.0,19.0,17.0
75%,21.0,30.0,30.0,30.0,24.0,24.0
max,40.0,40.0,40.0,40.0,40.0,40.0


In [32]:
riasec.groupby(by='country').describe().select(lambda x:x[1]=='mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,Aclass,Cclass,Eclass,Iclass,Rclass,Sclass
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A1,mean,33.500000,14.000000,22.000000,25.000000,21.000000,26.500000
A2,mean,35.000000,17.500000,18.000000,19.500000,18.500000,34.500000
AE,mean,24.785714,15.714286,19.357143,25.928571,18.071429,27.357143
AF,mean,24.000000,9.000000,25.000000,27.000000,10.000000,23.000000
AI,mean,12.000000,22.000000,19.000000,12.000000,8.000000,29.000000
AL,mean,22.833333,20.166667,20.333333,27.500000,13.833333,19.666667
AP,mean,26.666667,25.000000,23.833333,23.166667,21.333333,26.833333
AR,mean,26.000000,15.300000,19.800000,21.100000,15.100000,25.000000
AS,mean,23.000000,22.000000,22.000000,14.000000,21.000000,33.000000
AT,mean,24.000000,19.428571,20.142857,23.214286,16.214286,23.500000


In [47]:
euro = ['AL','AD','AM','AT','BY','BE','BA','BG','CH','CY','CZ','DE','DK','EE','ES','FO','FI','FR','GB','GE','GI','GR','HU','HR','IE','IS','IT','LT','LU','LV','MC','MK','MT','NO','NL','PO','PT','RO','RU','SE','SI','SK','SM','TR','UA','VA']
dfeu = df.query("country in @euro")
dfeu.query('age > 1')['age'].mean()
dfeu.groupby(by='gender').count()

Unnamed: 0_level_0,implementation,R1,R2,R3,R4,R5,R6,R7,R8,I1,...,C4,C5,C6,C7,C8,accuracy,elapse,country,fromsearch,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,896,896,896,896,896,896,896,896,896,896,...,896,896,896,896,896,896,896,896,896,896
14,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
15,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
16,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
17,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
18,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
19,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
20,14,14,14,14,14,14,14,14,14,14,...,14,14,14,14,14,14,14,14,14,14
21,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
22,22,22,22,22,22,22,22,22,22,22,...,22,22,22,22,22,22,22,22,22,22


In [55]:
riasec.corr()

Unnamed: 0,Rclass,Iclass,Aclass,Sclass,Eclass,Cclass
Rclass,1.0,0.344531,0.131872,0.081295,0.258789,0.458567
Iclass,0.344531,1.0,0.348474,0.191063,0.066871,0.118784
Aclass,0.131872,0.348474,1.0,0.337216,0.26284,-0.044107
Sclass,0.081295,0.191063,0.337216,1.0,0.435966,0.166866
Eclass,0.258789,0.066871,0.26284,0.435966,1.0,0.457094
Cclass,0.458567,0.118784,-0.044107,0.166866,0.457094,1.0


In [67]:
from sklearn.decomposition import PCA
pca = PCA(30).fit(df.drop('country',1).as_matrix())
pca.get_params()

{'copy': True, 'n_components': 30, 'whiten': False}