In [None]:
import fasttext
import numpy as np
import pandas as pd
import sklearn.metrics
from numpy import absolute
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split

# Root folder of the thesis material and the sub-folder that holds the trained models.
path = "D:/Tilburg University/2021-2022/Thesis/Coca/"
model_path = f"{path}models/"

# Load the fastText binary; `model[word]` is used further down to look up vectors.
# The feature arrays built later are 300 columns wide (cf. "dim300" in the file name).
model = fasttext.load_model(f"{model_path}sentenced_dim300_ws2_minn2_maxn5.bin")



In [None]:
### Read CSV File and Delete Unimportant Columns
names_ratings = read_csv(
    "D:/Tilburg University/2021-2022/Thesis/Data analyses/Giovanni Email Data/avgRatings_annotated.csv"
)

# Number of names that have a (non-missing) mean rating on each dimension.
for rating_column in ('rating.mean_age', 'rating.mean_gender', 'rating.mean_valence'):
    print(names_ratings[rating_column].notna().sum())

# One frame per dimension: only the rows rated on that dimension, and only the
# name, the mean rating, the categorical label and the name type.
rated_on_age = names_ratings['rating.mean_age'].notna()
df_age = names_ratings.loc[rated_on_age, ['name', 'rating.mean_age', 'age', 'name_type']]
print(df_age.head(), len(df_age))

rated_on_gender = names_ratings['rating.mean_gender'].notna()
df_gender = names_ratings.loc[rated_on_gender, ['name', 'rating.mean_gender', 'gender', 'name_type']]
print(df_gender.head(), len(df_gender))

rated_on_valence = names_ratings['rating.mean_valence'].notna()
df_polarity = names_ratings.loc[rated_on_valence, ['name', 'rating.mean_valence', 'polarity', 'name_type']]
print(df_polarity.head(), len(df_polarity))

119
179
63
       name  rating.mean_age    age name_type
0  Adelaide        -0.617647    old      real
2  Alasdair        18.709677  young      real
3   Alastor        13.812500    old    madeup
4    Alecto         3.593750    old    madeup
5     Alice       -13.969697  young      real 119
       name  rating.mean_gender  gender name_type
0  Adelaide           45.727273  female      real
1   Adelina           47.771429  female      real
2  Alasdair          -35.657143    male      real
3   Alastor          -38.833333    male    madeup
4    Alecto          -35.722222  female    madeup 179
        name  rating.mean_valence polarity name_type
1    Adelina            31.621622      bad      real
7    Amabala             5.935484     good    madeup
8      Apple            32.444444     good   talking
11  Arcturus           -11.166667     good    madeup
13   Arobynn             7.645161      bad    madeup 63


In [None]:
### Get repeatable train/test split with a seed

def _stratified_split(frame, label_col):
    """Split `frame` 80/20 with a fixed seed, stratified jointly on `label_col` and 'name_type'.

    Returns the (train, test) pair produced by `train_test_split`.
    """
    return train_test_split(frame, test_size=0.2, random_state=17042020,
                            stratify=frame[[label_col, 'name_type']])


def _embed_names(frame, dim=300):
    """Return a (len(frame), dim) array holding `model[name.lower()]` for every name in `frame`.

    Rows follow the row order of `frame`. `dim` must match the width of the
    vectors returned by `model`.
    """
    vectors = np.zeros((len(frame), dim))
    # Select the name column by label. The previous `row[1][0]` relied on
    # positional fall-back indexing of a label-indexed Series, which pandas
    # has deprecated.
    for pos, name in enumerate(frame['name']):
        vectors[pos] = model[name.lower()]
    return vectors


## First let's do age
age_train, age_test = _stratified_split(df_age, 'age')
x_age_train = _embed_names(age_train)
x_age_test = _embed_names(age_test)
y_age_train = age_train['rating.mean_age']
y_age_test = age_test['rating.mean_age']

####################################################################################################################

## Now let's do gender
gen_train, gen_test = _stratified_split(df_gender, 'gender')
x_gen_train = _embed_names(gen_train)
x_gen_test = _embed_names(gen_test)
y_gen_train = gen_train['rating.mean_gender']
y_gen_test = gen_test['rating.mean_gender']

####################################################################################################################

## And lastly, polarity
pol_train, pol_test = _stratified_split(df_polarity, 'polarity')
x_pol_train = _embed_names(pol_train)
x_pol_test = _embed_names(pol_test)
y_pol_train = pol_train['rating.mean_valence']
y_pol_test = pol_test['rating.mean_valence']

print(len(x_age_train), len(x_age_test), len(x_gen_train), len(x_gen_test), len(x_pol_train), len(x_pol_test))

# Release the embedding model before the regression cells. Any later cell that
# needs `model` has to load it again first.
del model

95 24 143 36 50 13


In [None]:
### Cross-validate ElasticNet over a grid of hyperparameters (l1_ratio, alpha) for each rating dimension, collecting the per-name-type errors in dicts

In [None]:
# Elastic-net regression of the mean gender rating on the full name vectors.
# alpha and l1_ratio are chosen by cross-validation on the training split,
# with one fold per training row (cv equals the number of training rows).
regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                    n_alphas = 250,
                    max_iter = 10000,
                    cv = len(x_gen_train),
                    selection = 'random',
                    random_state=17042020,)

regr.fit(x_gen_train, y_gen_train)

print("Alpha:", regr.alpha_)
print("L1 Ratio:", regr.l1_ratio_)
print("# Iters:", regr.n_iter_)
print("Intercept:", regr.intercept_)
#print("Coefs:", regr.coef_)
print("\n")

# `mse_path_` stores the held-out error for every (l1_ratio, alpha, fold), so its
# grand mean is neither the training error nor the error of the chosen model.
# Report the real training MSE, and separately the cross-validated MSE of the
# selected (l1_ratio, alpha) pair, i.e. the smallest fold-averaged error.
print("MSE Train:", sklearn.metrics.mean_squared_error(y_gen_train, regr.predict(x_gen_train)))
print("MSE CV:", np.min(np.mean(regr.mse_path_, axis=-1)))

# Predict the test split once and reuse the predictions for every metric.
pred_gen_test = regr.predict(x_gen_test)
print("MAE Test:", sklearn.metrics.mean_absolute_error(y_gen_test, pred_gen_test))
print("MSE Test:", sklearn.metrics.mean_squared_error(y_gen_test, pred_gen_test))

print("R2:", regr.score(x_gen_test, y_gen_test))

print('\n')

# Mean absolute error per name type: summed absolute errors and item counts,
# keyed by name type in order of first appearance in the test split.
abs_err_gen = np.abs(np.asarray(y_gen_test) - pred_gen_test)
type_dict_gen = {}
type_counter_gen = {}
for name_type, err in zip(gen_test['name_type'], abs_err_gen):
    type_dict_gen[name_type] = type_dict_gen.get(name_type, 0.0) + err
    type_counter_gen[name_type] = type_counter_gen.get(name_type, 0) + 1

for name_type, err_sum in type_dict_gen.items():
    print("MAE", name_type, err_sum / type_counter_gen[name_type])

print('\n')

# Baseline: give every test item the mean training vector as its features.
mean_vec_gen = np.mean(x_gen_train, axis = 0)
mean_vec_array_gen = np.tile(mean_vec_gen, (len(x_gen_test), 1))
baseline_pred_gen = regr.predict(mean_vec_array_gen)

print("Mean-vector MAE Test:", sklearn.metrics.mean_absolute_error(y_gen_test, baseline_pred_gen))
print("Mean-vector MSE Test:", sklearn.metrics.mean_squared_error(y_gen_test, baseline_pred_gen))

print("Mean-vector R2:", regr.score(mean_vec_array_gen, y_gen_test))

Alpha: 0.07464309442408282
L1 Ratio: 0.9
# Iters: 48
Intercept: -2.2760815495287714


MSE Train: 888.8802022779203
MAE Test: 19.753931671206317
MSE Test: 548.8958414915725
R2: 0.6016848992232694


MAE madeup [24.51393291]
MAE real [15.76683143]
MAE talking [18.98103068]


Mean-vector MAE Test: 35.20711844649357
Mean-vector MSE Test: 1411.4821234797062
Mean-vector R2: -0.02426471793010454


In [None]:
# Elastic-net regression of the mean age rating on the full name vectors.
# alpha and l1_ratio are chosen by cross-validation on the training split,
# with one fold per training row (cv equals the number of training rows).
regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                    n_alphas = 250,
                    max_iter = 10000,
                    cv = len(x_age_train),
                    selection = 'random',
                    random_state=17042020,)

regr.fit(x_age_train, y_age_train)

print("Alpha:", regr.alpha_)
print("L1 Ratio:", regr.l1_ratio_)
print("# Iters:", regr.n_iter_)
print("Intercept:", regr.intercept_)
#print("Coefs:", regr.coef_)
print("\n")

# `mse_path_` stores the held-out error for every (l1_ratio, alpha, fold), so its
# grand mean is neither the training error nor the error of the chosen model.
# Report the real training MSE, and separately the cross-validated MSE of the
# selected (l1_ratio, alpha) pair, i.e. the smallest fold-averaged error.
print("MSE Train:", sklearn.metrics.mean_squared_error(y_age_train, regr.predict(x_age_train)))
print("MSE CV:", np.min(np.mean(regr.mse_path_, axis=-1)))

# Predict the test split once and reuse the predictions for every metric.
pred_age_test = regr.predict(x_age_test)
print("MAE Test:", sklearn.metrics.mean_absolute_error(y_age_test, pred_age_test))
print("MSE Test:", sklearn.metrics.mean_squared_error(y_age_test, pred_age_test))

print("R2:", regr.score(x_age_test, y_age_test))

print('\n')

# Mean absolute error per name type: summed absolute errors and item counts,
# keyed by name type in order of first appearance in the test split.
abs_err_age = np.abs(np.asarray(y_age_test) - pred_age_test)
type_dict_age = {}
type_counter_age = {}
for name_type, err in zip(age_test['name_type'], abs_err_age):
    type_dict_age[name_type] = type_dict_age.get(name_type, 0.0) + err
    type_counter_age[name_type] = type_counter_age.get(name_type, 0) + 1

for name_type, err_sum in type_dict_age.items():
    print("MAE", name_type, err_sum / type_counter_age[name_type])

print('\n')

# Baseline: give every test item the mean training vector as its features.
mean_vec_age = np.mean(x_age_train, axis = 0)
mean_vec_array_age = np.tile(mean_vec_age, (len(x_age_test), 1))
baseline_pred_age = regr.predict(mean_vec_array_age)

print("Mean-vector MAE Test:", sklearn.metrics.mean_absolute_error(y_age_test, baseline_pred_age))
print("Mean-vector MSE Test:", sklearn.metrics.mean_squared_error(y_age_test, baseline_pred_age))

print("Mean-vector R2:", regr.score(mean_vec_array_age, y_age_test))

Alpha: 0.22747173388941747
L1 Ratio: 1.0
# Iters: 71
Intercept: 14.058597468383915


MSE Train: 412.7853215036653
MAE Test: 17.398978553042074
MSE Test: 403.139256131658
R2: -0.4256642361660299


MAE real [16.99593187]
MAE madeup [20.83337036]
MAE talking [14.36763342]


Mean-vector MAE Test: 17.188119883066264
Mean-vector MSE Test: 377.67235203479964
Mean-vector R2: -0.3356029141177901


In [None]:
# Elastic-net regression of the mean valence rating on the full name vectors.
# alpha and l1_ratio are chosen by cross-validation on the training split.
# NOTE(review): cv is one less than the number of training rows here, whereas the
# age and gender cells use one fold per row -- confirm this is intended.
regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                    n_alphas = 250,
                    max_iter = 10000,
                    cv = len(x_pol_train)-1,
                    selection = 'random',
                    random_state=17042020,)

regr.fit(x_pol_train, y_pol_train)

print("Alpha:", regr.alpha_)
print("L1 Ratio:", regr.l1_ratio_)
print("# Iters:", regr.n_iter_)
print("Intercept:", regr.intercept_)
#print("Coefs:", regr.coef_)
print("\n")

# `mse_path_` stores the held-out error for every (l1_ratio, alpha, fold), so its
# grand mean is neither the training error nor the error of the chosen model.
# Report the real training MSE, and separately the cross-validated MSE of the
# selected (l1_ratio, alpha) pair, i.e. the smallest fold-averaged error.
print("MSE Train:", sklearn.metrics.mean_squared_error(y_pol_train, regr.predict(x_pol_train)))
print("MSE CV:", np.min(np.mean(regr.mse_path_, axis=-1)))

# Predict the test split once and reuse the predictions for every metric.
pred_pol_test = regr.predict(x_pol_test)
print("MAE Test:", sklearn.metrics.mean_absolute_error(y_pol_test, pred_pol_test))
print("MSE Test:", sklearn.metrics.mean_squared_error(y_pol_test, pred_pol_test))

print("R2:", regr.score(x_pol_test, y_pol_test))

print("\n")

# Mean absolute error per name type: summed absolute errors and item counts,
# keyed by name type in order of first appearance in the test split.
abs_err_pol = np.abs(np.asarray(y_pol_test) - pred_pol_test)
type_dict_pol = {}
type_counter_pol = {}
for name_type, err in zip(pol_test['name_type'], abs_err_pol):
    type_dict_pol[name_type] = type_dict_pol.get(name_type, 0.0) + err
    type_counter_pol[name_type] = type_counter_pol.get(name_type, 0) + 1

for name_type, err_sum in type_dict_pol.items():
    print("MAE", name_type, err_sum / type_counter_pol[name_type])

print('\n')

# Baseline: give every test item the mean training vector as its features.
mean_vec_pol = np.mean(x_pol_train, axis = 0)
mean_vec_array_pol = np.tile(mean_vec_pol, (len(x_pol_test), 1))
baseline_pred_pol = regr.predict(mean_vec_array_pol)

print("Mean-vector MAE Test:", sklearn.metrics.mean_absolute_error(y_pol_test, baseline_pred_pol))
print("Mean-vector MSE Test:", sklearn.metrics.mean_squared_error(y_pol_test, baseline_pred_pol))

print("Mean-vector R2:", regr.score(mean_vec_array_pol, y_pol_test))

Alpha: 0.07870886844842703
L1 Ratio: 0.7
# Iters: 32
Intercept: -5.456713127098343


MSE Train: 329.220553447748
MAE Test: 17.37965861479007
MSE Test: 495.4071445569156
R2: 0.005814772620807318


MAE real [13.75827945]
MAE madeup [19.33767269]
MAE talking [18.55352018]


Mean-vector MAE Test: 19.8674717396795
Mean-vector MSE Test: 516.5873977208159
Mean-vector R2: -0.036689852189426775


### Now, we're going to do it without surface form

In [None]:
### Build "formless" features: the average of a name's sub-word vectors, leaving out
### the vector of the whole word (its surface form). Reuses the existing train/test splits.

# `model` was deleted at the end of the first feature-extraction cell, so load it
# again here; without this the cell raises NameError on Restart & Run All.
model = fasttext.load_model(model_path + "sentenced_dim300_ws2_minn2_maxn5.bin")


def _formless_vector(name, dim=300):
    """Return the mean input vector of the character n-grams of `name`, excluding the word itself.

    Returns a zero vector of length `dim` if no n-gram is left to average.
    """
    subwords, subword_ids = model.get_subwords(name)
    # fastText lists the word itself first only when it is in the vocabulary.
    # For out-of-vocabulary names the first entry is already an n-gram, so drop
    # the first entry only when it really is the surface form.
    if len(subwords) > 0 and subwords[0] == name:
        subword_ids = subword_ids[1:]
    if len(subword_ids) == 0:
        return np.zeros(dim)
    # Look the n-grams up by id in the input matrix. `model[ngram]` would treat the
    # n-gram string as a word of its own and return that word's vector instead.
    return np.mean([model.get_input_vector(int(idx)) for idx in subword_ids], axis=0)


def _embed_names_formless(frame, dim=300):
    """Return a (len(frame), dim) array with the formless vector of every lower-cased name in `frame`."""
    vectors = np.zeros((len(frame), dim))
    # Select the name column by label instead of the deprecated positional `row[1][0]`.
    for pos, name in enumerate(frame['name']):
        vectors[pos] = _formless_vector(name.lower(), dim)
    return vectors


x_age_train_formless = _embed_names_formless(age_train)
x_age_test_formless = _embed_names_formless(age_test)

####################################################################################################################

x_gen_train_formless = _embed_names_formless(gen_train)
x_gen_test_formless = _embed_names_formless(gen_test)

####################################################################################################################

x_pol_train_formless = _embed_names_formless(pol_train)
x_pol_test_formless = _embed_names_formless(pol_test)

# Release the embedding model again before the regression cells.
del model

In [None]:
# Elastic-net regression of the mean gender rating on the formless name vectors.
# alpha and l1_ratio are chosen by cross-validation on the training split,
# with one fold per training row (cv equals the number of training rows).
regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                    n_alphas = 250,
                    max_iter = 10000,
                    cv = len(x_gen_train_formless),
                    selection = 'random',
                    random_state=17042020,)

regr.fit(x_gen_train_formless, y_gen_train)

print("Alpha:", regr.alpha_)
print("L1 Ratio:", regr.l1_ratio_)
print("# Iters:", regr.n_iter_)
print("Intercept:", regr.intercept_)
#print("Coefs:", regr.coef_)
print("\n")

# `mse_path_` stores the held-out error for every (l1_ratio, alpha, fold), so its
# grand mean is neither the training error nor the error of the chosen model.
# Report the real training MSE, and separately the cross-validated MSE of the
# selected (l1_ratio, alpha) pair, i.e. the smallest fold-averaged error.
print("MSE Train:", sklearn.metrics.mean_squared_error(y_gen_train, regr.predict(x_gen_train_formless)))
print("MSE CV:", np.min(np.mean(regr.mse_path_, axis=-1)))

# Predict the test split once and reuse the predictions for every metric.
pred_gen_test_formless = regr.predict(x_gen_test_formless)
print("MAE Test:", sklearn.metrics.mean_absolute_error(y_gen_test, pred_gen_test_formless))
print("MSE Test:", sklearn.metrics.mean_squared_error(y_gen_test, pred_gen_test_formless))

print("R2:", regr.score(x_gen_test_formless, y_gen_test))

print('\n')

# Mean absolute error per name type: summed absolute errors and item counts,
# keyed by name type in order of first appearance in the test split.
abs_err_gen_formless = np.abs(np.asarray(y_gen_test) - pred_gen_test_formless)
type_dict_gen_formless = {}
type_counter_gen_formless = {}
for name_type, err in zip(gen_test['name_type'], abs_err_gen_formless):
    type_dict_gen_formless[name_type] = type_dict_gen_formless.get(name_type, 0.0) + err
    type_counter_gen_formless[name_type] = type_counter_gen_formless.get(name_type, 0) + 1

for name_type, err_sum in type_dict_gen_formless.items():
    print("MAE", name_type, err_sum / type_counter_gen_formless[name_type])

print('\n')

# Baseline: give every test item the mean training vector as its features.
mean_vec_gen_formless = np.mean(x_gen_train_formless, axis = 0)
mean_vec_array_gen_formless = np.tile(mean_vec_gen_formless, (len(x_gen_test_formless), 1))
baseline_pred_gen_formless = regr.predict(mean_vec_array_gen_formless)

print("Mean-vector MAE Test:", sklearn.metrics.mean_absolute_error(y_gen_test, baseline_pred_gen_formless))
print("Mean-vector MSE Test:", sklearn.metrics.mean_squared_error(y_gen_test, baseline_pred_gen_formless))

print("Mean-vector R2:", regr.score(mean_vec_array_gen_formless, y_gen_test))

Alpha: 0.020702733787440326
L1 Ratio: 0.7
# Iters: 27
Intercept: -18.719668320293408


MSE Train: 1059.442673978793
MAE Test: 27.340919988702524
MSE Test: 964.7460257168639
R2: 0.29991652074984254


MAE madeup [26.48101598]
MAE real [34.57384126]
MAE talking [20.96790272]


Mean-vector MAE Test: 35.20711844649357
Mean-vector MSE Test: 1411.4821234797062
Mean-vector R2: -0.02426471793010454


In [None]:
# Elastic-net regression of the mean age rating on the formless name vectors.
# alpha and l1_ratio are chosen by cross-validation on the training split,
# with one fold per training row (cv equals the number of training rows).
regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                    n_alphas = 250,
                    max_iter = 10000,
                    cv = len(x_age_train_formless),
                    selection = 'random',
                    random_state=17042020,)

regr.fit(x_age_train_formless, y_age_train)

print("Alpha:", regr.alpha_)
print("L1 Ratio:", regr.l1_ratio_)
print("# Iters:", regr.n_iter_)
print("Intercept:", regr.intercept_)
#print("Coefs:", regr.coef_)
print("\n")

# `mse_path_` stores the held-out error for every (l1_ratio, alpha, fold), so its
# grand mean is neither the training error nor the error of the chosen model.
# Report the real training MSE, and separately the cross-validated MSE of the
# selected (l1_ratio, alpha) pair, i.e. the smallest fold-averaged error.
print("MSE Train:", sklearn.metrics.mean_squared_error(y_age_train, regr.predict(x_age_train_formless)))
print("MSE CV:", np.min(np.mean(regr.mse_path_, axis=-1)))

# Predict the test split once and reuse the predictions for every metric.
pred_age_test_formless = regr.predict(x_age_test_formless)
print("MAE Test:", sklearn.metrics.mean_absolute_error(y_age_test, pred_age_test_formless))
print("MSE Test:", sklearn.metrics.mean_squared_error(y_age_test, pred_age_test_formless))

print("R2:", regr.score(x_age_test_formless, y_age_test))

print('\n')

# Mean absolute error per name type: summed absolute errors and item counts,
# keyed by name type in order of first appearance in the test split.
abs_err_age_formless = np.abs(np.asarray(y_age_test) - pred_age_test_formless)
type_dict_age_formless = {}
type_counter_age_formless = {}
for name_type, err in zip(age_test['name_type'], abs_err_age_formless):
    type_dict_age_formless[name_type] = type_dict_age_formless.get(name_type, 0.0) + err
    type_counter_age_formless[name_type] = type_counter_age_formless.get(name_type, 0) + 1

for name_type, err_sum in type_dict_age_formless.items():
    print("MAE", name_type, err_sum / type_counter_age_formless[name_type])

print('\n')

# Baseline: give every test item the mean training vector as its features.
mean_vec_age_formless = np.mean(x_age_train_formless, axis = 0)
mean_vec_array_age_formless = np.tile(mean_vec_age_formless, (len(x_age_test_formless), 1))
baseline_pred_age_formless = regr.predict(mean_vec_array_age_formless)

print("Mean-vector MAE Test:", sklearn.metrics.mean_absolute_error(y_age_test, baseline_pred_age_formless))
print("Mean-vector MSE Test:", sklearn.metrics.mean_squared_error(y_age_test, baseline_pred_age_formless))

print("Mean-vector R2:", regr.score(mean_vec_array_age_formless, y_age_test))

Alpha: 0.11618115283654422
L1 Ratio: 1.0
# Iters: 48
Intercept: -18.872780160900472


MSE Train: 437.0591092167435
MAE Test: 17.689812484899917
MSE Test: 458.8348529726752
R2: -0.6226265000003122


MAE real [15.52041175]
MAE madeup [17.56453267]
MAE talking [19.98449304]


Mean-vector MAE Test: 17.188119883066264
Mean-vector MSE Test: 377.6723520347996
Mean-vector R2: -0.33560291411778986


In [None]:
# Elastic-net regression of the mean valence rating on the formless name vectors.
# alpha and l1_ratio are chosen by cross-validation on the training split.
# NOTE(review): cv is one less than the number of training rows here, whereas the
# age and gender cells use one fold per row -- confirm this is intended.
regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                    n_alphas = 250,
                    max_iter = 10000,
                    cv = len(x_pol_train_formless)-1,
                    selection = 'random',
                    random_state=17042020,)

regr.fit(x_pol_train_formless, y_pol_train)

print("Alpha:", regr.alpha_)
print("L1 Ratio:", regr.l1_ratio_)
print("# Iters:", regr.n_iter_)
print("Intercept:", regr.intercept_)
#print("Coefs:", regr.coef_)
print("\n")

# `mse_path_` stores the held-out error for every (l1_ratio, alpha, fold), so its
# grand mean is neither the training error nor the error of the chosen model.
# Report the real training MSE, and separately the cross-validated MSE of the
# selected (l1_ratio, alpha) pair, i.e. the smallest fold-averaged error.
print("MSE Train:", sklearn.metrics.mean_squared_error(y_pol_train, regr.predict(x_pol_train_formless)))
print("MSE CV:", np.min(np.mean(regr.mse_path_, axis=-1)))

# Predict the test split once and reuse the predictions for every metric.
pred_pol_test_formless = regr.predict(x_pol_test_formless)
print("MAE Test:", sklearn.metrics.mean_absolute_error(y_pol_test, pred_pol_test_formless))
print("MSE Test:", sklearn.metrics.mean_squared_error(y_pol_test, pred_pol_test_formless))

print("R2:", regr.score(x_pol_test_formless, y_pol_test))

print("\n")

# Mean absolute error per name type: summed absolute errors and item counts,
# keyed by name type in order of first appearance in the test split.
abs_err_pol_formless = np.abs(np.asarray(y_pol_test) - pred_pol_test_formless)
type_dict_pol_formless = {}
type_counter_pol_formless = {}
for name_type, err in zip(pol_test['name_type'], abs_err_pol_formless):
    type_dict_pol_formless[name_type] = type_dict_pol_formless.get(name_type, 0.0) + err
    type_counter_pol_formless[name_type] = type_counter_pol_formless.get(name_type, 0) + 1

for name_type, err_sum in type_dict_pol_formless.items():
    print("MAE", name_type, err_sum / type_counter_pol_formless[name_type])

print('\n')

# Baseline: give every test item the mean training vector as its features.
mean_vec_pol_formless = np.mean(x_pol_train_formless, axis = 0)
mean_vec_array_pol_formless = np.tile(mean_vec_pol_formless, (len(x_pol_test_formless), 1))
baseline_pred_pol_formless = regr.predict(mean_vec_array_pol_formless)

print("Mean-vector MAE Test:", sklearn.metrics.mean_absolute_error(y_pol_test, baseline_pred_pol_formless))
print("Mean-vector MSE Test:", sklearn.metrics.mean_squared_error(y_pol_test, baseline_pred_pol_formless))

print("Mean-vector R2:", regr.score(mean_vec_array_pol_formless, y_pol_test))

Alpha: 0.0008239831458270381
L1 Ratio: 0.9
# Iters: 431
Intercept: 25.56767042655266


MSE Train: 307.0594433009455
MAE Test: 16.303994698965404
MSE Test: 401.6951765544801
R2: 0.19387636042850864


MAE real [25.9075809]
MAE madeup [16.46104833]
MAE talking [6.50409146]


Mean-vector MAE Test: 19.867471739679498
Mean-vector MSE Test: 516.5873977208159
Mean-vector R2: -0.036689852189426775
