In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 1.5 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.2-py2.py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3148821 sha256=83cb565a8e1f15892401970cddff67d1c80edc072f26e64f5097735ac46294f2
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.2


In [None]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
import pandas as pd
import numpy as np
import sklearn
import pickle
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split, StratifiedKFold
from sklearn.linear_model import ElasticNetCV
import fasttext

#path = "D:/Tilburg University/2021-2022/Thesis/Coca/"
#model_path = path + "models/"

#model = fasttext.load_model(model_path + "sentenced_dim300_ws2_minn2_maxn5.bin")

In [None]:
## When working in colab:
from google.colab import drive

# Attach Google Drive (forcing a re-mount if it is already attached).
drive.mount("/content/drive", force_remount=True)

# Load the fastText binary; `model` is read as a global by the vectorising helpers below.
model_file = "drive/MyDrive/Thesis/Data/fastText and others/sentenced_dim300_ws2_minn2_maxn5.bin"
model = fasttext.load_model(model_file)

Mounted at /content/drive




In [None]:
### Read CSV File and Delete Unimportant Columns
# Local (non-Colab) location of the same file:
#names_ratings = read_csv("D:/Tilburg University/2021-2022/Thesis/Data analyses/Giovanni Email Data/avgRatings_annotated.csv")

names_ratings = read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv")

# How many names have a rating on each dimension.
for rating_column in ('rating.mean_age', 'rating.mean_gender', 'rating.mean_valence'):
  print(names_ratings[rating_column].notna().sum())


def _rated_subset(rating_column, label_column):
  """Rows of names_ratings that have a value in rating_column, reduced to the four columns used later."""
  has_rating = names_ratings[rating_column].notna()
  return names_ratings.loc[has_rating, ['name', rating_column, label_column, 'name_type']]


df_age = _rated_subset('rating.mean_age', 'age')
print(df_age.head(), len(df_age))

df_gender = _rated_subset('rating.mean_gender', 'gender')
print(df_gender.head(), len(df_gender))

df_polarity = _rated_subset('rating.mean_valence', 'polarity')
print(df_polarity.head(), len(df_polarity))

119
179
63
       name  rating.mean_age    age name_type
0  Adelaide        -0.617647    old      real
2  Alasdair        18.709677  young      real
3   Alastor        13.812500    old    madeup
4    Alecto         3.593750    old    madeup
5     Alice       -13.969697  young      real 119
       name  rating.mean_gender  gender name_type
0  Adelaide           45.727273  female      real
1   Adelina           47.771429  female      real
2  Alasdair          -35.657143    male      real
3   Alastor          -38.833333    male    madeup
4    Alecto          -35.722222  female    madeup 179
        name  rating.mean_valence polarity name_type
1    Adelina            31.621622      bad      real
7    Amabala             5.935484     good    madeup
8      Apple            32.444444     good   talking
11  Arcturus           -11.166667     good    madeup
13   Arobynn             7.645161      bad    madeup 63


In [None]:
### Get repeatable train/test split with a seed

def fasttext_xifyer(input_data, dim=300):
  """Turn the names in a DataFrame into a matrix of fastText vectors.

  Parameters
  ----------
  input_data : DataFrame whose FIRST column holds the names (strings).
  dim : width of the vectors returned by the global `model` (default 300).

  Returns
  -------
  numpy array of shape (len(input_data), dim); row k is `model[name.lower()]`
  for the k-th row of input_data, in the DataFrame's row order.
  """
  # Select the first column by position explicitly: integer keys on a
  # label-indexed Series (the old `row[1][0]`) are deprecated in pandas.
  names = input_data.iloc[:, 0]

  vectors = np.zeros((len(input_data), dim))
  for row_number, raw_name in enumerate(names):
    vectors[row_number] = model[raw_name.lower()]

  return vectors

################################################################################

def _split_and_embed(frame, label_column, rating_column):
  """Seeded 80/20 split stratified on (label_column, name_type), plus fastText features and targets.

  Returns (train_rows, test_rows, x_train, x_test, y_train, y_test).
  """
  train_rows, test_rows = train_test_split(
      frame,
      test_size=0.2,
      random_state=17042020,
      stratify=frame[[label_column, 'name_type']],
  )
  return (train_rows, test_rows,
          fasttext_xifyer(train_rows), fasttext_xifyer(test_rows),
          train_rows[rating_column], test_rows[rating_column])


## First let's do age
(age_train, age_test,
 x_age_train, x_age_test,
 y_age_train, y_age_test) = _split_and_embed(df_age, 'age', 'rating.mean_age')

## Now let's do gender
(gen_train, gen_test,
 x_gen_train, x_gen_test,
 y_gen_train, y_gen_test) = _split_and_embed(df_gender, 'gender', 'rating.mean_gender')

## And lastly, polarity
(pol_train, pol_test,
 x_pol_train, x_pol_test,
 y_pol_train, y_pol_test) = _split_and_embed(df_polarity, 'polarity', 'rating.mean_valence')

# Sanity check: sizes of every train / test feature matrix.
print(*map(len, (x_age_train, x_age_test, x_gen_train, x_gen_test, x_pol_train, x_pol_test)))

#del model

95 24 143 36 50 13


In [None]:
### Get repeatable train/test split with a seed, but without surface form

def fasttext_xifyer_formless(input_data, dim=300):
  """Vectorise names like fasttext_xifyer, but without the whole-word entry.

  For each name (first column of input_data, lower-cased) the subword list is
  read from the global `model`. When the first subword equals the name itself,
  the row becomes the mean of `model[subword]` over the REMAINING subwords;
  otherwise the row is simply `model[name]`.

  Parameters
  ----------
  input_data : DataFrame whose FIRST column holds the names (strings).
  dim : width of the vectors returned by `model` (default 300).

  Returns
  -------
  numpy array of shape (len(input_data), dim), in the DataFrame's row order.
  """
  vectors = np.zeros((len(input_data), dim))

  # `.iloc[:, 0]` replaces the deprecated positional `row[1][0]` lookup.
  for row_number, raw_name in enumerate(input_data.iloc[:, 0]):
    name = raw_name.lower()
    # One subword lookup per name (previously repeated three times).
    subwords = model.get_subwords(name)[0]

    if name == subwords[0]:
      ngrams = subwords[1:]
      ngram_vectors = np.zeros((len(ngrams), dim))
      for k, ngram in enumerate(ngrams):
        # NOTE(review): model[ngram] looks the n-gram string up as if it were a
        # word; if the raw n-gram embedding is wanted, the subword ids returned
        # by get_subwords with model.get_input_vector may be the right call — confirm.
        ngram_vectors[k] = model[ngram]
      vectors[row_number] = np.mean(ngram_vectors, axis = 0)
    else:
      vectors[row_number] = model[name]

  return vectors

# Formless counterparts of the age / gender / polarity feature matrices,
# built from the same train / test rows as the regular ones.
x_age_train_formless, x_age_test_formless = (
    fasttext_xifyer_formless(rows) for rows in (age_train, age_test))

x_gen_train_formless, x_gen_test_formless = (
    fasttext_xifyer_formless(rows) for rows in (gen_train, gen_test))

x_pol_train_formless, x_pol_test_formless = (
    fasttext_xifyer_formless(rows) for rows in (pol_train, pol_test))

# Sanity check: sizes of every formless train / test matrix.
print(*map(len, (x_age_train_formless, x_age_test_formless,
                 x_gen_train_formless, x_gen_test_formless,
                 x_pol_train_formless, x_pol_test_formless)))

#del model

95 24 143 36 50 13


In [None]:
### Run cross-validation over a grid of hyperparameters, saving the results of each fold to a dict

In [None]:
def elasticnetifyer(x_train, y_train, x_test, y_test, test_full):
  """Fit an ElasticNetCV (one CV fold per training row) and score it on the test rows.

  Parameters
  ----------
  x_train, x_test : 2-D arrays of feature vectors, one row per name.
  y_train, y_test : ratings aligned row-by-row with x_train / x_test.
  test_full : DataFrame with a 'name_type' column describing the test names.
      When y_test is a pandas Series whose index labels all occur in
      test_full, the name types are matched on that index (so passing the
      whole source frame also works); otherwise they are matched by position.

  Returns
  -------
  tuple of 14 values, in this order:
      alpha, l1_ratio, n_iters, intercept, mse_train, mae_test, mse_test, r2,
      mae_madeup, mae_real, mae_talking,
      mean_vec_mae_test, mean_vec_mse_test, mean_vec_r2
  The three per-type MAEs are NaN when the test rows contain no name of that
  type. The mean_vec_* values score the same model on a baseline input in
  which every test row is replaced by the mean training vector.
  """
  regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                      n_alphas = 250,
                      max_iter = 10000,
                      cv = len(x_train),
                      selection = 'random',
                      random_state=17042020,)

  regr.fit(x_train, y_train)

  alpha = regr.alpha_
  l1_ratio = regr.l1_ratio_
  n_iters = regr.n_iter_
  intercept = regr.intercept_

  # NOTE(review): this is the mean of the whole mse_path_, i.e. over every
  # candidate setting that was tried, not only the selected one — confirm that
  # this is the intended "train" figure.
  mse_train = np.mean(regr.mse_path_)

  # Predict once and reuse the result for every test metric.
  pred_test = regr.predict(x_test)
  mae_test = sklearn.metrics.mean_absolute_error(y_test, pred_test)
  mse_test = sklearn.metrics.mean_squared_error(y_test, pred_test)

  r2 = regr.score(x_test, y_test)

  # Absolute error per test row, then averaged within each name type.
  abs_err = np.abs(np.ravel(np.asarray(y_test, dtype=float))
                   - np.ravel(np.asarray(pred_test, dtype=float)))

  name_types = test_full['name_type']
  if (isinstance(y_test, pd.Series) and isinstance(name_types, pd.Series)
      and name_types.index.is_unique
      and y_test.index.isin(name_types.index).all()):
    # Pick exactly the rows that belong to the test ratings, in the same order.
    name_types = name_types.loc[y_test.index]
  name_types = np.asarray(name_types)

  # Positional fallback keeps the old zip() behaviour of stopping at the shorter input.
  n_paired = min(len(name_types), len(abs_err))
  abs_err = abs_err[:n_paired]
  name_types = name_types[:n_paired]

  def _mae_for(kind):
    """Mean absolute error over test rows of one name type (NaN if there are none)."""
    mask = name_types == kind
    return float(abs_err[mask].mean()) if mask.any() else float('nan')

  # Plain local values instead of globals(): nothing can leak in from an earlier call.
  mae_madeup = _mae_for('madeup')
  mae_real = _mae_for('real')
  mae_talking = _mae_for('talking')

  # Baseline: every test row replaced by the mean training vector
  # (width taken from the data rather than hard-coded).
  mean_vec = np.mean(x_train, axis = 0)
  mean_vec_array = np.tile(np.asarray(mean_vec), (len(x_test), 1))
  mean_vec_pred = regr.predict(mean_vec_array)

  mean_vec_mae_test = sklearn.metrics.mean_absolute_error(y_test, mean_vec_pred)
  mean_vec_mse_test = sklearn.metrics.mean_squared_error(y_test, mean_vec_pred)

  mean_vec_r2 = regr.score(mean_vec_array, y_test)

  return alpha, l1_ratio, n_iters, intercept, mse_train, mae_test, mse_test, r2, mae_madeup, mae_real, mae_talking, mean_vec_mae_test, mean_vec_mse_test, mean_vec_r2

### Testing ground for nested CV

In [None]:
# NOTE(review): this module-level splitter looks unused in this notebook — nested_cross_validator
# builds its own StratifiedKFold with the same settings; confirm before removing.
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)

def nested_cross_validator(df, rating, dictionary):
  """Run a seeded 5-fold split (stratified on name_type) and fit both model variants per fold.

  Parameters
  ----------
  df : DataFrame with the names in its first column, a 'name_type' column and
      the target column named by `rating`.
  rating : name of the target column in df.
  dictionary : dict with list values under the keys 'regular' and 'formless'.
      It is modified in place: for every fold, the 14 values returned by
      elasticnetifyer are appended as one list under each key.

  Returns
  -------
  None.
  """
  skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)

  # Result key -> function that turns rows of df into feature vectors.
  vectorisers = (('regular', fasttext_xifyer),
                 ('formless', fasttext_xifyer_formless))

  for train_index, test_index in skf.split(df, df['name_type']):
    train_rows = df.iloc[train_index]
    test_rows = df.iloc[test_index]

    y_train = train_rows[rating]
    y_test = test_rows[rating]

    for key, vectorise in vectorisers:
      # Pass the test-fold rows (not the whole frame) so that the per-name-type
      # errors are computed against the name types of the names actually tested.
      results = elasticnetifyer(vectorise(train_rows), y_train,
                                vectorise(test_rows), y_test, test_rows)
      dictionary[key].append(list(results))
    

In [None]:
# Fresh result container: one list of per-fold result rows for each embedding variant.
age_dict = {'regular' : [], 'formless' : []}

# Fills age_dict in place (nested_cross_validator appends one row per fold to each list).
nested_cross_validator(df_age, 'rating.mean_age', age_dict)

In [None]:
# Dump the raw per-fold age results.
print(age_dict)

In [None]:
# Fresh result container: one list of per-fold result rows for each embedding variant.
gender_dict = {'regular' : [], 'formless' : []}

# Fills gender_dict in place (nested_cross_validator appends one row per fold to each list).
nested_cross_validator(df_gender, 'rating.mean_gender', gender_dict)

In [None]:
# Fresh result container: one list of per-fold result rows for each embedding variant.
polarity_dict = {'regular' : [], 'formless' : []}

# Fills polarity_dict in place (nested_cross_validator appends one row per fold to each list).
nested_cross_validator(df_polarity, 'rating.mean_valence', polarity_dict)

In [None]:
pickle_path = "drive/MyDrive/Thesis/Data/fastText and others/"


def _load_results(file_name):
  """Unpickle one stored result dict from the pickle_path folder."""
  # NOTE(review): pickle.load can execute arbitrary code — only open files you created yourself.
  with open(pickle_path + file_name, "rb") as file:
    return pickle.load(file)


# These assignments replace any dicts of the same name built earlier in the session.
age_dict = _load_results("age_ncv.pickle")
gender_dict = _load_results("gender_ncv.pickle")
polarity_dict = _load_results("polarity_ncv.pickle")

In [None]:
# Dump the raw per-fold age results held in age_dict.
print(age_dict)

{'regular': [[0.11832651993099204, 0.99, 79, 9.750745140118598, 352.86210536500477, 19.88786958091482, 548.6244486903391, -0.3904625308751548, 19.123439944691814, 20.426975050821646, 20.175950007949137, 17.482062238183598, 395.02559054339673, -0.0011735417526905767], [0.014720136675725845, 0.99, 507, 23.439872885969873, 314.3106120208521, 19.663225344378336, 498.76535076752367, -0.23473099610947257, 21.441818102253475, 21.074850516143016, 17.393376035019415, 16.628660317951258, 407.4202040009057, -0.008599241200374141], [0.07476911455067912, 0.05, 23, 7.2982347804540995, 436.8779861740004, 14.115917785747305, 260.56366069913696, 0.38645934361343237, 17.952497575128504, 11.15358190992553, 12.824055479735406, 18.93309365378608, 427.45854379584983, -0.006522532093903521], [0.032399526665512976, 0.05, 29, 16.631070571438944, 354.66406035723344, 16.024605600721195, 389.39983956133307, 0.20453155232764575, 16.33082637126067, 9.42763157042867, 19.73781340246513, 18.28828789163048, 493.8987044

In [None]:
def nested_cv_addotron(dictionary, name):
  """Print the fold-averaged value of every stored metric, for both model variants.

  Parameters
  ----------
  dictionary : dict with the keys 'regular' and 'formless'; each value is a
      list with one entry per fold, and each entry is a sequence of numbers
      in the order given by variable_list below.
  name : label used in the printed lines (e.g. "age").

  Returns
  -------
  None; one "Average <type> <metric> for <name> = <value>" line is printed per
  metric, followed by a blank separator after each variant.
  """
  variable_list = ('alpha', 'l1_ratio', 'n_iters', 'intercept', 'mse_train',
                    'mae_test', 'mse_test', 'r2', 'mae_madeup', 'mae_real',
                    'mae_talking', 'mean_vec_mae_test', 'mean_vec_mse_test', 'mean_vec_r2')

  # Look up both variants before printing anything, so a missing key fails up front.
  per_type = [(analysis_type, dictionary[analysis_type])
              for analysis_type in ('regular', 'formless')]

  for analysis_type, fold_results in per_type:
    # Divide by the number of folds actually stored instead of assuming 5.
    n_folds = len(fold_results)
    averages = [sum(column) / n_folds for column in zip(*fold_results)]

    for value, variable in zip(averages, variable_list):
      print(f"Average {analysis_type} {variable} for {name} = {value}")
    print("\n")

In [None]:
# Print the fold-averaged metrics for the age models (regular, then formless).
nested_cv_addotron(age_dict, "age")

Average regular alpha for age = 0.050727568319967664
Average regular l1_ratio for age = 0.43599999999999994
Average regular n_iters for age = 136.0
Average regular intercept for age = 14.950048695609159
Average regular mse_train for age = 370.10286262790083
Average regular mae_test for age = 15.895302331440504
Average regular mse_test for age = 375.9019925597142
Average regular r2 for age = 0.08337560465475224
Average regular mae_madeup for age = 17.49016498898771
Average regular mae_real for age = 14.171504170036545
Average regular mae_talking for age = 15.617077040925466
Average regular mean_vec_mae_test for age = 17.32203833097819
Average regular mean_vec_mse_test for age = 411.44306349368134
Average regular mean_vec_r2 for age = -0.005990946203877456


Average formless alpha for age = 0.06749729879751121
Average formless l1_ratio for age = 0.47999999999999987
Average formless n_iters for age = 26.0
Average formless intercept for age = 12.721929855320898
Average formless mse_train f

In [None]:
# Print the fold-averaged metrics for the gender models (regular, then formless).
nested_cv_addotron(gender_dict, "gender")

Average regular alpha for gender = 0.15315895481366898
Average regular l1_ratio for gender = 0.9299999999999999
Average regular n_iters for gender = 39.8
Average regular intercept for gender = -8.204630453730866
Average regular mse_train for gender = 905.1063392504739
Average regular mae_test for gender = 19.233139844182542
Average regular mse_test for gender = 551.880482279404
Average regular r2 for gender = 0.5898452840428152
Average regular mae_madeup for gender = 20.807363397808466
Average regular mae_real for gender = 19.696746192132213
Average regular mae_talking for gender = 16.90857010080669
Average regular mean_vec_mae_test for gender = 34.171191904988795
Average regular mean_vec_mse_test for gender = 1375.8183692632363
Average regular mean_vec_r2 for gender = -0.029618932567722033


Average formless alpha for gender = 0.17331042386763665
Average formless l1_ratio for gender = 0.7000000000000001
Average formless n_iters for gender = 33.4
Average formless intercept for gender =

In [None]:
# Print the fold-averaged metrics for the polarity models (regular, then formless).
nested_cv_addotron(polarity_dict, "polarity")

Average regular alpha for polarity = 0.24835051374456635
Average regular l1_ratio for polarity = 0.782
Average regular n_iters for polarity = 272.0
Average regular intercept for polarity = 4.408481760823829
Average regular mse_train for polarity = 379.57109960567766
Average regular mae_test for polarity = 15.719904883395973
Average regular mse_test for polarity = 376.6090454961285
Average regular r2 for polarity = -0.031915705525985306
Average regular mae_madeup for polarity = 15.492608786809763
Average regular mae_real for polarity = 17.452835468858975
Average regular mae_talking for polarity = 14.396252329425153
Average regular mean_vec_mae_test for polarity = 16.8109491256648
Average regular mean_vec_mse_test for polarity = 389.5595898967131
Average regular mean_vec_r2 for polarity = -0.05702949871174532


Average formless alpha for polarity = 0.23515659044232623
Average formless l1_ratio for polarity = 0.966
Average formless n_iters for polarity = 33.8
Average formless intercept fo

# Now let's do Neural Networks

In [None]:
# Sequential model with Dense layers
# Output layer activation = linear
# Loss function = MSE (I think this is better because we want to especially penalize large errors; however we use MAE to derive the final test-set score)
# Hidden layer activation = LeakyReLU 

# Weight initialization = HeNormal
# Optimizer: Adam

# Input layer size = 300

# Number of layers: Try (2, 3, 4)

# Number of nodes per layer: (Try all 300, all 216, all 512, as well as going up and going down (300, 200, 100; 300, 400, 500))

# Dropout percentage: (50, 60, 70, 80)