In [None]:
!pip install fasttext

In [25]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split
from sklearn.linear_model import ElasticNetCV
import fasttext

#path = "D:/Tilburg University/2021-2022/Thesis/Coca/"
#model_path = path + "models/"

#model = fasttext.load_model(model_path + "sentenced_dim300_ws2_minn2_maxn5.bin")

In [21]:
## When working in colab:
from google.colab import drive

drive.mount("/content/drive", force_remount=True) 

model = fasttext.load_model("drive/MyDrive/Thesis/Data/fastText and others/sentenced_dim300_ws2_minn2_maxn5.bin")

Mounted at /content/drive




In [15]:
### Read CSV File and Delete Unimportant Columns
#names_ratings = read_csv("D:/Tilburg University/2021-2022/Thesis/Data analyses/Giovanni Email Data/avgRatings_annotated.csv")

names_ratings = read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv")

#print(names_ratings.head())

print(names_ratings['rating.mean_age'].notna().sum())
print(names_ratings['rating.mean_gender'].notna().sum())
print(names_ratings['rating.mean_valence'].notna().sum())

df_age = names_ratings.loc[names_ratings['rating.mean_age'].notna(), ['name', 'rating.mean_age', 'age', 'name_type']]
print(df_age.head(), len(df_age))

df_gender = names_ratings.loc[names_ratings['rating.mean_gender'].notna(), ['name', 'rating.mean_gender', 'gender', 'name_type']]
print(df_gender.head(), len(df_gender))

df_polarity = names_ratings.loc[names_ratings['rating.mean_valence'].notna(), ['name', 'rating.mean_valence', 'polarity', 'name_type']]
print(df_polarity.head(), len(df_polarity))

119
179
63
       name  rating.mean_age    age name_type
0  Adelaide        -0.617647    old      real
2  Alasdair        18.709677  young      real
3   Alastor        13.812500    old    madeup
4    Alecto         3.593750    old    madeup
5     Alice       -13.969697  young      real 119
       name  rating.mean_gender  gender name_type
0  Adelaide           45.727273  female      real
1   Adelina           47.771429  female      real
2  Alasdair          -35.657143    male      real
3   Alastor          -38.833333    male    madeup
4    Alecto          -35.722222  female    madeup 179
        name  rating.mean_valence polarity name_type
1    Adelina            31.621622      bad      real
7    Amabala             5.935484     good    madeup
8      Apple            32.444444     good   talking
11  Arcturus           -11.166667     good    madeup
13   Arobynn             7.645161      bad    madeup 63


In [22]:
### Get repeatable train/test split with a seed

def fasttext_xifyer(input_data):
  df_output = np.zeros((len(input_data), 300))

  i = 0

  for row in input_data.iterrows():
    index = row[0]
    name = row[1][0].lower()
    df_output[i] = model[name]
    i += 1

  return df_output

################################################################################

age_train, age_test = train_test_split(df_age, test_size=0.2, random_state=17042020,      ## First let's do age
                               stratify=df_age[['age', 'name_type']])

x_age_train = fasttext_xifyer(age_train)
x_age_test = fasttext_xifyer(age_test)

y_age_train = age_train['rating.mean_age']
y_age_test = age_test['rating.mean_age']

####################################################################################################################

gen_train, gen_test = train_test_split(df_gender, test_size=0.2, random_state=17042020,     ## Now let's do gender
                               stratify=df_gender[['gender', 'name_type']])

x_gen_train = fasttext_xifyer(gen_train)
x_gen_test = fasttext_xifyer(gen_test)


y_gen_train = gen_train['rating.mean_gender']
y_gen_test = gen_test['rating.mean_gender']


####################################################################################################################

pol_train, pol_test = train_test_split(df_polarity, test_size=0.2, random_state=17042020,    ## And lastly, polarity
                               stratify=df_polarity[['polarity', 'name_type']])

x_pol_train = fasttext_xifyer(pol_train)
x_pol_test = fasttext_xifyer(pol_test)

y_pol_train = pol_train['rating.mean_valence']
y_pol_test = pol_test['rating.mean_valence']

print(len(x_age_train), len(x_age_test), len(x_gen_train), len(x_gen_test), len(x_pol_train), len(x_pol_test))

#del model

95 24 143 36 50 13


In [49]:
### Get repeatable train/test split with a seed, but without surface form

def fasttext_xifyer_formless(input_data):
  df_output = np.zeros((len(input_data), 300))

  c = 0

  for row in input_data.iterrows():
    index = row[0]
    name = row[1][0].lower()
    if name == model.get_subwords(name)[0][0]:
      wordarray = np.zeros((len(model.get_subwords(name)[0][1:]), 300))
      for i, j in enumerate(model.get_subwords(name)[0][1:]):
          wordarray[i] = model[j]
      df_output[c] = np.mean(wordarray, axis = 0)
    else:
      df_output[c] = model[name]
    c += 1

  return df_output

x_age_train_formless = fasttext_xifyer_formless(age_train)  
x_age_test_formless = fasttext_xifyer_formless(age_test)

####################################################################################################################

x_gen_train_formless = fasttext_xifyer_formless(gen_train)
x_gen_test_formless = fasttext_xifyer_formless(gen_test)



####################################################################################################################


x_pol_train_formless = fasttext_xifyer_formless(pol_train)
x_pol_test_formless = fasttext_xifyer_formless(pol_test)

print(len(x_age_train_formless), len(x_age_test_formless), len(x_gen_train_formless), len(x_gen_test_formless), len(x_pol_train_formless), len(x_pol_test_formless))

#del model

95 24 143 36 50 13


In [None]:
### Make a cross-validation loop testing a bunch of hyperparameters, all the while saving the outputs to a dict or something

In [34]:
def elasticnetifyer(x_train, y_train, x_test, y_test, test_full):
  regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1], 
                      n_alphas = 250,
                      max_iter = 10000,
                      cv = len(x_train),
                      selection = 'random', 
                      random_state=17042020,)

  regr.fit(x_train, y_train)

  print("Alpha:", regr.alpha_)
  print("L1 Ratio:", regr.l1_ratio_)
  print("# Iters:", regr.n_iter_)
  print("Intercept:", regr.intercept_)
  #print("Coefs:", regr.coef_)
  print("\n")

  print("MSE Train:", mean(regr.mse_path_))

  print("MAE Test:", sklearn.metrics.mean_absolute_error(y_test, regr.predict(x_test)))
  print("MSE Test:", sklearn.metrics.mean_squared_error(y_test, regr.predict(x_test)))

  print("R2:", regr.score(x_test, y_test))

  print('\n')

  type_dict = {}
  type_counter = {}
  for n, i, j in zip(test_full['name_type'], y_test, x_test):
      if n in type_dict.keys():
          type_dict[n] = type_dict[n] + abs(i - regr.predict(j.reshape(1, -1)))
          type_counter[n] = type_counter[n] + 1
      else:
          type_dict[n] = abs(i - regr.predict(j.reshape(1, -1)))
          type_counter[n] = 1
      
  for i in type_dict.keys():
      print("MAE", i, type_dict[i]/type_counter[i])

  print('\n')

  mean_vec = np.mean(x_train, axis = 0)
  mean_vec_array = np.full((len(x_test), 300), mean_vec)

  print("Mean-vector MAE Test:", sklearn.metrics.mean_absolute_error(y_test, regr.predict(mean_vec_array)))
  print("Mean-vector MSE Test:", sklearn.metrics.mean_squared_error(y_test, regr.predict(mean_vec_array)))

  print("Mean-vector R2:", regr.score(mean_vec_array, y_test))

In [26]:
elasticnetifyer(x_gen_train, y_gen_train, x_gen_test, y_gen_test, gen_test)

Alpha: 0.07464309442408282
L1 Ratio: 0.9
# Iters: 48
Intercept: -2.276081549528773


MSE Train: 888.8802022779203
MAE Test: 19.75393167120632
MSE Test: 548.8958414915724
R2: 0.6016848992232695


MAE madeup [24.51393291]
MAE real [15.76683143]
MAE talking [18.98103068]


Mean-vector MAE Test: 35.20711844649357
Mean-vector MSE Test: 1411.4821234797062
Mean-vector R2: -0.02426471793010454


In [36]:
elasticnetifyer(x_age_train, y_age_train, x_age_test, y_age_test, age_test)

Alpha: 0.22747173388941747
L1 Ratio: 1.0
# Iters: 71
Intercept: 14.058597468383912


MSE Train: 412.7853215036653
MAE Test: 17.398978553042074
MSE Test: 403.1392561316581
R2: -0.4256642361660301


MAE real [16.99593187]
MAE madeup [20.83337036]
MAE talking [14.36763342]


Mean-vector MAE Test: 17.188119883066264
Mean-vector MSE Test: 377.67235203479964
Mean-vector R2: -0.3356029141177901


In [37]:
elasticnetifyer(x_pol_train, y_pol_train, x_pol_test, y_pol_test, pol_test)

Alpha: 0.08319937318664238
L1 Ratio: 0.7
# Iters: 32
Intercept: -5.172295579705981


MSE Train: 327.8059654834631
MAE Test: 17.375498152043754
MSE Test: 493.0342242180459
R2: 0.010576759549279835


MAE real [13.7378035]
MAE madeup [19.34068643]
MAE talking [18.55670745]


Mean-vector MAE Test: 19.8674717396795
Mean-vector MSE Test: 516.5873977208159
Mean-vector R2: -0.036689852189426775


In [38]:
elasticnetifyer(x_gen_train_formless, y_gen_train, x_gen_test_formless, y_gen_test, gen_test)

Alpha: 0.3109822127529715
L1 Ratio: 0.99
# Iters: 30
Intercept: -13.108990119160646


MSE Train: 1298.5679473768355
MAE Test: 29.675574225184594
MSE Test: 1108.4647117007096
R2: 0.1956247433962447


MAE madeup [23.90430818]
MAE real [38.41121527]
MAE talking [26.71119922]


Mean-vector MAE Test: 35.20711844649357
Mean-vector MSE Test: 1411.4821234797062
Mean-vector R2: -0.02426471793010454


In [35]:
elasticnetifyer(x_age_train_formless, y_age_train, x_age_test_formless, y_age_test, age_test)

Alpha: 0.0483303750010695
L1 Ratio: 0.05
# Iters: 21
Intercept: 12.582004171099673


MSE Train: 505.0297382646456
MAE Test: 15.861944755220998
MSE Test: 345.6672356059924
R2: -0.2224198162852662


MAE real [14.10345426]
MAE madeup [15.00685642]
MAE talking [18.47552359]


Mean-vector MAE Test: 17.188119883066268
Mean-vector MSE Test: 377.67235203479976
Mean-vector R2: -0.3356029141177903


In [39]:
elasticnetifyer(x_pol_train_formless, y_pol_train, x_pol_test_formless, y_pol_test, pol_test)

Alpha: 0.14005586221191413
L1 Ratio: 1.0
# Iters: 102
Intercept: -3.7604477095609585


MSE Train: 322.319403529407
MAE Test: 22.61523431617912
MSE Test: 647.0281906912072
R2: -0.2984590068001791


MAE real [20.69309151]
MAE madeup [23.65215119]
MAE talking [23.24123103]


Mean-vector MAE Test: 19.8674717396795
Mean-vector MSE Test: 516.5873977208159
Mean-vector R2: -0.036689852189426775


### Testing ground for nested CV

In [52]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)

for analysis_package in [(df_age, 'rating.mean_age'), (df_gender, 'rating.mean_gender'), (df_polarity, 'rating.mean_valence')]:
  df = analysis_package[0]
  rating = analysis_package[1]
  for train_index, test_index in skf.split(df, df[['name_type']]):
    x_train_unfasttexted = df.iloc[train_index]
    x_test_unfasttexted = df.iloc[test_index]


    y_train = df.iloc[train_index][rating]
    y_test = df.iloc[test_index][rating]

    x_train = fasttext_xifyer(x_train_unfasttexted)
    x_train_formless = fasttext_xifyer_formless(x_train_unfasttexted)

    x_test = fasttext_xifyer(x_test_unfasttexted)
    x_test_formless = fasttext_xifyer(x_test_unfasttexted)


    elasticnetifyer(x_train, y_train, x_test, y_test, df)
    elasticnetifyer(x_train_formless, y_train, x_test_formless, y_test, df)

hello


KeyboardInterrupt: ignored