In [None]:
!pip install fasttext

In [25]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split, StratifiedKFold
from sklearn.linear_model import ElasticNetCV
import fasttext

#path = "D:/Tilburg University/2021-2022/Thesis/Coca/"
#model_path = path + "models/"

#model = fasttext.load_model(model_path + "sentenced_dim300_ws2_minn2_maxn5.bin")

In [21]:
## When working in colab:
from google.colab import drive

# Mount Google Drive under /content/drive (remount is forced on every run).
drive.mount("/content/drive", force_remount=True) 

# Load the pre-trained fastText binary into the notebook-level `model`, which
# the embedding helpers (`fasttext_xifyer`, `fasttext_xifyer_formless`) read.
# NOTE(review): the model path is relative while the mount point is absolute,
# so this presumably relies on the working directory being /content - confirm.
model = fasttext.load_model("drive/MyDrive/Thesis/Data/fastText and others/sentenced_dim300_ws2_minn2_maxn5.bin")

Mounted at /content/drive




In [15]:
### Read CSV File and Delete Unimportant Columns
#names_ratings = read_csv("D:/Tilburg University/2021-2022/Thesis/Data analyses/Giovanni Email Data/avgRatings_annotated.csv")

names_ratings = read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv")

# How many names carry a rating on each dimension (age, gender, valence).
print(
    *(names_ratings[column].notna().sum()
      for column in ('rating.mean_age', 'rating.mean_gender', 'rating.mean_valence')),
    sep="\n",
)

# One frame per dimension: only the rated names, and only the columns needed
# downstream (name, mean rating, category label, name type).
df_age = names_ratings.dropna(subset=['rating.mean_age'])[
    ['name', 'rating.mean_age', 'age', 'name_type']
]
print(df_age.head(), df_age.shape[0])

df_gender = names_ratings.dropna(subset=['rating.mean_gender'])[
    ['name', 'rating.mean_gender', 'gender', 'name_type']
]
print(df_gender.head(), df_gender.shape[0])

df_polarity = names_ratings.dropna(subset=['rating.mean_valence'])[
    ['name', 'rating.mean_valence', 'polarity', 'name_type']
]
print(df_polarity.head(), df_polarity.shape[0])

119
179
63
       name  rating.mean_age    age name_type
0  Adelaide        -0.617647    old      real
2  Alasdair        18.709677  young      real
3   Alastor        13.812500    old    madeup
4    Alecto         3.593750    old    madeup
5     Alice       -13.969697  young      real 119
       name  rating.mean_gender  gender name_type
0  Adelaide           45.727273  female      real
1   Adelina           47.771429  female      real
2  Alasdair          -35.657143    male      real
3   Alastor          -38.833333    male    madeup
4    Alecto          -35.722222  female    madeup 179
        name  rating.mean_valence polarity name_type
1    Adelina            31.621622      bad      real
7    Amabala             5.935484     good    madeup
8      Apple            32.444444     good   talking
11  Arcturus           -11.166667     good    madeup
13   Arobynn             7.645161      bad    madeup 63


In [22]:
### Get repeatable train/test split with a seed

def fasttext_xifyer(input_data, embedding_model=None):
  """Embed the names in the first column of `input_data` with fastText.

  Parameters
  ----------
  input_data : pandas.DataFrame
      Frame whose first column holds the name strings; each name is
      lower-cased before it is looked up.
  embedding_model : optional
      Object exposing `get_dimension()` and `get_word_vector(word)` (the
      fastText model API). When omitted, the notebook-level `model` is used.

  Returns
  -------
  numpy.ndarray
      Array of shape (len(input_data), embedding dimension) with one row per
      name, in the row order of `input_data`.
  """
  ft_model = model if embedding_model is None else embedding_model

  # Pick the name column by position with .iloc: integer `[0]` on a
  # label-indexed row Series is deprecated positional access in recent pandas.
  names = input_data.iloc[:, 0]

  # Ask the model for its dimension instead of hard-coding 300.
  df_output = np.zeros((len(names), ft_model.get_dimension()))

  for i, name in enumerate(names):
    df_output[i] = ft_model.get_word_vector(name.lower())

  return df_output

################################################################################

# Age: 80/20 split with a fixed seed, stratified jointly on the age label and
# the name type; then embed the names and pull out the rating targets.
age_train, age_test = train_test_split(
    df_age,
    test_size=0.2,
    random_state=17042020,
    stratify=df_age[['age', 'name_type']],
)
x_age_train, x_age_test = fasttext_xifyer(age_train), fasttext_xifyer(age_test)
y_age_train, y_age_test = age_train['rating.mean_age'], age_test['rating.mean_age']

####################################################################################################################

# Gender: same recipe, stratified on gender label and name type.
gen_train, gen_test = train_test_split(
    df_gender,
    test_size=0.2,
    random_state=17042020,
    stratify=df_gender[['gender', 'name_type']],
)
x_gen_train, x_gen_test = fasttext_xifyer(gen_train), fasttext_xifyer(gen_test)
y_gen_train, y_gen_test = gen_train['rating.mean_gender'], gen_test['rating.mean_gender']

####################################################################################################################

# Polarity: same recipe, stratified on polarity label and name type.
pol_train, pol_test = train_test_split(
    df_polarity,
    test_size=0.2,
    random_state=17042020,
    stratify=df_polarity[['polarity', 'name_type']],
)
x_pol_train, x_pol_test = fasttext_xifyer(pol_train), fasttext_xifyer(pol_test)
y_pol_train, y_pol_test = pol_train['rating.mean_valence'], pol_test['rating.mean_valence']

# Sanity check: sizes of every train/test embedding matrix.
print(*map(len, (x_age_train, x_age_test, x_gen_train, x_gen_test, x_pol_train, x_pol_test)))

#del model

95 24 143 36 50 13


In [59]:
### Get repeatable train/test split with a seed, but without surface form

def fasttext_xifyer_formless(input_data, embedding_model=None):
  """Embed names from their subword n-grams only, leaving out the whole-word entry.

  For a name that the model returns as its own first subword (i.e. the
  surface form is known to the model), the embedding is the mean of the
  vectors looked up for the remaining subword strings. Any other name gets
  the ordinary fastText vector, exactly as in `fasttext_xifyer`.

  Parameters
  ----------
  input_data : pandas.DataFrame
      Frame whose first column holds the name strings; each name is
      lower-cased before it is looked up.
  embedding_model : optional
      Object exposing `get_dimension()`, `get_word_vector(word)` and
      `get_subwords(word)` (the fastText model API). When omitted, the
      notebook-level `model` is used.

  Returns
  -------
  numpy.ndarray
      Array of shape (len(input_data), embedding dimension) with one row per
      name, in the row order of `input_data`.
  """
  ft_model = model if embedding_model is None else embedding_model

  # Pick the name column by position with .iloc: integer `[0]` on a
  # label-indexed row Series is deprecated positional access in recent pandas.
  names = input_data.iloc[:, 0]

  # Ask the model for its dimension instead of hard-coding 300.
  df_output = np.zeros((len(names), ft_model.get_dimension()))

  for c, raw_name in enumerate(names):
    name = raw_name.lower()
    # Decompose once per name (previously recomputed three times).
    subwords = ft_model.get_subwords(name)[0]

    # Require at least one n-gram besides the word itself: averaging an empty
    # set would produce a NaN row, and an empty subword list would raise.
    if len(subwords) > 1 and subwords[0] == name:
      # NOTE(review): each n-gram string is looked up as if it were a word,
      # so the model may decompose it again. If the raw n-gram input vectors
      # are what is wanted, `get_input_vector` with the indices returned by
      # `get_subwords` looks like the matching API - confirm before changing,
      # since that would alter the reported results.
      ngram_vectors = [ft_model.get_word_vector(ngram) for ngram in subwords[1:]]
      df_output[c] = np.mean(ngram_vectors, axis=0)
    else:
      df_output[c] = ft_model.get_word_vector(name)

  return df_output

# Formless (n-gram only) embeddings for the age split.
x_age_train_formless, x_age_test_formless = map(fasttext_xifyer_formless, (age_train, age_test))

####################################################################################################################

# Formless embeddings for the gender split.
x_gen_train_formless, x_gen_test_formless = map(fasttext_xifyer_formless, (gen_train, gen_test))

####################################################################################################################

# Formless embeddings for the polarity split.
x_pol_train_formless, x_pol_test_formless = map(fasttext_xifyer_formless, (pol_train, pol_test))

# Sanity check: sizes of every formless train/test embedding matrix.
print(*map(len, (
    x_age_train_formless, x_age_test_formless,
    x_gen_train_formless, x_gen_test_formless,
    x_pol_train_formless, x_pol_test_formless,
)))

#del model

95 24 143 36 50 13


In [None]:
### Make a cross-validation loop testing a bunch of hyperparameters, all the while saving the outputs to a dict or something

In [97]:
def elasticnetifyer(x_train, y_train, x_test, y_test, test_full):
  """Fit an ElasticNetCV model on one fold and evaluate it on the held-out rows.

  Hyperparameters are selected by cross-validation with as many folds as
  there are training rows (cv = len(x_train)).

  Parameters
  ----------
  x_train, x_test : array-like of shape (n_rows, n_features)
      Embedding matrices for the training and test rows.
  y_train, y_test : array-like of shape (n_rows,)
      Rating targets matching `x_train` / `x_test`.
  test_full : pandas.DataFrame
      Frame with a 'name_type' column describing the test rows. When `y_test`
      is a Series whose index labels all occur in `test_full`, the name types
      are aligned to `y_test` by label, so passing either the test rows or the
      whole frame they were drawn from pairs every prediction with its own
      name type. Otherwise the pairing is positional.

  Returns
  -------
  tuple
      (alpha, l1_ratio, n_iters, intercept, mse_train, mae_test, mse_test, r2,
      mae_madeup, mae_real, mae_talking, mean_vec_mae_test, mean_vec_mse_test,
      mean_vec_r2). A per-name-type MAE is NaN when that name type does not
      occur among the test rows.
  """
  from sklearn.metrics import mean_absolute_error, mean_squared_error

  regr = ElasticNetCV(l1_ratio = [0.01, 0.05, .1, 0.2, .5, .7, .9, .95, .99, 1],
                      n_alphas = 250,
                      max_iter = 10000,
                      cv = len(x_train),
                      selection = 'random',
                      random_state=17042020,)

  regr.fit(x_train, y_train)

  alpha = regr.alpha_
  l1_ratio = regr.l1_ratio_
  n_iters = regr.n_iter_
  intercept = regr.intercept_

  # NOTE(review): this averages the whole cross-validation error path, not
  # only the entry for the selected alpha / l1_ratio - confirm that is intended.
  mse_train = mean(regr.mse_path_)

  # Predict once and reuse the result for every test metric.
  predictions = regr.predict(x_test)
  mae_test = mean_absolute_error(y_test, predictions)
  mse_test = mean_squared_error(y_test, predictions)

  r2 = regr.score(x_test, y_test)

  # Pair each prediction with the name type of its own row. Aligning by index
  # label avoids mismatches when `test_full` holds more rows than the fold.
  name_types = test_full['name_type']
  if (isinstance(y_test, pd.Series) and isinstance(name_types, pd.Series)
      and name_types.index.is_unique
      and y_test.index.isin(name_types.index).all()):
    name_types = name_types.loc[y_test.index]
  name_types = np.asarray(name_types)[:len(predictions)]

  abs_errors = np.abs(np.asarray(y_test, dtype=float) - predictions)[:len(name_types)]

  # Per-name-type MAE kept in a local dict: the earlier globals()-based version
  # raised NameError (or silently reused a stale value from a previous call)
  # whenever a name type was absent from the test rows.
  mae_by_type = {}
  for name_type in ('madeup', 'real', 'talking'):
    of_type = name_types == name_type
    mae_by_type[name_type] = float(abs_errors[of_type].mean()) if of_type.any() else float('nan')

  # Baseline: feed the mean training vector for every test row.
  mean_vec = np.mean(x_train, axis = 0)
  mean_vec_array = np.tile(mean_vec, (len(x_test), 1))
  mean_vec_predictions = regr.predict(mean_vec_array)

  mean_vec_mae_test = mean_absolute_error(y_test, mean_vec_predictions)
  mean_vec_mse_test = mean_squared_error(y_test, mean_vec_predictions)

  mean_vec_r2 = regr.score(mean_vec_array, y_test)

  return (alpha, l1_ratio, n_iters, intercept, mse_train, mae_test, mse_test, r2,
          mae_by_type['madeup'], mae_by_type['real'], mae_by_type['talking'],
          mean_vec_mae_test, mean_vec_mse_test, mean_vec_r2)

### Testing ground for nested CV

In [99]:
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)

def nested_cross_validator(df, rating, dictionary):
  """Run 5-fold evaluation of the regular and formless embeddings.

  The folds are stratified on 'name_type'. For every fold, the names are
  embedded with `fasttext_xifyer` and `fasttext_xifyer_formless`, each
  variant is passed to `elasticnetifyer`, and the resulting 14 values are
  appended as a list to `dictionary['regular']` / `dictionary['formless']`.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with the name in its first column, a 'name_type' column and the
      `rating` column.
  rating : str
      Name of the column holding the target ratings.
  dictionary : dict
      Must contain list values under the keys 'regular' and 'formless'; it is
      extended in place (one entry per fold and key). Nothing is returned.
  """
  skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=17042020)

  embedders = (('regular', fasttext_xifyer),
               ('formless', fasttext_xifyer_formless))

  # Stratify on the 1-D column rather than a single-column frame.
  for train_index, test_index in skf.split(df, df['name_type']):
    train_fold = df.iloc[train_index]
    test_fold = df.iloc[test_index]

    y_train = train_fold[rating]
    y_test = test_fold[rating]

    for key, embed in embedders:
      # Pass the test fold (not the whole frame) so the per-name-type errors
      # are computed against the name types of the rows actually predicted.
      fold_results = elasticnetifyer(embed(train_fold), y_train,
                                     embed(test_fold), y_test, test_fold)
      dictionary[key].append(list(fold_results))
    

In [None]:
# Collect per-fold results for the age ratings, one list per embedding variant.
age_dict = dict(regular=[], formless=[])

nested_cross_validator(df=df_age, rating='rating.mean_age', dictionary=age_dict)

In [None]:
# Dump the raw per-fold results collected for the age ratings.
print(age_dict)

In [None]:
# Collect per-fold results for the gender ratings, one list per embedding variant.
gender_dict = dict(regular=[], formless=[])

nested_cross_validator(df=df_gender, rating='rating.mean_gender', dictionary=gender_dict)

In [None]:
# Collect per-fold results for the valence ratings, one list per embedding variant.
polarity_dict = dict(regular=[], formless=[])

nested_cross_validator(df=df_polarity, rating='rating.mean_valence', dictionary=polarity_dict)