In [128]:
import pandas as pd
import numpy as np
from googletrans import Translator
import copy
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import torch
import os
import time

os.chdir('../InferSent/')
from models import InferSent

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load the profiles and edges information into two dataframes

In [129]:
# Column labels applied to the profiles file (passed via `names=` below).
# NOTE(review): the final 'huh' entry looks like a placeholder for an extra trailing field — confirm against the raw file.
columns = ['user_id', 'public', 'completion_percentage', 'gender', 'region', 'last_login', 'registration', 'age', 'body',
    'I_am_working_in_field', 'spoken_languages', 'hobbies', 'I_most_enjoy_good_food', 'pets', 'body_type', 'my_eyesight',
    'eye_color', 'hair_color', 'hair_type', 'completed_level_of_education', 'favourite_color', 'relation_to_smoking',
    'relation_to_alcohol', 'sign_in_zodiac', 'on_pokec_i_am_looking_for', 'love_is_for_me', 'relation_to_casual_sex',
    'my_partner_should_be', 'marital_status', 'children', 'relation_to_children', 'I_like_movies', 'I_like_watching_movie',
    'I_like_music', 'I_mostly_like_listening_to_music', 'the_idea_of_good_evening', 'I_like_specialties_from_kitchen',
    'fun', 'I_am_going_to_concerts', 'my_active_sports', 'my_passive_sports', 'profession', 'I_like_books', 'life_style',
    'music', 'cars', 'politics', 'relationships', 'art_culture', 'hobbies_interests', 'science_technologies',
    'computers_internet', 'education', 'sport', 'movies', 'travelling', 'health', 'companies_brands', 'more', 'huh']

# Both files are tab-separated; the paths are relative to the current working
# directory, which the import cell changes to ../InferSent/.
profiles = pd.read_csv('../data/pokec/soc-pokec-profiles.txt', sep='\t', names=columns)
# Each row of the relationships file is loaded as one (source, destination) pair.
edges = pd.read_csv('../data/pokec/soc-pokec-relationships.txt', sep='\t', names=['source', 'destination'])

  interactivity=interactivity, compiler=compiler, result=result)


### Keep only the columns of interest, drop rows with NaN in any of them, and build an aggregate sentence feature

In [266]:
# Free-text profile answers that get merged into one sentence per user
sentence_vars = ['I_like_movies', 'hobbies', 'children', 'profession']
# Columns carried through unchanged
numerical_vars = ['user_id', 'age','gender']

# Select the columns of interest, discard rows with a missing value in any of
# them, and keep only ages strictly between 5 and 75
profiles_ss = (
    profiles[numerical_vars + sentence_vars]
    .dropna()
    .query('age > 5 and age < 75')
    .reset_index(drop=True)
)

# Join the text answers with single spaces, then drop the individual text columns
profiles_ss = profiles_ss.assign(
    agg_sent=lambda df: df['I_like_movies'] + ' ' + df['hobbies'] + ' ' + df['children'] + ' ' + df['profession']
).drop(sentence_vars, axis=1)

profiles_ss.head(3)

Unnamed: 0,user_id,age,gender,agg_sent
0,16,23.0,1.0,"take co ma uputaju cestovanie, pocuvanie hudby..."
1,32,21.0,1.0,"akcne, horory, komedie, serialy, dokumentarne,..."
2,46,21.0,0.0,"horory, komedie, romanticke, serialy, rodinne ..."


### Reduce dataset size

In [267]:
# Do not shuffle: per the author's observation, the smaller the difference between
# two profiles' user_ids, the more likely those two profiles are friends.
shuffle = False

In [268]:
# share of the cleaned profiles to keep
fraction = 1/5

num_samples = profiles_ss.shape[0]
print("original number of samples: {:,}".format(num_samples))

# persist the full cleaned frame before it is cut down
profiles_ss.to_pickle('../data/pokec_cleaned/profiles_ss.pkl')

# optional random reordering of the rows (skipped while shuffle is False)
if shuffle:
    profiles_ss = profiles_ss.sample(frac=1).reset_index(drop=True)

# keep the leading `fraction` of the rows, then skip the first 1020 of those;
# the surviving rows keep their original labels (the index is not reset)
# NOTE(review): 1020 looks like a manual resume offset for the translation step — confirm
profiles_ss = profiles_ss.iloc[1020:int(num_samples*fraction)]

num_samples = profiles_ss.shape[0]
print("new number of samples: {:,}".format(num_samples))

profiles_ss.head(3)

original number of samples: 174,483
new number of samples: 33,876


Unnamed: 0,user_id,age,gender,agg_sent
1020,7209,23.0,1.0,neviem nemam na ne cas ale uz ked tak mam rad ...
1021,7226,14.0,1.0,michael jackson velmi rad pocuvam michaela jac...
1022,7210,18.0,0.0,"akcne, komedie, serialy pocuvanie hudby, tanco..."


### Translate the sentences in the agg_sent column of the dataframe

In [269]:
# number of translate() calls the sentences are spread over
split_amt = 1000
# sentences sent per call (whole-number division of the sample count)
segment_len = num_samples // split_amt

# when True, the translated dataframe is written to disk
save = True

print("number of sentences per translator.translate call: {:,}".format(segment_len))

number of sentences per translator.translate call: 33


In [270]:
agg_sent = list(profiles_ss['agg_sent'])

# Translated strings, accumulated batch by batch in the same order as agg_sent
translations_text = []
for i in range(split_amt):
    # a fresh client is created for every batch
    translator = Translator()
    try:
        # every batch holds segment_len sentences, except the last one, which
        # runs to the end of the list (slice end None) to pick up the remainder
        translations = translator.translate(
            agg_sent[i*segment_len:(None if i == split_amt - 1 else (i+1)*segment_len)]
        )
        # pause between requests; nothing follows the final batch, so no pause there
        if i != split_amt - 1:
            time.sleep(1)
        translations_text.extend([translation.text for translation in translations])
    except Exception as e:
        # best effort: report the failure and keep whatever was translated so far
        print(e)
        print('exiting for loop')
        break

Expecting value: line 1 column 1 (char 0)
exiting for loop


In [271]:
# Number of sentences that were translated before the translation loop stopped
len_translations = len(translations_text)
len_translations

528

In [272]:
# Keep only the rows that were actually translated. The explicit copy makes
# profiles_truncated an independent frame, so the column assignment below no longer
# writes to a slice of profiles_ss (the cause of pandas' SettingWithCopyWarning).
profiles_truncated = profiles_ss[:len_translations].copy()

# translations_text is in the same order as the rows of profiles_ss, so the
# positional list assignment lines up row for row
profiles_truncated['agg_sent_trans'] = translations_text

if save:
    # NOTE(review): the "1020" suffix appears to mirror the row offset used when
    # the dataset was reduced — keep the two in sync when resuming from another offset
    profiles_truncated.to_pickle('../data/pokec_cleaned/profiles_truncated1020.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


### Load the saved translated profile chunks

In [282]:
# Read back the pickled frames written by earlier translation runs.
# NOTE(review): the file names suggest runs resumed at row offsets 0, 510 and 1020 — confirm.
profs_1 = pd.read_pickle('../data/pokec_cleaned/profiles_truncated.pkl')
profs_2 = pd.read_pickle('../data/pokec_cleaned/profiles_truncated510.pkl')
profs_3 = pd.read_pickle('../data/pokec_cleaned/profiles_truncated1020.pkl')

# Stack the chunks row-wise; each chunk keeps its own row labels
profs_all = pd.concat([profs_1, profs_2, profs_3])
# Spot-check one translated sentence by row label
profs_all.loc[1544, 'agg_sent_trans']

'action, horror, comedy, sci-fi, family, box-office, but mainly cartoon: d sports, work with pc, pc games, web surfing, watching movies, discos, pool, cooking, cinema, party, sleeping, shopping, these are really many :-) yet none yet: d'

### Load the InferSent sentence embedding model and the word embedding model

In [142]:
# Value handed to InferSent as 'enc_lstm_dim'.
# NOTE(review): the encoding cell builds embedding_size*2 columns, so the encoded
# sentence vectors are presumably twice this wide — confirm.
embedding_size = 2048

# Selects which encoder checkpoint and which word-vector file are used below
model_version = 1
MODEL_PATH = "encoder/infersent%s.pickle" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': embedding_size,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
# NOTE(review): torch.load unpickles the checkpoint — only load files from a trusted source
model.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = 'dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else 'dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


### Encode the English sentences using the InferSent model

In [195]:
# Encode every translated sentence into a fixed-length vector (one row per sentence).
# NOTE(review): profiles_ss must already carry an 'agg_sent_trans' column here; the
# earlier cells attach that column to profiles_truncated / profs_all instead, so this
# cell relies on state a top-to-bottom run does not produce — confirm the intended frame.
enc_sents = model.encode(list(profiles_ss.agg_sent_trans), bsize=128, tokenize=False, verbose=True)
df_enc_sents = pd.DataFrame(enc_sents, columns=list(range(embedding_size*2)))
# df_enc_sents is labelled 0..n-1, so give the profile columns the same labels before
# joining side by side; concat(axis=1) aligns on the index and would otherwise produce
# NaN-padded rows whenever profiles_ss kept non-default row labels.
profiles_ss = pd.concat([profiles_ss[numerical_vars].reset_index(drop=True), df_enc_sents], axis=1)

### Find the average aggregate sentence embedding of each profile's friends and combine these new features with the original ones

In [245]:
def average_and_merge(profs, edges):
    """Append to each profile the mean feature vector of its friends.

    Parameters
    ----------
    profs : pd.DataFrame
        One row per profile. The index holds the profile ids (they must be
        unique but need not be 0..n-1) and every column must be numeric.
    edges : pd.DataFrame
        Directed friendships with columns ``source`` and ``destination``
        holding profile ids. Edges whose source or destination is not in
        ``profs.index`` are ignored.

    Returns
    -------
    pd.DataFrame
        ``profs`` followed by a second set of identically named columns that
        hold, for each profile, the column-wise mean over the profiles it
        points to. Profiles without any usable friend get NaN in those columns.
    """
    # Translate ids into row positions; get_indexer marks unknown ids with -1.
    src_pos = profs.index.get_indexer(edges['source'].values)
    dst_pos = profs.index.get_indexer(edges['destination'].values)
    keep = (src_pos >= 0) & (dst_pos >= 0)

    # One row of friend features per usable edge, averaged per source profile.
    friend_feats = profs.iloc[dst_pos[keep]]
    profs_avgs = friend_feats.groupby(src_pos[keep]).mean()

    # Re-expand to one row per profile (NaN where a profile has no friends)
    # and restore the original labels so the side-by-side concat lines up.
    profs_avgs = profs_avgs.reindex(range(len(profs)))
    profs_avgs.index = profs.index

    return pd.concat([profs, profs_avgs], axis=1)

# profee = average_and_merge(profs, edges)

### Scrap

In [35]:
# Toy single-row frame: 16 integer-labelled columns holding the values 4..19
l = [4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]
df2 = pd.DataFrame(columns=list(range(len(l))))

df2.loc[0] = l

# Toy directed edge list over node ids 1..7
edges_mini = pd.DataFrame({'source':[1,1,1,1,2,2,2,3,3,4,4,4,5,5,5,6,7,7], 'destination':[4,5,6,7,3,4,7,2,5,1,2,5,1,3,4,1,1,2]}, columns=['source', 'destination'])

# Toy 7x7 profile frame with row labels 1..7; row k holds the 7 consecutive
# integers starting at (k-1)*7
num_feats = 7
profs_mini = pd.DataFrame(columns=list(range(num_feats)))
for i in range(num_feats):
    profs_mini.loc[i+1] = list(range(i*num_feats, (i+1)*num_feats))

In [93]:
# Shuffle the rows of df1 and relabel them 0..n-1.
# NOTE(review): df1 is only created in a later cell, so this fails on a fresh top-to-bottom run.
df1 = df1.sample(frac=1).reset_index(drop=True)

In [205]:
# 3x3 toy feature frame with string column labels '1', '2', '3'
df1 = pd.DataFrame({'1':[1,2,3],'2':[4,5,6],'3':[7,8,9]}, columns=['1', '2', '3'])

In [248]:
# Name the index and shift every row label down by one.
# NOTE(review): not idempotent — each re-run of this cell shifts the labels again.
df1.index.name = 'u_id'
df1.index -= 1

In [254]:
# Toy edge list: each of the nodes 0..2 points at the other two.
# NOTE: rebinds df2, which an earlier scrap cell used for a different frame.
df2 = pd.DataFrame({'source':[0,0,1,1,2,2], 'destination':[1,2,0,2,0,1]}, columns=['source', 'destination'])

In [255]:
# Append, to each row of df1, the mean feature values of the rows it points to in df2
df3 = average_and_merge(df1, df2)

In [256]:
df1

Unnamed: 0_level_0,1,2,3
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,4,7
1,2,5,8
2,3,6,9


In [257]:
df2

Unnamed: 0,source,destination
0,0,1
1,0,2
2,1,0
3,1,2
4,2,0
5,2,1


In [258]:
df3

Unnamed: 0_level_0,1,2,3,1,2,3
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,4,7,2.5,5.5,8.5
1,2,5,8,2.0,5.0,8.0
2,3,6,9,1.5,4.5,7.5
