# 🗂️ Carga de ficheros

In [108]:
# Monting Google Drive
from google.colab import drive
drive.mount('/content/drive')

%cd "/content/drive/MyDrive/Deusto/Cuarto/NLP/NLP Grupo/Codigo"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1tbwtJBZplHfm91lceTlpEA9t4RR96dyM/NLP Grupo/Codigo


In [109]:
import pandas as pd
import json
df_tokens_trilogy = pd.read_csv("../BookNLP/Trilogia/trilogia.tokens", delimiter="\t")
df_entities_trilogy = pd.read_csv("../BookNLP/Trilogia/trilogia.entities", delimiter="\t")
df_quotes_trilogy = pd.read_csv("../BookNLP/Trilogia/trilogia.quotes", delimiter="\t")
df_supersense_trilogy = pd.read_csv("../BookNLP/Trilogia/trilogia.supersense", delimiter="\t")

In [110]:
with open ("../BookNLP/Trilogia/trilogia.book", "r") as f:
    book_data = json.load(f)

book_data.keys()
book_data['characters'][0].keys()

dict_keys(['agent', 'patient', 'mod', 'poss', 'id', 'g', 'count', 'mentions'])

In [111]:
book_data['characters'][20]['id']
df_quotes_trilogy[df_quotes_trilogy['char_id'] == 332]


Unnamed: 0,quote_start,quote_end,mention_start,mention_end,mention_phrase,char_id,quote
760,32509,32518,32497,32497,Clubs,332,"What are you really planning , Kelsier ?"
859,35436,35442,35424,35424,he,332,The Soother is gone ?
863,35497,35504,35505,35505,he,332,"I had to be sure ,"
864,35508,35524,35505,35505,he,332,Never can trust yourself when a Soother is ar...
867,35559,35600,35556,35556,Clubs,332,I do n’t like Soothers . It ’s not just Allom...
...,...,...,...,...,...,...,...
13469,554105,554108,554109,554109,Clubs,332,"Breeze ,"
13470,554116,554121,554109,554109,Clubs,332,Time to go .
13471,554145,554153,554154,554154,Clubs,332,Fall back to the harrying positions !
13473,554327,554330,554331,554331,Clubs,332,Damn !


# 🚹 Características de personajes

In [112]:
from collections import Counter

# Diccionario de tokens
def get_counter_from_dependency_list(dep_list):
    counter=Counter()
    for token in dep_list:
        term=token["w"]
        tokenGlobalIndex=token["i"]
        counter[term]+=1
    return counter

# Obtener información de personajes
def create_character_data(data, printTop):
    character_data = {}
    for character in data["characters"]:

        agentList=character["agent"]
        patientList=character["patient"]
        possList=character["poss"]
        modList=character["mod"]

        character_id=character["id"]
        count=character["count"]

        referential_gender_distribution=referential_gender_prediction="unknown"

        if character["g"] is not None and character["g"] != "unknown":
            referential_gender_distribution=character["g"]["inference"]
            referential_gender=character["g"]["argmax"]

        mentions=character["mentions"]
        proper_mentions=mentions["proper"]
        max_proper_mention=""

        #Let's create some empty lists that we can append to.
        poss_items = []
        agent_items = []
        patient_items = []
        mod_items = []

        # just print out information about named characters
        if len(mentions["proper"]) > 0:
            max_proper_mention=mentions["proper"][0]["n"]
            for k, v in get_counter_from_dependency_list(possList).most_common(printTop):
                poss_items.append((v,k))

            for k, v in get_counter_from_dependency_list(agentList).most_common(printTop):
                agent_items.append((v,k))

            for k, v in get_counter_from_dependency_list(patientList).most_common(printTop):
                patient_items.append((v,k))

            for k, v in get_counter_from_dependency_list(modList).most_common(printTop):
                mod_items.append((v,k))




            # print(character_id, count, max_proper_mention, referential_gender)
            character_data[character_id] = {"id": character_id,
                                  "count": count,
                                  "max_proper_mention": max_proper_mention,
                                  "referential_gender": referential_gender,
                                  "possList": poss_items,
                                  "agentList": agent_items,
                                  "patientList": patient_items,
                                  "modList": mod_items
                                 }

    return character_data

def print_character_data(character_data) :
    print(f'ID: {character_data["id"]}')
    print(f'Apariciones: {character_data["count"]}')
    print(f"Nombre que aparece más frecuentemente: {character_data['max_proper_mention']}")
    print(f"Lista de posesión: {character_data['possList']}")
    print(f"Lista agente: {character_data['agentList']}")
    print(f"Lista paciente: {character_data['patientList']}")
    print(f"Modificadores: {character_data['modList']}")

In [113]:
character_data = create_character_data(book_data, 1000)

In [114]:
print_character_data(character_data[313])
character_data.keys()

ID: 313
Apariciones: 23236
Nombre que aparece más frecuentemente: Vin
Lista de posesión: [(199, 'head'), (169, 'eyes'), (103, 'mind'), (86, 'feet'), (74, 'body'), (65, 'hand'), (57, 'side'), (47, 'shoulder'), (44, 'pewter'), (40, 'way'), (40, 'arm'), (40, 'arms'), (39, 'fingers'), (37, 'life'), (37, 'hands'), (35, 'tin'), (34, 'teeth'), (32, 'face'), (32, 'dress'), (30, 'chest'), (29, 'brother'), (28, 'friends'), (28, 'chair'), (27, 'own'), (27, 'hair'), (26, 'emotions'), (26, 'ears'), (25, 'opponent'), (24, 'father'), (23, 'knees'), (22, 'instincts'), (22, 'metals'), (20, 'attention'), (20, 'copper'), (20, 'power'), (19, 'cloak'), (19, 'mother'), (19, 'legs'), (19, 'Push'), (19, 'pouch'), (18, 'strength'), (18, 'Allomancy'), (18, 'mistcloak'), (18, 'stomach'), (18, 'dagger'), (17, 'cheek'), (17, 'Luck'), (17, 'shirt'), (17, 'table'), (17, 'clothing'), (17, 'senses'), (17, 'daggers'), (17, 'bronze'), (16, 'voice'), (15, 'steel'), (15, 'earring'), (15, 'place'), (15, 'right'), (14, 'ski

dict_keys([313, 376, 382, 298, 434, 328, 329, 334, 627, 516, 346, 813, 624, 360, 790, 640, 339, 332, 583, 468, 295, 817, 316, 445, 603, 333, 816, 777, 442, 585, 477, 324, 301, 492, 419, 311, 323, 807, 797, 314, 798, 604, 824, 303, 680, 632, 795, 554, 385, 292, 482, 553, 349, 357, 325, 315, 635, 845, 584, 521, 542, 800, 383, 456, 440, 741, 470, 374, 910, 713, 300, 321, 868, 342, 792, 502, 319, 386, 540, 571, 600, 418, 375, 430, 637, 327, 779, 302, 793, 535, 885, 399, 586, 734, 391, 427, 581, 806, 354, 642, 933, 387, 849, 489, 447, 483, 852, 208, 647, 864, 491, 474, 499, 670, 359, 850, 305, 336, 397, 428, 394, 396, 435, 563, 318, 345, 348, 884, 365, 416, 425, 488, 515, 522, 561, 677, 698, 350, 398, 413, 451, 462, 556, 576, 639, 715, 732, 761, 125, 904, 320, 9, 503, 720, 794, 818, 853, 378, 525, 605, 607, 620, 835, 854, 925, 363, 393, 405, 469, 587, 590, 10803, 708, 726, 746, 755, 304, 330, 368, 42, 409, 439, 458, 479, 485, 517, 528, 538, 560, 564, 572, 591, 601, 615, 659, 711, 763, 810, 

# 📕 Creación del dataset

In [115]:
import random

paths = ["../BookNLP/TheFinalEmpire/finalempire.book", "../BookNLP/TheWellOfAscension/wellascension.book", "../BookNLP/TheHeroOfAges/heroages.book", "../BookNLP/Trilogia/trilogia.book"]
# paths = ["../BookNLP/Trilogia/trilogia.book"]
# paths = ["../BookNLP/TheFinalEmpire/finalempire.book", "../BookNLP/TheWellOfAscension/wellascension.book", "../BookNLP/TheHeroOfAges/heroages.book"]

columns = ["local_idx", "char", "apar", "mods", "len_mods"]
rows_processed = []
rows_raw = []

for path in paths:
    with open(path, "r") as f:
        book_data = json.load(f)

    character_data = create_character_data(book_data, 1000)

    for char in character_data:
        char_mods_processed = [word[1] for word in character_data[char]['modList']]
        char_mods_raw = []

        # Básicamente podríamos hacer un for que por cada word en lugar de meter word[1] meta [word[1] -> word[0] veces]
        for times, word in character_data[char]['modList']:
          for i in range(times):
            char_mods_raw.append(word)

        random.shuffle(char_mods_processed)
        random.shuffle(char_mods_raw)

        for i in range((len(char_mods_processed) // 20) + 1):
            group_mods = char_mods_processed[i * 20 : (i * 20) + 20]
            group_mods_str = " ".join(group_mods)
            data = [
                character_data[char]['id'],
                character_data[char]['max_proper_mention'],
                character_data[char]['count'],
                group_mods_str,
                len(group_mods),
            ]
            rows_processed.append(data)

        for i in range((len(char_mods_raw) // 20) + 1):
            group_mods = char_mods_raw[i * 20 : (i * 20) + 20]
            group_mods_str = " ".join(group_mods)
            data = [
                character_data[char]['id'],
                character_data[char]['max_proper_mention'],
                character_data[char]['count'],
                group_mods_str,
                len(group_mods),
            ]
            rows_raw.append(data)

character_df = pd.DataFrame(rows_processed, columns=columns)
character_df_raw = pd.DataFrame(rows_raw, columns=columns)


In [116]:
# Filtro los que no tengan +10 mods
character_df = character_df[character_df['len_mods'] > 10]
print(len(character_df))
character_df.head(15)

160


Unnamed: 0,local_idx,char,apar,mods,len_mods
0,138,Vin,8743,Sazed strains amazed spy old harsh weak fine I...,20
1,138,Vin,8743,dizzy idiot glad same hesitant heavier life yo...,20
2,138,Vin,8743,slow hungry wrong invincible more asleep certa...,20
3,138,Vin,8743,able sorry eager shocked loath grateful impres...,20
4,138,Vin,8743,creature ignorant jumpy lenient accustomed sym...,20
6,124,Kelsier,5479,proud one troublemaker effective confident cen...,20
7,124,Kelsier,5479,tall serious noblemen insa close sorry happier...,20
8,124,Kelsier,5479,ignorant delighted best surprised hope leader ...,20
9,124,Kelsier,5479,reticent sure miscreant invincible crazy more ...,20
11,120,Elend,1691,anarchist observant soldier fond dear stealthy...,20


In [117]:
character_df_raw = character_df_raw[character_df_raw['len_mods'] > 10]
print(len(character_df_raw))
character_df_raw.head(15)

285


Unnamed: 0,local_idx,char,apar,mods,len_mods
0,138,Vin,8743,one certain able sorry hungry uncomfortable su...,20
1,138,Vin,8743,lenient strong sympathetic able slave surprise...,20
2,138,Vin,8743,one friendly alive right certain alone noble c...,20
3,138,Vin,8743,able fine wrong person same accustomed right w...,20
4,138,Vin,8743,fool unprepared inferior fascinating weak glad...,20
5,138,Vin,8743,weak close timid able able patient young safe ...,20
6,138,Vin,8743,able afraid confident worried grateful sure ab...,20
7,138,Vin,8743,slow right wrong glad strains guest noble able...,20
8,138,Vin,8743,able fine able dead graceful old certain about...,20
9,138,Vin,8743,able one skaa interested surprised careful wro...,20


In [118]:
set(character_df['char'])

{'Beldre',
 'Breeze',
 'Camon',
 'Cett',
 'Clubs',
 'Demoux',
 'Dockson',
 'Elend',
 'Ham',
 'Jastes',
 'Kell',
 'Kelsier',
 'Marsh',
 'Mennis',
 'OreSeur',
 'Penrod',
 'Quellion',
 'Renoux',
 'Ruin',
 'Sazed',
 'Spook',
 'Straff',
 'Telden',
 'TenSoon',
 'Tindwyl',
 'Valette',
 'Vin',
 'Yeden',
 'Yomen',
 'Zane'}

In [119]:
set(character_df_raw['char'])

{'Alendi',
 'Beldre',
 'Breeze',
 'Camon',
 'Cett',
 'Clubs',
 'Demoux',
 'Dockson',
 'Elend',
 'Fatren',
 'Ham',
 'Jastes',
 'Kell',
 'Kelsier',
 'Marsh',
 'Mennis',
 'OreSeur',
 'Penrod',
 'Preservation',
 'Quellion',
 'Rashek',
 'Renoux',
 'Ruin',
 'Sazed',
 'Spook',
 'Straff',
 'Telden',
 'TenSoon',
 'Tindwyl',
 'Valette',
 'Vin',
 'Yeden',
 'Yomen',
 'Zane'}

In [120]:
character_df['char'].value_counts()

Vin         32
Elend       24
Sazed       14
Kelsier     11
Spook        8
Breeze       7
Marsh        5
Straff       5
Ham          5
Zane         4
Tindwyl      4
Cett         4
Yomen        4
Dockson      3
Telden       2
Quellion     2
TenSoon      2
Beldre       2
Ruin         2
Demoux       2
OreSeur      2
Jastes       2
Penrod       2
Clubs        2
Valette      2
Renoux       2
Yeden        2
Camon        2
Kell         1
Mennis       1
Name: char, dtype: int64

In [121]:
character_df_raw['char'].value_counts()

Vin             71
Elend           50
Sazed           30
Kelsier         23
Spook           12
Breeze           8
Marsh            8
Cett             7
Straff           7
Zane             6
Yomen            6
Ham              6
TenSoon          4
Tindwyl          4
OreSeur          4
Dockson          4
Ruin             4
Demoux           3
Quellion         2
Preservation     2
Telden           2
Beldre           2
Clubs            2
Alendi           2
Jastes           2
Penrod           2
Valette          2
Renoux           2
Yeden            2
Camon            2
Fatren           1
Rashek           1
Kell             1
Mennis           1
Name: char, dtype: int64

In [122]:
remaining_chars = set(character_df_raw['char']).difference(set(character_df['char']))
print(remaining_chars)

{'Fatren', 'Rashek', 'Preservation', 'Alendi'}


In [123]:
character_df.drop(['local_idx', 'apar'], axis=1, inplace=True)

final_mistborn_df = character_df
final_mistborn_df = final_mistborn_df.sort_index()

In [124]:
character_df_raw.drop(['local_idx', 'apar'], axis=1, inplace=True)

final_mistborn_df_raw = character_df_raw
final_mistborn_df_raw = final_mistborn_df_raw.sort_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  character_df_raw.drop(['local_idx', 'apar'], axis=1, inplace=True)


In [125]:
final_mistborn_df.shape

(160, 3)

In [126]:
final_mistborn_df_raw.shape

(285, 3)

In [127]:
final_mistborn_df.head(30)

Unnamed: 0,char,mods,len_mods
0,Vin,Sazed strains amazed spy old harsh weak fine I...,20
1,Vin,dizzy idiot glad same hesitant heavier life yo...,20
2,Vin,slow hungry wrong invincible more asleep certa...,20
3,Vin,able sorry eager shocked loath grateful impres...,20
4,Vin,creature ignorant jumpy lenient accustomed sym...,20
6,Kelsier,proud one troublemaker effective confident cen...,20
7,Kelsier,tall serious noblemen insa close sorry happier...,20
8,Kelsier,ignorant delighted best surprised hope leader ...,20
9,Kelsier,reticent sure miscreant invincible crazy more ...,20
11,Elend,anarchist observant soldier fond dear stealthy...,20


In [128]:
final_mistborn_df_raw.head(30)

Unnamed: 0,char,mods,len_mods
0,Vin,one certain able sorry hungry uncomfortable su...,20
1,Vin,lenient strong sympathetic able slave surprise...,20
2,Vin,one friendly alive right certain alone noble c...,20
3,Vin,able fine wrong person same accustomed right w...,20
4,Vin,fool unprepared inferior fascinating weak glad...,20
5,Vin,weak close timid able able patient young safe ...,20
6,Vin,able afraid confident worried grateful sure ab...,20
7,Vin,slow right wrong glad strains guest noble able...,20
8,Vin,able fine able dead graceful old certain about...,20
9,Vin,able one skaa interested surprised careful wro...,20


In [129]:
import numpy as np

# Etiquetado a mano

char_to_target_mapping = {
    'Elend': 'infp',
    'Vin': 'istp',
    'Beldre' : 'esfj',
    'Breeze' : 'esfj',
    'Camon' : 'estp',
    'Cett' : 'estp',
    'Clubs' : 'istj',
    'Demoux' : 'isfj',
    'Dockson' : 'istj',
    'Ham' : 'entp',
    'Jastes' : 'isfp',
    'Kell' : 'entp',
    'Kelsier' : 'entp',
    'Marsh' : 'istj',
    'Mennis' : 'enfj',
    'OreSeur' : 'istj',
    'Penrod' : 'istj',
    'Quellion' : 'estj',
    'Renoux' : 'istj',
    'Ruin' : 'entp',
    'Sazed' : 'infj',
    'Spook' : 'isfp',
    'Straff' : 'entj',
    'Telden' : 'esfp',
    'TenSoon' : 'istj',
    'Tindwyl' : 'estj',
    'Valette' : 'istp',
    'Yeden' : 'istj',
    'Yomen' : 'istj',
    'Zane' : 'intj',
    'Alendi' : 'enfj',# For Raw version
    'Rashek' : 'intj',
    'Fatren' : 'estj',
    'Preservation' : 'infp'
}

final_mistborn_df['target'] = np.vectorize(char_to_target_mapping.get)(final_mistborn_df['char'])
final_mistborn_df_raw['target'] = np.vectorize(char_to_target_mapping.get)(final_mistborn_df_raw['char'])

In [130]:
final_mistborn_df.head(30)

Unnamed: 0,char,mods,len_mods,target
0,Vin,Sazed strains amazed spy old harsh weak fine I...,20,istp
1,Vin,dizzy idiot glad same hesitant heavier life yo...,20,istp
2,Vin,slow hungry wrong invincible more asleep certa...,20,istp
3,Vin,able sorry eager shocked loath grateful impres...,20,istp
4,Vin,creature ignorant jumpy lenient accustomed sym...,20,istp
6,Kelsier,proud one troublemaker effective confident cen...,20,entp
7,Kelsier,tall serious noblemen insa close sorry happier...,20,entp
8,Kelsier,ignorant delighted best surprised hope leader ...,20,entp
9,Kelsier,reticent sure miscreant invincible crazy more ...,20,entp
11,Elend,anarchist observant soldier fond dear stealthy...,20,infp


In [131]:
final_mistborn_df_raw.head(30)

Unnamed: 0,char,mods,len_mods,target
0,Vin,one certain able sorry hungry uncomfortable su...,20,istp
1,Vin,lenient strong sympathetic able slave surprise...,20,istp
2,Vin,one friendly alive right certain alone noble c...,20,istp
3,Vin,able fine wrong person same accustomed right w...,20,istp
4,Vin,fool unprepared inferior fascinating weak glad...,20,istp
5,Vin,weak close timid able able patient young safe ...,20,istp
6,Vin,able afraid confident worried grateful sure ab...,20,istp
7,Vin,slow right wrong glad strains guest noble able...,20,istp
8,Vin,able fine able dead graceful old certain about...,20,istp
9,Vin,able one skaa interested surprised careful wro...,20,istp


In [132]:
final_mistborn_df.to_csv('../Datasets/mistborn_booknlp_dataset.csv', index=False)
final_mistborn_df_raw.to_csv('../Datasets/mistborn_booknlp_dataset_raw.csv', index=False)

In [133]:
final_mistborn_df.shape

(160, 4)

In [134]:
final_mistborn_df_raw.shape

(285, 4)

# 🔥 Ensamblaje de datasets

In [135]:
mistborn_final_df = pd.read_csv('../Datasets/mistborn_booknlp_dataset.csv')
personalities_final_df = pd.read_csv('../Datasets/personalities_16_packs_20_words_classification/p_16_top_20_words_few_plus_classification.csv')

In [136]:
mistborn_final_df_raw = pd.read_csv('../Datasets/mistborn_booknlp_dataset_raw.csv')
personalities_final_df_raw = pd.read_csv('../Datasets/personalities_16_packs_20_words_classification/p_16_raw_20_words_few_plus_classification.csv')

In [137]:
mistborn_final_df.drop(['len_mods'], axis = 1, inplace=True)

personalities_final_df['char'] = 'NA'
# personalities_final_df.rename(columns={'Type':'target', 'Top_Words':'mods'}, inplace=True)
personalities_final_df = personalities_final_df[['char','mods','target']]

In [138]:
mistborn_final_df_raw.drop(['len_mods'], axis = 1, inplace=True)

personalities_final_df_raw['char'] = 'NA'
# personalities_final_df.rename(columns={'Type':'target', 'Top_Words':'mods'}, inplace=True)
personalities_final_df_raw = personalities_final_df_raw[['char','mods','target']]

In [139]:
print(mistborn_final_df.columns)
print(personalities_final_df.columns)

mistborn_final_df.head(3)

Index(['char', 'mods', 'target'], dtype='object')
Index(['char', 'mods', 'target'], dtype='object')


Unnamed: 0,char,mods,target
0,Vin,Sazed strains amazed spy old harsh weak fine I...,istp
1,Vin,dizzy idiot glad same hesitant heavier life yo...,istp
2,Vin,slow hungry wrong invincible more asleep certa...,istp


In [140]:
print(mistborn_final_df_raw.columns)
print(personalities_final_df_raw.columns)

mistborn_final_df_raw.head(3)

Index(['char', 'mods', 'target'], dtype='object')
Index(['char', 'mods', 'target'], dtype='object')


Unnamed: 0,char,mods,target
0,Vin,one certain able sorry hungry uncomfortable su...,istp
1,Vin,lenient strong sympathetic able slave surprise...,istp
2,Vin,one friendly alive right certain alone noble c...,istp


In [141]:
personalities_final_df.head(3)

Unnamed: 0,char,mods,target
0,,negative guide whole personally get overcommit...,enfj
1,,purpose born tend much greater find overly fee...,enfj
2,,one speaking thrive maintain empathetic insigh...,enfj


In [142]:
personalities_final_df_raw.head(3)

Unnamed: 0,char,mods,target
0,,make roles forthright talking rarely strive wi...,enfj
1,,ones see getting opportunity judging educate f...,enfj
2,,ideas motivating relationships voice like sens...,enfj


In [143]:
classify_char_data = pd.concat([mistborn_final_df, personalities_final_df])
classify_char_data_raw = pd.concat([mistborn_final_df_raw, personalities_final_df_raw])

In [144]:
from sklearn.model_selection import train_test_split

# Ahora divido en train, val, test
train_size = 0.7
val_size = 0.15
test_size = 0.15

train_df, temp_df = train_test_split(classify_char_data, test_size=(val_size + test_size), random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=(test_size / (val_size + test_size)), random_state=42)

# Add a 'split' column to indicate the split
train_df['split'] = 'train'
val_df['split'] = 'val'
test_df['split'] = 'test'

# Concatenate the DataFrames back together
final_classify_char_data = pd.concat([train_df, val_df, test_df])
final_classify_char_data = final_classify_char_data.sort_index()

In [145]:
train_df_raw, temp_df_raw = train_test_split(classify_char_data_raw, test_size=(val_size + test_size), random_state=42)
val_df_raw, test_df_raw = train_test_split(temp_df_raw, test_size=(test_size / (val_size + test_size)), random_state=42)

# Add a 'split' column to indicate the split
train_df_raw['split'] = 'train'
val_df_raw['split'] = 'val'
test_df_raw['split'] = 'test'

# Concatenate the DataFrames back together
final_classify_char_data_raw = pd.concat([train_df_raw, val_df_raw, test_df_raw])
final_classify_char_data_raw = final_classify_char_data_raw.sort_index()

In [146]:
final_classify_char_data.to_csv('../Datasets/dataset_clasificador_final/classify_char_processed.csv', index=False)
print(final_classify_char_data.shape)
final_classify_char_data.head(20)

(320, 4)


Unnamed: 0,char,mods,target,split
0,Vin,Sazed strains amazed spy old harsh weak fine I...,istp,train
0,,negative guide whole personally get overcommit...,enfj,train
1,Vin,dizzy idiot glad same hesitant heavier life yo...,istp,train
1,,purpose born tend much greater find overly fee...,enfj,train
2,,one speaking thrive maintain empathetic insigh...,enfj,train
2,Vin,slow hungry wrong invincible more asleep certa...,istp,train
3,Vin,able sorry eager shocked loath grateful impres...,istp,test
3,,persuading motivating coworker mission walk en...,enfj,train
4,Vin,creature ignorant jumpy lenient accustomed sym...,istp,train
4,,insight theres sure allow among relationships ...,enfj,train


In [147]:
final_classify_char_data_raw.to_csv('../Datasets/dataset_clasificador_final/classify_char_raw.csv', index=False)
print(final_classify_char_data_raw.shape)
final_classify_char_data_raw.head(20)

(1302, 4)


Unnamed: 0,char,mods,target,split
0,,make roles forthright talking rarely strive wi...,enfj,test
0,Vin,one certain able sorry hungry uncomfortable su...,istp,train
1,Vin,lenient strong sympathetic able slave surprise...,istp,train
1,,ones see getting opportunity judging educate f...,enfj,test
2,Vin,one friendly alive right certain alone noble c...,istp,train
2,,ideas motivating relationships voice like sens...,enfj,train
3,Vin,able fine wrong person same accustomed right w...,istp,train
3,,problems purpose map strengths coaches humanit...,enfj,train
4,Vin,fool unprepared inferior fascinating weak glad...,istp,train
4,,train help sixteen reliable arrows heart using...,enfj,val


## Datasets where train and test are P16 & Books respectively

In [148]:
mistborn_test_df_raw = mistborn_final_df_raw.copy()
personalities_train_df_raw = personalities_final_df_raw.copy()

In [149]:
mistborn_test_df_raw["split"] = 'test'
indexes_for_val = list(range(0, len(mistborn_test_df_raw)))
random.shuffle(indexes_for_val)
mistborn_test_df_raw.loc[indexes_for_val[0:len(indexes_for_val)//2], "split"] = 'val'

personalities_train_df_raw["split"] = 'train'

In [150]:
mistborn_test_df_raw.head(5)

Unnamed: 0,char,mods,target,split
0,Vin,one certain able sorry hungry uncomfortable su...,istp,test
1,Vin,lenient strong sympathetic able slave surprise...,istp,val
2,Vin,one friendly alive right certain alone noble c...,istp,test
3,Vin,able fine wrong person same accustomed right w...,istp,val
4,Vin,fool unprepared inferior fascinating weak glad...,istp,test


In [151]:
personalities_train_df_raw.head(5)

Unnamed: 0,char,mods,target,split
0,,make roles forthright talking rarely strive wi...,enfj,train
1,,ones see getting opportunity judging educate f...,enfj,train
2,,ideas motivating relationships voice like sens...,enfj,train
3,,problems purpose map strengths coaches humanit...,enfj,train
4,,train help sixteen reliable arrows heart using...,enfj,train


In [153]:
classify_mistTest_p16Train_raw = pd.concat([mistborn_test_df_raw, personalities_train_df_raw])
classify_mistTest_p16Train_raw.to_csv('../Datasets/dataset_clasificador_final/classify_mistTest_p16Train_raw.csv', index=False)
print(classify_mistTest_p16Train_raw.shape)
classify_mistTest_p16Train_raw.head(10)

(1302, 4)


Unnamed: 0,char,mods,target,split
0,Vin,one certain able sorry hungry uncomfortable su...,istp,test
1,Vin,lenient strong sympathetic able slave surprise...,istp,val
2,Vin,one friendly alive right certain alone noble c...,istp,test
3,Vin,able fine wrong person same accustomed right w...,istp,val
4,Vin,fool unprepared inferior fascinating weak glad...,istp,test
...,...,...,...,...
710,,present unconventional makes get intuitive tri...,intp,train
711,,four comfortably everyday tend worst concepts ...,intp,train
712,,times introverted granted environment problems...,intp,train
713,,conversations understanding resurface exceptio...,intp,train
