In [2]:
import json 
import numpy as np
import pandas as pd

#!pip install fasttext
#!wget -O ./data/lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
import fasttext
import re

import string

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## [WH+TW] Create datasets

In [5]:
def get_whisper(filename):
    serial = []
    text = []
    for line in open(filename, 'r'):
        new_data = json.loads(line)
        serial.append(new_data["serial"])
        text.append(new_data["text"])
    return pd.DataFrame({"serial": serial, "text": text})

def get_twitter(filename):
    text = []
    for line in open(filename, 'r'):
        splitted = line.split('\t')
        if len(splitted) == 4:
            text.append(splitted[2])
    return pd.DataFrame({"text": text})

def is_english(text):
    dl = lang_model.predict(text)[0][0][-2:]
    return dl == 'en'

def replace_if_lang(df, text):
    def func(row):
        if is_english(row["text"]):
            return row["text"]
        return ""
    df['text'] = df.apply(func, axis=1).tolist()
    return df

def replace_in_df(df, regex, text):
    def func(row):
        return re.sub(regex, text, row["text"])
    df['text'] = df.apply(func, axis=1).tolist()
    return df

def clean_tweets(df, min_length=8):
    # delete the entire tweet (1/2)
    df = replace_in_df(df, r".*RT @.*", "") # retweets
    df = replace_in_df(df, r".*\[\[.*", "") # placeholders
    # delete only a part of the tweet
    df = replace_in_df(df, r"http(s?)://\S+", "") # links
    df = replace_in_df(df, r"@\S+", "") # mentions
    df = replace_in_df(df, r"#\S+", "") # hashtags
    df = replace_in_df(df, r"\n", " ") # newline
    df = replace_in_df(df, r"  +", " ") # multiple spaces
    df = replace_in_df(df, r"^ ", "") # space at the begin of line
    df = replace_in_df(df, r" $", "") # space at the end of line
    # delete the entire tweet (2/2)
    df = replace_if_lang(df, "") # texts not in English
    df = df[df['text'].map(len) >= min_length] # short text
    return df.drop_duplicates()

def get_datasets(whisper_filename, twitter_filename):
    def get_dataset(filename, func, cls):
        data = func(filename)
        data["class"] = cls
        data = data[['text', 'class']]
        return clean_tweets(data)
    data_w = get_dataset(whisper_filename, get_whisper, "sens")
    data_t = get_dataset(twitter_filename, get_twitter, "ns")
    return data_w, data_t

def save_samples_data(num_samples, df_sens, df_ns, num_sens, num_ns, filename_prefix):
    for count in range(num_samples):
        df_sens[(num_sens*count):(num_sens*(count+1))].append(df_ns[(num_ns*count):(num_ns*(count+1))]).sample(frac=1, random_state=2**count).reset_index(drop=True).to_csv(filename_prefix+("0"+str(count+1))[-2:]+".csv", index=False)
    return

lang_model = fasttext.load_model('./data/lid.176.bin')




In [None]:
df_sens, df_ns = get_datasets('/data/whisper/final-anonymized.jun14.jun16.whisper-part-000.json', 
                '/data/twitter-cikm-2010/test_set_tweets.txt')

In [92]:
df_sens

Unnamed: 0,text,class
0,My mom just sold my fucking bed without tellin...,sens
1,Me and this guy had a full heart to heart conv...,sens
3,NJ women message me,sens
4,Who am I even anymore?,sens
5,I'm a post man. N would love a lonely houswife...,sens
...,...,...
3595080,because of tumblr girls I wish I wasn't my race,sens
3595081,Still looking to find that first attractive gi...,sens
3595082,Just dropped my phone on my face three times w...,sens
3595083,"I never really appreciated my mom until, as a ...",sens


In [93]:
df_ns

Unnamed: 0,text,class
0,Ok today I have to find something to wear for ...,ns
1,I am glad I'm having this show but I can't wai...,ns
2,Honestly I don't even know what's going on any...,ns
3,hey sorry I'm sitting infront of this sewing m...,ns
4,Sitting infront of this sewing machine ... I d...,ns
...,...,...
5108193,Where's the spot,ns
5108197,I need 2 labtops. 1 for playin music n 1 4 the...,ns
5108198,whip like a slave is that song u was askin bou...,ns
5108201,das rite b4 eastternparkway rite (comin from m...,ns


In [88]:
save_samples_data(10, df_sens.sample(frac=1, replace=False, random_state=512), df_ns.sample(frac=1, replace=False, random_state=256), 3336, 5429, "./data/sample_ann2_")    


## Dataset Analysis

In [8]:
import itertools
import collections

def get_data(path, lim=2):
    data = pd.read_csv(path, sep=',', error_bad_lines=False)
    res = {"uri":[], "text":[], "class":[]}
    for tag in data.columns[2:]:
        tmp = data[data[tag] >= lim]
        res["uri"] += tmp["uri"].tolist()
        res["text"] += tmp["text"].tolist()
        res["class"] += [tag]*len(tmp)
    res = pd.DataFrame(res).sort_values(by="uri")
    return res

# keep only "Sensibile" and "Non sensibile" as "sens" and "ns"
def get_two_classes(df):
    res = df[df["class"].isin(["Sensibile", "Non sensibile"])].copy()
    res.loc[res["class"] == "Sensibile", "class"] = "sens"
    res.loc[res["class"] == "Non sensibile", "class"] = "ns"
    return res

STOPWORDS = set(stopwords.words("english"))
STEMMER = SnowballStemmer('english')
LEMMATIZER = WordNetLemmatizer()

def clean_text(text):
    ## Remove puncuation
    text = text.translate(string.punctuation)
    text = word_tokenize(text)  # tokenize text
    ## Remove english stop words
    #stop_words = set(stopwords.words("english"))

    text = [w for w in text if w not in STOPWORDS and w.lower() not in STOPWORDS]  # remove stop words
    text = " ".join(text)

    # Clean the text
    text = re.sub("[^a-zA-z0-9\s]", " ", text)
    # text = re.sub("[^a-zA-Z#]", " ", text)
    text = word_tokenize(text)  # tokenize text
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

#stem_or_lemma: 1 for stemming, 2 for lemmatization, other for none
def clean_text(text, remove_sw=False, stem_or_lemma=0):
    def remove_url(txt):
        return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())
    
    res = remove_url(text).lower().split()
    # Remove english stop words
    if remove_sw:
        #stop_words = set(stopwords.words("english"))
        res = [word for word in res if not word in STOPWORDS]
    if stem_or_lemma == 1: # stemming
        #stemmer = SnowballStemmer('english')
        res = [STEMMER.stem(word) for word in res]
        #res = " ".join([stemmer.stem(word) for word in res])
    elif stem_or_lemma == 2: # lemmatization
        #lemmatizer = WordNetLemmatizer()
        res = [LEMMATIZER.lemmatize(word, pos='v') for word in res]

    return res#.split() #remove_url(text).lower().split()

def get_df_stats(df):
    df_mean = df.groupby("word").mean().reset_index().sort_values(by="all", ascending=False).reset_index(drop=True)
    df_mean.columns = ["word"]+[x+"_mean" for x in df_mean.columns[1:]]
    df_std = df.groupby("word").std().fillna(0).reset_index()
    df_std.columns = ["word"]+[x+"_std" for x in df_std.columns[1:]]
    return df_mean.merge(df_std)

def get_df_stats_2(data, main_col):
    words_sens = list(itertools.chain(*data[data['class'] == 'sens'][main_col].values.tolist()))
    #words_ns = list(itertools.chain(*data[data['class'] == 'ns'][main_col].values.tolist()))
    words_all = list(itertools.chain(*data[main_col].values.tolist()))
    wordcount_sens = collections.Counter(words_sens)
    #wordcount_ns = collections.Counter(words_ns)
    wordcount_all = collections.Counter(words_all)
    print("Top words", wordcount_all.most_common(20),"\n")
    top_wc_dict = {"word": [], "all": [], "sens": [], "ns": [], "perc_sens": [], "perc_ns": []}
    for word in wordcount_all:
        num_all = wordcount_all[word]
        num_sens = wordcount_sens[word]
        perc_sens = round((num_sens*100)/num_all,2)
        #print(item[0], item[1], perc_sens, round((100.0-perc_sens), 2))
        top_wc_dict["word"].append(word)
        top_wc_dict["all"].append(num_all)
        top_wc_dict["sens"].append(num_sens)
        top_wc_dict["ns"].append(num_all-num_sens)
        top_wc_dict["perc_sens"].append(perc_sens)
        top_wc_dict["perc_ns"].append(round((100.0-perc_sens), 2))
    return pd.DataFrame(top_wc_dict)


### [Sens] Datasets

In [15]:
data = get_data("./data/annotation_results.csv", lim=2)
data = get_two_classes(data)
data['text_cleaned_stem'] = data['text'].map(lambda x: clean_text(x, remove_sw=True, stem_or_lemma=1))
data = get_df_stats_2(data, 'text_cleaned_stem').sort_values(by="all", ascending=False).reset_index(drop=True)
data

Top words [('propnam', 1062), ('go', 775), ('get', 651), ('day', 610), ('im', 493), ('like', 490), ('work', 458), ('time', 447), ('one', 433), ('love', 406), ('back', 364), ('want', 361), ('new', 358), ('today', 346), ('good', 343), ('think', 327), ('make', 306), ('know', 301), ('need', 295), ('got', 290)] 



Unnamed: 0,word,all,sens,ns,perc_sens,perc_ns
0,propnam,1062,580,482,54.61,45.39
1,go,775,426,349,54.97,45.03
2,get,651,316,335,48.54,51.46
3,day,610,316,294,51.80,48.20
4,im,493,253,240,51.32,48.68
...,...,...,...,...,...,...
11824,converg,1,1,0,100.00,0.00
11825,augustand,1,1,0,100.00,0.00
11826,uchicon,1,1,0,100.00,0.00
11827,uchicago,1,1,0,100.00,0.00


In [20]:
main_cols = ["word", "all", "perc_sens", "perc_ns"]
rename_cols = {"perc_sens":"%_sens", "perc_ns":"%_ns"}
top_values = 20
print("##  TOP", top_values, "all words")
print(data[main_cols][:top_values])
print("\n##  TOP", top_values, "sensible words")
main_cols = ["word", "sens", "perc_sens", "perc_ns"]
print(data.sort_values(by="sens", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "not sensible words")
main_cols = ["word", "ns", "perc_sens", "perc_ns"]
print(data.sort_values(by="ns", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "percentage sensible words")
main_cols = ["word", "all", "perc_sens", "perc_ns"]
print(data[data["all"]>=100].sort_values(by="perc_sens", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "percentage not sensible words")
main_cols = ["word", "all", "perc_sens", "perc_ns"]
print(data[data["all"]>=100].sort_values(by="perc_ns", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])


##  TOP 20 all words
       word   all  perc_sens  perc_ns
0   propnam  1062      54.61    45.39
1        go   775      54.97    45.03
2       get   651      48.54    51.46
3       day   610      51.80    48.20
4        im   493      51.32    48.68
5      like   490      32.24    67.76
6      work   458      64.19    35.81
7      time   447      41.61    58.39
8       one   433      30.95    69.05
9      love   406      32.27    67.73
10     back   364      64.01    35.99
11     want   361      41.83    58.17
12      new   358      36.03    63.97
13    today   346      54.62    45.38
14     good   343      40.82    59.18
15    think   327      31.80    68.20
16     make   306      35.29    64.71
17     know   301      38.87    61.13
18     need   295      40.00    60.00
19      got   290      51.03    48.97

##  TOP 20 sensible words
        word  sens  %_sens   %_ns
0    propnam   580   54.61  45.39
1         go   426   54.97  45.03
2        get   316   48.54  51.46
3        day   316

In [21]:
data = get_data("./data/annotation_results.csv", lim=3)
data = get_two_classes(data)
data['text_cleaned_stem'] = data['text'].map(lambda x: clean_text(x, remove_sw=True, stem_or_lemma=1))
data = get_df_stats_2(data, 'text_cleaned_stem').sort_values(by="all", ascending=False).reset_index(drop=True)
data

Top words [('propnam', 518), ('go', 382), ('get', 308), ('day', 267), ('like', 223), ('im', 206), ('one', 203), ('love', 200), ('time', 199), ('work', 187), ('back', 187), ('new', 168), ('see', 165), ('today', 160), ('want', 155), ('peopl', 155), ('good', 152), ('think', 152), ('dont', 151), ('make', 150)] 



Unnamed: 0,word,all,sens,ns,perc_sens,perc_ns
0,propnam,518,323,195,62.36,37.64
1,go,382,234,148,61.26,38.74
2,get,308,154,154,50.00,50.00
3,day,267,138,129,51.69,48.31
4,like,223,64,159,28.70,71.30
...,...,...,...,...,...,...
7303,disciplin,1,0,1,0.00,100.00
7304,ember,1,0,1,0.00,100.00
7305,alight,1,0,1,0.00,100.00
7306,suprem,1,1,0,100.00,0.00


In [22]:
main_cols = ["word", "all", "perc_sens", "perc_ns"]
rename_cols = {"perc_sens":"%_sens", "perc_ns":"%_ns"}
top_values = 20
print("##  TOP", top_values, "all words")
print(data[main_cols][:top_values])
print("\n##  TOP", top_values, "sensible words")
main_cols = ["word", "sens", "perc_sens", "perc_ns"]
print(data.sort_values(by="sens", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "not sensible words")
main_cols = ["word", "ns", "perc_sens", "perc_ns"]
print(data.sort_values(by="ns", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "percentage sensible words")
main_cols = ["word", "all", "perc_sens", "perc_ns"]
print(data[data["all"]>=100].sort_values(by="perc_sens", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "percentage not sensible words")
main_cols = ["word", "all", "perc_sens", "perc_ns"]
print(data[data["all"]>=100].sort_values(by="perc_ns", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])


##  TOP 20 all words
       word  all  perc_sens  perc_ns
0   propnam  518      62.36    37.64
1        go  382      61.26    38.74
2       get  308      50.00    50.00
3       day  267      51.69    48.31
4      like  223      28.70    71.30
5        im  206      52.91    47.09
6       one  203      23.65    76.35
7      love  200      26.00    74.00
8      time  199      44.22    55.78
9      work  187      72.73    27.27
10     back  187      70.59    29.41
11      new  168      32.14    67.86
12      see  165      48.48    51.52
13    today  160      57.50    42.50
14    peopl  155      17.42    82.58
15     want  155      40.00    60.00
16    think  152      24.34    75.66
17     good  152      36.84    63.16
18     dont  151      25.17    74.83
19     make  150      28.67    71.33

##  TOP 20 sensible words
        word  sens  %_sens   %_ns
0    propnam   323   62.36  37.64
1         go   234   61.26  38.74
2        get   154   50.00  50.00
3        day   138   51.69  48.31
9    

### [WH+TW] Datasets

In [10]:
data = pd.read_csv("./data/sample_ann2_01.csv")
data['text_cleaned_stem'] = data['text'].map(lambda x: clean_text(x, remove_sw=True, stem_or_lemma=1))
tmp = tmp_func(data, 'text_cleaned_stem')

data = pd.read_csv("./data/sample_ann2_02.csv")
data['text_cleaned_stem'] = data['text'].map(lambda x: clean_text(x, remove_sw=True, stem_or_lemma=1))
tmp = tmp_func(data, 'text_cleaned_stem').append(tmp)


Top words [('im', 1255), ('like', 704), ('want', 647), ('u', 631), ('get', 602), ('go', 566), ('lol', 548), ('love', 497), ('dont', 481), ('girl', 478), ('know', 408), ('good', 381), ('guy', 345), ('day', 339), ('one', 329), ('time', 325), ('feel', 317), ('think', 311), ('need', 307), ('look', 275)] 

Top words [('im', 1276), ('like', 704), ('u', 663), ('get', 646), ('want', 626), ('love', 536), ('dont', 527), ('girl', 500), ('go', 497), ('lol', 493), ('guy', 380), ('know', 372), ('good', 341), ('one', 341), ('need', 332), ('time', 320), ('feel', 318), ('day', 300), ('look', 292), ('got', 276)] 



In [46]:
data = pd.DataFrame()
for index in range(10):
    data_loc = pd.read_csv("./data/sample_ann2_"+(("0"+str(index+1))[-2:])+".csv")
    data_loc['text_cleaned_stem'] = data_loc['text'].map(lambda x: clean_text(x, remove_sw=True, stem_or_lemma=1))
    data = data.append(get_df_stats_2(data_loc, 'text_cleaned_stem'))

data = get_df_stats(data).sort_values(by="all_mean", ascending=False).reset_index(drop=True)
data


Top words [('im', 1255), ('like', 704), ('want', 647), ('u', 631), ('get', 602), ('go', 566), ('lol', 548), ('love', 497), ('dont', 481), ('girl', 478), ('know', 408), ('good', 381), ('guy', 345), ('day', 339), ('one', 329), ('time', 325), ('feel', 317), ('think', 311), ('need', 307), ('look', 275)] 

Top words [('im', 1276), ('like', 704), ('u', 663), ('get', 646), ('want', 626), ('love', 536), ('dont', 527), ('girl', 500), ('go', 497), ('lol', 493), ('guy', 380), ('know', 372), ('good', 341), ('one', 341), ('need', 332), ('time', 320), ('feel', 318), ('day', 300), ('look', 292), ('got', 276)] 

Top words [('im', 1321), ('like', 720), ('get', 633), ('u', 593), ('want', 571), ('lol', 556), ('go', 547), ('love', 535), ('girl', 523), ('dont', 475), ('know', 413), ('guy', 406), ('one', 373), ('good', 352), ('day', 347), ('think', 334), ('feel', 328), ('need', 322), ('got', 302), ('time', 293)] 

Top words [('im', 1321), ('like', 711), ('get', 626), ('u', 598), ('want', 557), ('girl', 524)

Unnamed: 0,word,all_mean,sens_mean,ns_mean,perc_sens_mean,perc_ns_mean,all_std,sens_std,ns_std,perc_sens_std,perc_ns_std
0,im,1280.0,721.6,558.4,56.378,43.622,40.759457,26.990533,24.949727,1.259151,1.259151
1,like,696.8,369.4,327.4,53.052,46.948,25.633312,14.261058,23.726216,2.226591,2.226591
2,get,629.2,239.5,389.7,38.060,61.940,23.602260,13.664634,17.269433,1.586485,1.586485
3,u,613.9,36.9,577.0,6.015,93.985,31.067847,5.724218,30.404678,0.934074,0.934074
4,want,604.3,473.6,130.7,78.368,21.632,30.485151,26.213652,11.235361,1.597879,1.597879
...,...,...,...,...,...,...,...,...,...,...,...
42286,rescart,1.0,0.0,1.0,0.000,100.000,0.000000,0.000000,0.000000,0.000000,0.000000
42287,uglyyyyyyi,1.0,1.0,0.0,100.000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000
42288,reschedulei,1.0,0.0,1.0,0.000,100.000,0.000000,0.000000,0.000000,0.000000,0.000000
42289,uglyperson,1.0,0.0,1.0,0.000,100.000,0.000000,0.000000,0.000000,0.000000,0.000000


In [67]:
main_cols = ["word", "all_mean", "all_std", "perc_sens_mean", "perc_sens_std", "perc_ns_mean", "perc_ns_std"]
rename_cols = {"perc_sens_mean":"%_sens_mean", "perc_sens_std":"%_sens_std", "perc_ns_mean":"%_ns_mean", "perc_ns_std":"%_ns_std"}
top_values = 20
print("##  TOP", top_values, "all words")
print(data[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "sensible words")
main_cols = ["word", "sens_mean", "sens_std", "perc_sens_mean", "perc_sens_std", "perc_ns_mean", "perc_ns_std"]
print(data.sort_values(by="sens_mean", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "not sensible words")
main_cols = ["word", "ns_mean", "ns_std", "perc_sens_mean", "perc_sens_std", "perc_ns_mean", "perc_ns_std"]
print(data.sort_values(by="ns_mean", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "percentage sensible words")
main_cols = ["word", "all_mean", "all_std", "perc_sens_mean", "perc_sens_std", "perc_ns_mean", "perc_ns_std"]
print(data[data["all_mean"]>=100].sort_values(by="perc_sens_mean", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])
print("\n##  TOP", top_values, "percentage not sensible words")
main_cols = ["word", "all_mean", "all_std", "perc_sens_mean", "perc_sens_std", "perc_ns_mean", "perc_ns_std"]
print(data[data["all_mean"]>=100].sort_values(by="perc_ns_mean", ascending=False)[main_cols].rename(columns=rename_cols)[:top_values])



##  TOP 20 all words
     word  all_mean    all_std  %_sens_mean  %_sens_std  %_ns_mean  %_ns_std
0      im    1280.0  40.759457       56.378    1.259151     43.622  1.259151
1    like     696.8  25.633312       53.052    2.226591     46.948  2.226591
2     get     629.2  23.602260       38.060    1.586485     61.940  1.586485
3       u     613.9  31.067847        6.015    0.934074     93.985  0.934074
4    want     604.3  30.485151       78.368    1.597879     21.632  1.597879
5      go     524.8  24.507595       34.380    1.802745     65.620  1.802745
6     lol     520.2  30.124925        7.640    1.083974     92.360  1.083974
7    love     517.8  22.812277       58.655    1.820264     41.345  1.820264
8    girl     515.3  28.433353       83.171    1.078162     16.829  1.078162
9    dont     489.7  23.589310       53.134    2.405462     46.866  2.405462
10    guy     396.2  27.454204       83.885    2.203463     16.115  2.203463
11   know     389.2  19.147382       49.406    2.587505

## Create sets: training, validation and test

In [10]:
from lib.utils import get_data, get_two_classes, my_train_val_test_split

# training
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10


In [11]:
for ann in [2,3]:
    print("## agreement on", ann, "annotators")
    # read data
    data = get_data("./data/annotation_results.csv", lim=ann)
    data = get_two_classes(data)
    # create train, validation and test sets
    x_train, x_val, x_test, y_train, y_val, y_test = my_train_val_test_split(data['text'].values, data['class'].values, validation_ratio, test_ratio, 512)
    print("Size of: training -> "+str(len(x_train))+" ("+str(int(100*len(x_train)/(len(data))))+"%), val -> "+str(len(x_val))+" ("+str(int(100*len(x_val)/(len(data))))+"%), test -> "+str(len(x_test))+" ("+str(int(100*len(x_test)/(len(data))))+"%)")
    # save sets
    pd.DataFrame({"text": x_train, "class": y_train}).to_csv("./data/annotation_results__ann"+str(ann)+"_training.csv", index=False)
    pd.DataFrame({"text": x_val, "class": y_val}).to_csv("./data/annotation_results__ann"+str(ann)+"_validation.csv", index=False)
    pd.DataFrame({"text": x_test, "class": y_test}).to_csv("./data/annotation_results__ann"+str(ann)+"_test.csv", index=False)

## agreement on 2 annotators
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## agreement on 3 annotators
Size of: training -> 3036 (75%), val -> 606 (14%), test -> 404 (9%)


In [12]:
for index in range(10):
    print("## sample", (index+1))
    # read data
    pathname = "./data/sample_ann2_"+(("0"+str(index+1))[-2:])
    data = pd.read_csv(pathname+".csv")
    # create train, validation and test sets
    x_train, x_val, x_test, y_train, y_val, y_test = my_train_val_test_split(data['text'].values, data['class'].values, validation_ratio, test_ratio, 512)
    print("Size of: training -> "+str(len(x_train))+" ("+str(int(100*len(x_train)/(len(data))))+"%), val -> "+str(len(x_val))+" ("+str(int(100*len(x_val)/(len(data))))+"%), test -> "+str(len(x_test))+" ("+str(int(100*len(x_test)/(len(data))))+"%)")
    # save sets
    pd.DataFrame({"text": x_train, "class": y_train}).to_csv(pathname+"_training.csv", index=False)
    pd.DataFrame({"text": x_val, "class": y_val}).to_csv(pathname+"_validation.csv", index=False)
    pd.DataFrame({"text": x_test, "class": y_test}).to_csv(pathname+"_test.csv", index=False)


## sample 1
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 2
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 3
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 4
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 5
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 6
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 7
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 8
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 9
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
## sample 10
Size of: training -> 6575 (75%), val -> 1314 (14%), test -> 876 (9%)
