In [1]:
import numpy as np
import fasttext
import pandas as pd
from resources.tokTT import CommentTokenizer as CT
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from active_expansion.fasttext_batch_avg import Expander

[nltk_data] Downloading package wordnet to C:\Users\AJAY
[nltk_data]     BISWAS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Preprocess Corpus and Random Sample

In [2]:
# Load the corpus file through IO.load_text and, separately, through
# CT.cleaned; the cleaned result is saved to the file that the fastText
# training cell later uses as its input.
corpus_path = 'datasets/corpus.txt'
raw_corpus = IO.load_text(corpus_path)
# NOTE(review): 'utf-8' and ['0', '1'] are presumably the file encoding and the
# label tokens the tokenizer should handle -- confirm against CT.cleaned.
tokenized_corpus = CT.cleaned(corpus_path, 'utf-8', ['0', '1'])
IO.save_text('datasets/tokenized_corpus.txt', tokenized_corpus)

### Make DataFrames

In [3]:
# Put the raw and tokenized corpus side by side as two columns and write the
# table to CSV without the index column.
# Assumes raw_corpus and tokenized_corpus are aligned and equally long -- TODO confirm.
df_dict = dict(raw_comment=raw_corpus, tokenized_comment=tokenized_corpus)
df_corpus = pd.DataFrame(data=df_dict)
df_corpus.to_csv('datasets/corpus_data.csv', index=False)

In [8]:
# Build the labelled random-sample table from the leading rows of
# random_sample.csv and write it to CSV.
sample_csv = 'datasets/random_sample.csv'
sample_limit = 800  # one cut-off shared by comments and labels so both are sliced identically

expansion_text = IO.load_csv_col(sample_csv, 'comment')[0:sample_limit]
expansion_TK = [CT.tokenize(comment, ['0', '1']) for comment in expansion_text]
# Each label goes through int() and then str(), e.g. 1.0 -> '1'.
expansion_text_labels = [
    str(int(label))
    for label in IO.load_csv_col(sample_csv, 'label')[0:sample_limit]
]

df_dict = dict(
    raw_comment=expansion_text,
    tokenized_comment=expansion_TK,
    label=expansion_text_labels,
)
df_sample = pd.DataFrame(data=df_dict)
df_sample.to_csv('datasets/random_sample_data.csv', index=False)


In [9]:
# Load the seed set (text, labels, cleaned tokens) and write it out as one CSV.
seed_set_path = 'datasets/seed_set.txt'
seed_set_text = IO.load_text(seed_set_path)
# NOTE(review): seed labels are kept as ints here, whereas the random-sample
# labels are converted to strings -- confirm the downstream consumer accepts both.
seed_set_labels = [int(label) for label in IO.load_text('datasets/seed_set_labels.txt')]
# NOTE(review): CT.cleaned is called with only the path here, while the corpus
# cell also passes 'utf-8' and ['0', '1'] -- confirm the defaults are equivalent.
seed_set_TK = CT.cleaned(seed_set_path)

df_dict = dict(
    raw_comment=seed_set_text,
    tokenized_comment=seed_set_TK,
    label=seed_set_labels,
)
df_seed = pd.DataFrame(data=df_dict)
df_seed.to_csv('datasets/seed_set_data.csv', index=False)

### Train fastText Model

In [6]:
# Train unsupervised fastText embeddings on the tokenized corpus file and save
# the resulting binary model to disk.
# NOTE(review): confirm that wordNgrams actually influences unsupervised
# training; it is primarily documented for the supervised classifier.
ft_params = dict(lr=0.01, epoch=10, wordNgrams=2, dim=40)
model_2 = fasttext.train_unsupervised(
    input="datasets/tokenized_corpus.txt",
    **ft_params,
)
model_2.save_model("models/ft_unsupervised_N_2.bin")

### Seed Set Expansion

In [10]:
# Run the seed-set expansion with the saved fastText model, the seed set and
# the random sample prepared in the earlier cells.
model_path = "models/ft_unsupervised_N_2.bin"
batch_path = 'datasets_post/batch_N_2.txt'
batch_labels_path = 'datasets_post/batch_labels_N_2.txt'

Expander.Expand(
    model_path,
    seed_set_text, seed_set_labels, seed_set_TK,
    expansion_text, expansion_text_labels, expansion_TK,
    10,  # NOTE(review): meaning not visible from this notebook -- confirm against Expander.Expand
    40,  # NOTE(review): equals the dim=40 used for training; presumably the embedding size -- confirm
    batch_path,
    batch_labels_path,
)


