# Data Wrangling

## Structure

The data wrangling process goes through 8 stages:

__1.__ Load data from source jsonl files <br><br> 
__2.__ Filter data for extractive summaries only<br><br> 
__3.1__ Preprocess data with spacy to form cleaned sentence lists for each article <br><br>
__3.2__ Use sentence tranformer to calcuate BERT sentence embeddings as the features set <br><br>
__4.__ Calculate target variables by finding highest cosine similarity article sentence for each summary sentence <br><br> 
__5.__ Add additional features (sentence number and document length) <br><br>
__6.__ Add subject domain labels for each article <br><br>
__7.__ Preprocess for train_test_split on embeddings features set only with split at document level <br><br>
__8.__ Add sentence number to embeddings features for another train_test_split set <br><br>


## Connect to GoogleDrive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Colab\ Notebooks/CIS5930_Project/
%ls

/content/drive/MyDrive/Colab Notebooks/CIS5930_Project
'Arunima-CIS 5930 Project: Data Wrangling.ipynb'
'CIS 5930 Project: Data Wrangling.ipynb'
'CIS 5930 Project: ML Model.ipynb'
'CIS 5930 Project: ML Results.ipynb'
 cv_results_LEDE3.pickle
 cv_results_lstm_bi25_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_bi50_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_uni25_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_uni50_embeddings_only_epochs1_top3.pickle
 cv_results_nn2525_embeddings_only_cw_top3_epochs50.pickle
 cv_results_nn2550_embeddings_only_cw_top3_epochs50.pickle
 cv_results_nn5050_embeddings_only_cw_top3_epochs50.pickle
 cv_results_textrank.pickle
 cv_results_top_3_logreg_cw_balanced.pickle
 cv_results_top_3_logreg_default.pickle
 cv_results_top_3_logreg_elasticnet_gridsearch.pickle
 cv_results_top_3_logreg_elasticnet.pickle
 cv_results_top_3_logreg_sent_num_bal.pickle
 cv_results_top_3_logreg_sent_num_no_bal.pickle
 dev-stats.jsonl
 extractive_all_domain_label

## Install the dependencies 

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 1.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 31.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 34.9 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_lg

Collecting pip
  Downloading pip-22.0.4-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 6.1 MB/s 
Collecting setuptools
  Downloading setuptools-62.0.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 30.6 MB/s 
Installing collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Suc

Collecting spacy
  Downloading spacy-3.2.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (653 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.3/653.3 KB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-legacy<3.1.0,>=3.0.8
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[2

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4
[0m

## 1. Load Data

Load data from source jsonl files

In [None]:
"""
load_data_from_jsonl.py
"""
import json
import pandas as pd
import pickle

input_file = 'train-stats.jsonl'
output_file = 'train_stats_df_no_spacy.pickle'

#read jsonl file into list of sample rows
counter=0
data=[]
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.rstrip('\n|\r')))
        counter +=1
        if counter > 500000 :
          break
print('number of lines:', counter)
#wrap in dataframe        
df = pd.DataFrame(data)

#save to pickle
with open(output_file, 'wb') as handle:                                     
    pickle.dump(df, handle)


number of lines: 500001


In [None]:
df.columns

Index(['url', 'archive', 'title', 'date', 'text', 'summary', 'compression',
       'coverage', 'density', 'compression_bin', 'coverage_bin',
       'density_bin'],
      dtype='object')

## 2. Filter Data

Filter data for extractive summaries only

In [None]:
"""
filter_extractive_data_only.py
""" 
import pickle
import pandas as pd
 
output_file = 'train_stats_df_extractive_no_spacy.pickle'

#load all data
df = pd.read_pickle('train_stats_df_no_spacy.pickle')

#filter for extractive summaries only
df = df[df.density_bin == 'extractive']

#save to pickle file
with open(output_file, 'wb') as handle:                                     
    pickle.dump(df, handle)


In [None]:
df.shape

(197017, 12)

## 3. Preprocessing: Sentence Tokenization and Embeddings (Features)

3.1 Preprocess data with spacy to form cleaned sentence lists for each article <br> 
3.2 Use sentence tranformer to calcuate BERT sentence embeddings as the features set

In [None]:
!pip install sentence-transformers

[0m

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_lg

[0mCollecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.4/777.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
  Attempting uninstall: en-core-web-lg
    Found existing installation: en-core-web-lg 2.2.5
    Uninstalling en-core-web-lg-2.2.5:
      Successfully uninstalled en-core-web-lg-2.2.5
Successfully installed en-core-web-lg-3.2.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
"""
preprocessing_embed.py
"""
import pickle
import pandas as pd
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
from datetime import datetime as dt


### Helper function

def text_to_sent_list(text, 
                      nlp = spacy.load("en_core_web_lg"), 
                      embedder = SentenceTransformer('distilbert-base-nli-mean-tokens'),
                      min_len=2):
    
    ''' Returns cleaned article sentences and BERT sentence embeddings'''
    
    #convert to list of sentences
    text = nlp(text)
    sents = list(text.sents)
    #remove short sentences by threshhold                                                                                                
    sents_clean = [sentence.text for sentence in sents if len(sentence)> min_len]
    #remove entries with empty list
    sents_clean = [sentence for sentence in sents_clean if len(sentence)!=0]
    #embed sentences (deafult uses BERT SentenceTransformer)
    sents_embedding= np.array(embedder.encode(sents_clean, convert_to_tensor=True).cpu())
    
    return sents_clean, sents_embedding



### Script

output_file = 'train_stats_df_processed_extr_5000.pickle'  
#load full extractive df
df = pd.read_pickle('train_stats_df_extractive_no_spacy.pickle')

#truncate for local computation
df= df.head(5000).reset_index(drop=True)

#load nlp and embedder
nlp = spacy.load("en_core_web_lg")
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

t1 = dt.now()
print(t1)

#extract clean sentence list and sentence embedding for each article TEXT
f = lambda text: text_to_sent_list(text, nlp=nlp, embedder=embedder, min_len=2)
s_interim_tuple = df['text'].apply(f)

df['text_clean'] = s_interim_tuple.apply(lambda x: x[0])
df['text_embedding'] = s_interim_tuple.apply(lambda x: x[1])

#extract clean sentence list and sentence embedding for each article SUMMARY
f = lambda summ: text_to_sent_list(summ, nlp=nlp, embedder=embedder, min_len=0)
s_interim_tuple = df['summary'].apply(f)

df['summary_clean'] = s_interim_tuple.apply(lambda x: x[0])
df['summary_embedding'] = s_interim_tuple.apply(lambda x: x[1])

with open(output_file, 'wb') as handle:                                     
    pickle.dump(df, handle)

t2=dt.now()
print(t2)
print(t2-t1)

#5000 articles took 1hr40mins

2022-04-05 03:11:03.384244
2022-04-05 03:26:05.598771
0:15:02.214527


## 4. Calculating Target Labels

Calculate target variables by finding highest cosine similarity article sentence for each summary sentence

In [None]:
"""
preprocessing label_target.py
"""
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from datetime import datetime as dt

### Helper Functions

def find_sim_single_summary(summary_sentence_embed, doc_emedding):
    '''returns array of indices for max cosine sim per summary sentences'''
    cos_sim_mat = cosine_similarity(doc_emedding, summary_sentence_embed)
    idx_arr = np.argmax(cos_sim_mat, axis=0)
    
    return idx_arr

def label_sent_in_summary(s_text, s_summary):
    '''returns index list and binary target labels in an array'''
    doc_num = s_text.shape[0]
    
    #initialize zeros
    labels = [np.zeros(doc.shape[0]) for doc in s_text.tolist()] 
    
    #calc idx for most similar
    
    idx_list = [np.sort(find_sim_single_summary(s_summary[j], s_text[j])) for j 
                                                            in range(doc_num)]
      
    for j in range(doc_num):
        labels[j][idx_list[j]]= 1 
    
    return idx_list, labels


### Script

t1 = dt.now()
print(t1)

output_file = 'train_stats_df_processed_extr_label_5000.pickle'

df = pd.read_pickle('train_stats_df_processed_extr_5000.pickle' )

#get index list and target labels
idx_list, labels = label_sent_in_summary(df.text_embedding, df.summary_embedding)

#wrap in dataframe
df['labels'] = labels
df['labels_idx_list'] = idx_list

#save to pickle
with open(output_file, 'wb') as handle:                                     
    pickle.dump(df, handle)

t2 = dt.now()

print(t2)

print(t2-t1)


2022-04-05 03:29:33.828039
2022-04-05 03:29:41.615898
0:00:07.787859


In [None]:
df[0]['text_clean']
df[0]['summary_clean']

Index(['url', 'archive', 'title', 'date', 'text', 'summary', 'compression',
       'coverage', 'density', 'compression_bin', 'coverage_bin', 'density_bin',
       'text_clean', 'text_embedding', 'summary_clean', 'summary_embedding',
       'labels', 'labels_idx_list'],
      dtype='object')

## 5. Additional Features

Add additional features (sentence number and document length)

In [None]:
"""
add_additional_features.py
"""

import numpy as np
import pandas as pd
import pickle
from datetime import datetime as dt

t1 = dt.now()
print(t1)
    
output_file = 'train_stats_dict_processed_extr_final_5000_.pickle'
    
df = pd.read_pickle('train_stats_df_processed_extr_label_5000.pickle')

#define features and labels as separate series
s_embed_text = df.text_embedding
s_y_labels= df.labels

#label docs
s_doc_label = pd.Series(range(df.shape[0]), name = 'doc_label')

#calculate doc mean
s_doc_mean = s_embed_text.apply(lambda x: x.mean(axis=0).reshape(1,-1))
    
#calculate doc sent length
s_doc_length = s_embed_text.apply(lambda x: x.shape[0])


#create values for each sentence in doc 
X_doc_label_list =[]
X_doc_mean_list = []
X_doc_length_list = []
X_sent_num_list = []

for j in range(len(df)):
    X_doc_label = s_doc_label[j]
    X_doc_mean = s_doc_mean[j]
    X_doc_length = s_doc_length[j]
    X_text = s_embed_text [j]
    n = X_text.shape[0]
    
    X_doc_label_fixed = X_doc_label
    X_doc_mean_fixed = X_doc_mean
    X_doc_length_fixed = X_doc_length 
    sent_num = []
    for i in range(n-1): 
        X_doc_label = np.vstack((X_doc_label, X_doc_label_fixed )) 
        X_doc_mean = np.vstack((X_doc_mean, X_doc_mean_fixed )) 
        X_doc_length = np.vstack((X_doc_length, X_doc_length_fixed )) 
        sent_num.append(i)
    sent_num.append(n-1)
    
    X_doc_label_list.append(X_doc_label)
    X_doc_mean_list.append(X_doc_mean)
    X_doc_length_list.append(X_doc_length)
    X_sent_num_list.append(np.array(sent_num).reshape(-1,1))
    
#from list to pandas series
s_doc_label = pd.Series(X_doc_label_list)
s_doc_mean = pd.Series(X_doc_mean_list)
s_doc_length = pd.Series(X_doc_length_list)
s_sent_num = pd.Series(X_sent_num_list)

#concatenate documents with rows = sentences
  #intialize
Xy_doc_label = s_doc_label.values[0]
X = np.hstack((s_embed_text[0], s_doc_mean[0], s_sent_num[0], s_doc_length[0]))
y= s_y_labels[0].reshape(-1,1)
  #recursive population
f = np.vectorize(lambda x: x if type(x) == np.ndarray else np.array([[x]]))  
for j in range(1, len(df)):
    Xy_doc_label_new = s_doc_label.values[j]
    
    X_text_new = s_embed_text [j]
    X_sent_num_new = s_sent_num[j]
    X_doc_mean_new = s_doc_mean[j]
    X_doc_length_new = f(s_doc_length[j])
    y_new = s_y_labels[j].reshape(-1,1)
    
    X_new = np.hstack((X_text_new, X_doc_mean_new, X_sent_num_new, X_doc_length_new))
    
    X = np.vstack((X, X_new))
    y = np.vstack((y, y_new))           
    
    Xy_doc_label = np.vstack((Xy_doc_label, Xy_doc_label_new))
        
#wrap X in dataframe with lables
labels_text_embedding = ['Sent_BERT_D_' + str(j) for j in range(768)]
labels_doc_mean = ['Doc_BERT_D_' + str(j) for j in range(768)]
other_labels = ['Sent_Number', 'Doc_Length']
col_names = labels_text_embedding + labels_doc_mean + other_labels

df_X = pd.DataFrame(X, columns = col_names)
    
data_dict = {'df_original': df, 'Xy_doc_label_array': Xy_doc_label, 
              'df_X': df_X, 'y_array': y}
    
with open(output_file, 'wb') as handle:                                     
    pickle.dump(data_dict, handle)
    
t2 = dt.now()

print(t2)
print(t2-t1)

#5000 articles took 48 mins

2022-04-05 03:37:57.016367
2022-04-05 03:59:06.662883
0:21:09.646516


## 6. Subject Domain Labels

Add subject domain labels for each article

In [None]:
"""
add_domain_labels.py
"""

import pandas as pd
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

output_file = 'extractive_all_domain_labels.pickle' 

input_file = 'train_stats_dict_processed_extr_final_5000_.pickle' 
data = pd.read_pickle(input_file )

embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

#Make single df with only Embeddings and doc label
df_embed = data['df_X'].loc[:,'Doc_BERT_D_0': 'Doc_BERT_D_767']
df_doc_label = pd.DataFrame(data['Xy_doc_label_array'],columns=['doc_label'])
df = pd.concat([df_doc_label, df_embed], axis=1)
df = df.drop_duplicates().set_index('doc_label', drop=True)

#embed lambda function
embed = lambda x: embedder.encode(x, convert_to_tensor=False)

#define subject domains
domains = ['entertainment','politics', 'business', 'crime']
#find domain word embeddings using BERT
domain_embed = [embed(dom) for dom in domains]
#wrap in dataframe
df_dom_embed = pd.DataFrame(domain_embed, index = domains,
                            columns = df.columns)
#calculate cosine similarity between article and each subject
cos_matrix = cosine_similarity(df, df_dom_embed)

#return subject word from index number function
f = np.vectorize(lambda x: domains[x])
#find max cos sim and return matching subject
doc_domain = f(np.argmax(cos_matrix, axis=1))
#Add to primary dataframe
df['domain'] = doc_domain

#Add to primary dictionary for storage
data.update({'domain_labels_arr': df['domain'].values})

#save to pickle file
with open(output_file, 'wb') as handle:                                     
    pickle.dump(data, handle)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

## 7. Train_test_split 1: Embedding Features Only

Preprocess for train_test_split on embeddings features set only with split at document level

In [3]:
### Helper Function

'''functions.py'''

def gen_train_test_split_doc_level(Xy_doc_label, X, y, 
                                         test_ratio, folds=1, rand_seed=42):
    '''returns train doc labels, test doc labels, and train and test sets
    for features X and target Y'''
    
    
    random.seed(rand_seed)
    
    #index is doc label 
    total_docs = Xy_doc_label.max()
    train_docs_num = int(total_docs*(1-test_ratio))
    
    #for k >1, want to ensure different seeds
    rand_state_list = random.sample(range(2*folds), folds)
    
    #look through k folds
    train_test_set = []
    for state in rand_state_list:
    
        random.seed(state)
        #sample random training set and mask
        train_docs = random.sample(range(1, total_docs+1), train_docs_num)
        train_mask = np.array([x in train_docs for x in list(Xy_doc_label)])
        
        #use mask to define train and test sets
        X_train = X[train_mask]
        y_train = y[train_mask]
    
        X_test = X[~train_mask]
        y_test = y[~train_mask]
    
        Xy_doc_label_train = Xy_doc_label[train_mask]
        Xy_doc_label_test = Xy_doc_label[~train_mask]
        
        #assign all data to tuple for each pass
        data_pass = (Xy_doc_label_train, Xy_doc_label_test,
                                             X_train, X_test, y_train, y_test)
        #append results for ith fold to set 
        train_test_set.append(data_pass)
    
    #set answer tuples to final tuple as container
    train_test_set = tuple(train_test_set)

    return train_test_set

In [4]:
!pip install rouge_score



In [5]:
#inspect dataset
import pickle
import pandas as pd
from functions import gen_train_test_split_doc_level

input_filename = 'extractive_all_domain_labels.pickle' 
folds = 1

output_file = 'train_test_set20_embeddings_only.pickle'

data_dict = pd.read_pickle(input_filename)
#Specify model inputs: df, X, y, doc_labels
df = data_dict['df_original']
Xy_doc_label = data_dict['Xy_doc_label_array']
X = data_dict['df_X'].drop(['Sent_Number','Doc_Length'], axis=1).values
y = data_dict['y_array']

In [6]:
for k,v in data_dict.items():
    print(k)

df_original
Xy_doc_label_array
df_X
y_array
domain_labels_arr


In [37]:
data_dict['df_X'].loc[38]

Sent_BERT_D_0    -0.131211 
Sent_BERT_D_1    -0.185938 
Sent_BERT_D_2     0.317033 
Sent_BERT_D_3     0.162783 
Sent_BERT_D_4    -0.453188 
                    ...    
Doc_BERT_D_765   -0.185960 
Doc_BERT_D_766    0.198736 
Doc_BERT_D_767   -0.622611 
Sent_Number       6.000000 
Doc_Length        40.000000
Name: 38, Length: 1538, dtype: float64

In [7]:
df.columns

Index(['url', 'archive', 'title', 'date', 'text', 'summary', 'compression',
       'coverage', 'density', 'compression_bin', 'coverage_bin', 'density_bin',
       'text_clean', 'text_embedding', 'summary_clean', 'summary_embedding',
       'labels', 'labels_idx_list'],
      dtype='object')

In [8]:
df.shape

(5000, 18)

In [19]:
pd.set_option('display.max_colwidth', -1)
idx = 1
print(df.loc[idx,'text_clean'])
text_em = df.loc[idx,'text_embedding']
print(text_em.shape)
print(df.loc[idx,'summary_clean'])
sum_em = df.loc[idx,'summary_embedding']
print(sum_em.shape)

['BY A.J. BENZA & MICHAEL LEWITTES\n\n', "If Simon Rex looks a little familiar, it may not have anything to do with his gig as an MTV veejay or his ads for Levi's and Tommy Hilfiger.", 'It could be because Rex did a little film work once upon a time three skin flicks to be precise!\n\n', 'In case you missed his work, Rex\' oeuvre includes titles like "Young, Hard & Solo II," its poignant sequel, "Young, Hard & Solo III," and "Hot Sessions III," which, in the opinion of this column, doesn\'t come anywhere near the first two "Hot Sessions."', 'Those were, in a word, mesmerizing.', 'Brad Posey, who owns the all-male photo and video studio Club 1821 as in men 18 to 21 which produced the videos, told us that Rex or Sebastian as he was known then "was very sweet and he performed very well, as any 19-year-old would who\'s at his sexual peak."', 'Posey told us there wasn\'t any interaction with others on the video, just "lots of solo stroking and steamy poses."', "Oh, we've bee n there.", 'Rex

  """Entry point for launching an IPython kernel.


In [11]:
df.loc[idx]

url                  http://www.nydailynews.com/archives/gossip/1996/02/13/1996-02-13_rex_flexed_pecs_for_skin_pics.html                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [24]:
Xy_doc_label.shape

(151435, 1)

In [25]:
Xy_doc_label

array([[   0],
       [   0],
       [   0],
       ...,
       [4999],
       [4999],
       [4999]])

In [26]:
X.shape

(151435, 1536)

In [27]:
y.shape

(151435, 1)

In [None]:
"""
preprocess_train_test_split_embeddings_only.py
"""
import pickle
import pandas as pd
from functions import gen_train_test_split_doc_level

input_filename = 'extractive_all_domain_labels.pickle' 
folds = 1

output_file = 'train_test_set20_embeddings_only.pickle'

data_dict = pd.read_pickle(input_filename)

#Specify model inputs: df, X, y, doc_labels
df = data_dict['df_original']
Xy_doc_label = data_dict['Xy_doc_label_array']
X = data_dict['df_X'].drop(['Sent_Number','Doc_Length'], axis=1).values
y = data_dict['y_array']

        
#train test split at document level

train_test_set = gen_train_test_split_doc_level(Xy_doc_label, X, y, 
                                         test_ratio=0.2, folds=folds, rand_seed=42)

data_dict.update({'train_test_sets': train_test_set })

with open(output_file, 'wb') as handle:                                     
    pickle.dump(data_dict, handle)

## 8. Train_test_split 2: Embedding and Num Sentence Features

Add sentence number to embeddings features for another train_test_split set

In [None]:
"""
add_sent_num_to_train_test_split.py
"""
import pickle
import pandas as pd
from functions import gen_train_test_split_doc_level


input_filename = 'extractive_all_domain_labels.pickle' 
folds = 1

output_file = 'train_test_set20_embeddings_sent_num.pickle'

data_dict = pd.read_pickle(input_filename)

#Specify model inputs: df, X, y, doc_labels
df = data_dict['df_original']
Xy_doc_label = data_dict['Xy_doc_label_array']
X = data_dict['df_X'].drop(['Doc_Length'], axis=1).values
y = data_dict['y_array']

        
#train test split at document level

train_test_set = gen_train_test_split_doc_level(Xy_doc_label, X, y, 
                                         test_ratio=0.2, folds=folds, rand_seed=42)

data_dict.update({'train_test_sets': train_test_set })

with open(output_file, 'wb') as handle:                                     
    pickle.dump(data_dict, handle)