# Selecting 1 Million Samples

In this notebook, I select up to 1 million rows from `dialogueText_196.csv` without losing context: rows are picked one whole dialogue at a time, so no conversation is cut off partway.

## Load the full dataset

In [1]:
import pandas as pd

# Load the complete dialogue corpus export; the path is resolved relative to
# the notebook's working directory.
corpus_csv = '../data/Ubuntu-dialogue-corpus/dialogueText_196.csv'
df = pd.read_csv(corpus_csv)
df

Unnamed: 0,folder,dialogueID,date,from,to,text
0,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,,any ideas why java plugin takes so long to load?
1,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.4?
2,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,crimsun,yes
3,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.5 loads _much_ faster
4,301,1.tsv,2004-11-23T11:50:00.000Z,stuNNed,crimsun,noneus: how can i get 1.5 is there a .deb some...
...,...,...,...,...,...,...
9212872,13,3676.tsv,2012-07-07T20:17:00.000Z,MonkeyDust,legolas,= arian
9212873,13,3676.tsv,2012-07-07T20:18:00.000Z,MonkeyDust,legolas,"observation and deduction, dear watson"
9212874,13,16586.tsv,2008-07-25T01:53:00.000Z,linuxfce,,i am trying to install nvidia drivers from the...
9212875,13,16586.tsv,2008-07-25T01:53:00.000Z,linuxfce,,how do i enter runlevel 3? when i try telinit ...


In [2]:
# Report the frame's dimensions as (rows, columns).
df.shape

(9212877, 6)

### Drop the `folder` column, as it is not relevant

In [2]:
# The `folder` column is not used by the sampling steps, so remove it.
# errors='ignore' keeps this cell re-runnable: `df` is rebound from itself,
# so without it a second run would raise KeyError once the column is gone.
df = df.drop(columns=['folder'], errors='ignore')
df

Unnamed: 0,dialogueID,date,from,to,text
0,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,,any ideas why java plugin takes so long to load?
1,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.4?
2,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,crimsun,yes
3,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.5 loads _much_ faster
4,1.tsv,2004-11-23T11:50:00.000Z,stuNNed,crimsun,noneus: how can i get 1.5 is there a .deb some...
...,...,...,...,...,...
9212872,3676.tsv,2012-07-07T20:17:00.000Z,MonkeyDust,legolas,= arian
9212873,3676.tsv,2012-07-07T20:18:00.000Z,MonkeyDust,legolas,"observation and deduction, dear watson"
9212874,16586.tsv,2008-07-25T01:53:00.000Z,linuxfce,,i am trying to install nvidia drivers from the...
9212875,16586.tsv,2008-07-25T01:53:00.000Z,linuxfce,,how do i enter runlevel 3? when i try telinit ...


## Count the number of utterances per dialogue

In [3]:
# Build a lookup of dialogue sizes: one row per dialogueID, with the number of
# rows belonging to that dialogue stored in `num_rows`.
dialogue_lengths = (
    df.groupby('dialogueID')
    .size()
    .rename('num_rows')
    .reset_index()
)
dialogue_lengths

Unnamed: 0,dialogueID,num_rows
0,1.tsv,32901
1,10.tsv,8825
2,100.tsv,2008
3,1000.tsv,549
4,10000.tsv,155
...,...,...
346103,99995.tsv,8
346104,99996.tsv,8
346105,99997.tsv,8
346106,99998.tsv,8


## Shuffle the dialogues (alternatively, sort by longest first)

In [4]:
# Shuffle the dialogues with a fixed seed so the selection is reproducible.
# `sample` keeps the original index labels, so sorting by index first restores
# the pre-shuffle order. That makes this cell idempotent: re-running it yields
# the same permutation instead of reshuffling an already-shuffled frame.
# On the first run the sort is a no-op.
dialogue_lengths = dialogue_lengths.sort_index().sample(frac=1, random_state=69)
dialogue_lengths

Unnamed: 0,dialogueID,num_rows
13270,111940.tsv,8
232094,308883.tsv,3
336949,91755.tsv,19
186042,267436.tsv,3
16486,114835.tsv,8
...,...,...
310497,67949.tsv,29
210104,289092.tsv,3
299286,57859.tsv,29
25015,122510.tsv,8


## Select dialogueIDs until the running total reaches 1 million rows

In [5]:
# Row budget for the sample. The notebook's stated goal is 1 million rows; the
# value also feeds the output filename below.
max_rows = 1_000_000

# Walk the (shuffled) dialogues in order and keep every dialogue up to, but not
# including, the first one that would push the running total over the budget.
# Each dialogue has at least one row, so the cumulative sum is strictly
# increasing and the mask below always selects a contiguous prefix -- the same
# result as looping and breaking at the first overflow, without the slow
# row-by-row `iterrows()` pass.
running_total = dialogue_lengths['num_rows'].cumsum()
within_budget = running_total <= max_rows

selected_ids = dialogue_lengths.loc[within_budget, 'dialogueID'].tolist()
total = int(dialogue_lengths.loc[within_budget, 'num_rows'].sum())

# Filter the original dataframe. Whole dialogues are kept, in their original
# row order, so the conversational context of each utterance is preserved.
subset_df = df[df['dialogueID'].isin(selected_ids)]

# Sanity check: the subset must contain exactly the rows that were counted.
# A mismatch means `df` and `dialogue_lengths` are out of sync (stale cells).
assert len(subset_df) == total, (len(subset_df), total)
assert total <= max_rows

subset_df.to_csv(f'sample/ubuntu_context_{max_rows}.csv')