In [10]:
import numpy as np
import pandas as pd

# Read the raw dataset from dataset.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer='../Dataset/dataset.csv')

In [11]:
# Keep only the columns used downstream: the two compared objects, the
# majority label, and the sentence itself.
df = df[['object_a', 'object_b', 'most_frequent_label', 'sentence']]

# Preview 10 random rows whose label is not NONE. The sample is only displayed
# and deliberately NOT assigned back to `df`: a later cell copies `df` to build
# the training data, so overwriting it here would shrink the corpus to these
# 10 rows (with no NONE examples) on a fresh Restart & Run All.
df[df['most_frequent_label'] != 'NONE'].sample(n=10)

Unnamed: 0,object_a,object_b,most_frequent_label,sentence
7180,Windows 8,Windows 7,BETTER,I'm not saying Windows 8 is worse than Windows...
700,PostgreSQL,MySQL,BETTER,WHITE PAPER: Many IT professionals generally r...
6627,Cadillac,Lexus,BETTER,"Those numbers, Cadillac points out, are better..."
3048,Chrysler,Toyota,BETTER,In my opinion Dodge and Chrysler are far bette...
4939,SQLite,MySQL,WORSE,"(What about PostpreSQL you ask? Meh, decent li..."
4773,Microsoft,Google,BETTER,Microsoft Live Maps Looks Better Than Google M...
4731,OpenCL,CUDA,BETTER,AMD's OpenCL is easier to program for than CUD...
7106,Google,Kingston,BETTER,I'm using mesa and xserver 9999 ______________...
113,Java,Python,BETTER,"For a performance-critical Pig UDF, Java is mu..."
126,MySQL,PostgreSQL,BETTER,Allen downloaded the free versions of both dat...


In [4]:
# Work on an independent (deep) copy so `df` itself is left untouched
dataset = df.copy(deep=True)

In [5]:
# Integer class ids for the comparative labels: BETTER -> 0, NONE -> 1, WORSE -> 2.
# An explicit dict keeps the mapping readable and avoids relying on the
# object-to-int downcasting of `replace`, which newer pandas versions deprecate.
label_to_id = {'BETTER': 0, 'NONE': 1, 'WORSE': 2}

# `map` turns any label missing from the dict into NaN, so fail loudly instead
# of silently writing unlabeled rows to disk.
unknown_labels = set(dataset['most_frequent_label'].unique()) - set(label_to_id)
assert not unknown_labels, f"unexpected labels: {unknown_labels}"

# Build the final two-column frame in one pass:
#   x = "<object_a>, <object_b>, <sentence>"
#   y = integer class id
# The shuffle is seeded so that re-running the notebook reproduces the same order.
dataset = (
    dataset
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .assign(
        x=lambda d: d['object_a'] + ', ' + d['object_b'] + ', ' + d['sentence'],
        y=lambda d: d['most_frequent_label'].map(label_to_id),
    )
    .loc[:, ['x', 'y']]
)

# Save the transformed dataset as a tab-separated file (no index column)
dataset.to_csv('../Dataset/dataset_transformed.csv', index=False, sep='\t')

In [6]:
# Split the dataset 80% / 10% / 10% into train, validation and test sets.
# Plain positional slicing replaces np.split, which on a DataFrame goes through
# the deprecated DataFrame.swapaxes; the shuffle is seeded so the split is
# reproducible across runs.
n_rows = len(dataset)
train_end = int(.8 * n_rows)
validate_end = int(.9 * n_rows)

shuffled = dataset.sample(frac=1, random_state=42)
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Every row must land in exactly one of the three sets
assert len(train) + len(validate) + len(test) == n_rows

# Save train, validation and test sets as tab-separated files (no index column)
train.to_csv('../Dataset/train.csv', index=False, sep='\t')
validate.to_csv('../Dataset/validate.csv', index=False, sep='\t')
test.to_csv('../Dataset/test.csv', index=False, sep='\t')

In [7]:
# Label distribution of the train set (number of rows per class id)
train.loc[:, 'y'].value_counts()

y
1    4174
0    1110
2     475
Name: count, dtype: int64

In [8]:
# Label distribution of the validation set (number of rows per class id)
validate.loc[:, 'y'].value_counts()

y
1    532
0    126
2     62
Name: count, dtype: int64

In [9]:
# Label distribution of the test set (number of rows per class id)
test.loc[:, 'y'].value_counts()

y
1    536
0    128
2     56
Name: count, dtype: int64

In [10]:
# Peek at the first five rows of the train set
train.head(n=5)

Unnamed: 0,x,y
6711,"tennis, volleyball, tennis,volleyball & more.",1
2031,"Nokia, Siemens, Nokia Siemens Networks Offers ...",1
5799,"juice, milk, The beetroot juice or milk sugar ...",1
6985,"Toyota, BMW, Toyota continues to dominate the ...",0
3589,"Ruby, Java, Ruby on Rails, Java, C, C++, Herok...",1
