In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split
import gzip
import json
from os import listdir
from os.path import isfile, join

# SemEval-2017 Task 4A

In [79]:
# Read the SemEval-2017 Task 4A dev file: tab-separated with no header row,
# so pandas assigns positional integer column names. Positional columns 0
# and 3 are discarded and the two that remain are named 'label' and 'text'.
input_df = pd.read_csv(
    './4A-English/SemEval2017-task4-dev.subtask-A.english.INPUT.txt',
    sep='\t',
    header=None,
).drop(columns=[0, 3])
input_df.columns = ['label', 'text']

In [80]:
# Encode the sentiment label as a numeric score in 'overall':
# 'negative' -> 1, 'positive' -> 3, and every other label value -> 2.
input_df['overall'] = input_df['label'].apply(
    lambda label: {'positive': 3, 'negative': 1}.get(label, 2)
)
# The textual label is no longer needed once the score column exists.
input_df = input_df.drop(columns=['label'])

In [81]:
# Preview the first rows after the label column was replaced by 'overall'.
input_df.head()

Unnamed: 0,text,overall
0,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",2
1,Order Go Set a Watchman in store or through ou...,2
2,If these runway renovations at the airport pre...,1
3,If you could ask an onstage interview question...,2
4,A portion of book sales from our Harper Lee/Go...,3


In [82]:
# 60/20/20 split: hold out 20% of the rows for test, then take 25% of the
# remaining 80% (i.e. 20% of the total) for validation. The fixed seed keeps
# both splits reproducible.
train_val_df, test_df = train_test_split(
    input_df,
    test_size=0.2,
    random_state=29,
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=29,
)

In [83]:
# Report how many rows ended up in each split.
split_sizes = [
    ("Train Len: ", len(train_df)),
    ("Val Len: ", len(val_df)),
    ("Test Len: ", len(test_df)),
]
for caption, n_rows in split_sizes:
    print(caption, n_rows)

Train Len:  12378
Val Len:  4127
Test Len:  4127


In [84]:
# Class distribution of the training split (1 = negative, 3 = positive, 2 = other).
train_df['overall'].value_counts()

2    6223
3    4227
1    1928
Name: overall, dtype: int64

In [85]:
# Class distribution of the validation split (1 = negative, 3 = positive, 2 = other).
val_df['overall'].value_counts()

2    2068
3    1460
1     599
Name: overall, dtype: int64

In [86]:
# Class distribution of the test split (1 = negative, 3 = positive, 2 = other).
test_df['overall'].value_counts()

2    2051
3    1372
1     704
Name: overall, dtype: int64

In [87]:
# Name the index of every split 'key_index'; the index still holds each
# row's label from the pre-split frame.
for split_df in (train_df, val_df, test_df):
    split_df.index.name = 'key_index'

In [88]:
# Persist each split to its own CSV in the working directory; the index is
# written too because to_csv keeps it by default.
semeval_outputs = (
    (train_df, 'SemEval2017_train.csv'),
    (val_df, 'SemEval2017_val.csv'),
    (test_df, 'SemEval2017_test.csv'),
)
for split_df, csv_name in semeval_outputs:
    split_df.to_csv(csv_name)

# Amazon Fashion

In [89]:
def parse(path):
  """Yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file containing one JSON document per line.

  Yields:
    The result of ``json.loads`` for each line, in file order.
  """
  # Open the file in a context manager so the handle is closed as soon as
  # the generator is exhausted or closed; previously it was never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream, and the frame
  is built with ``orient='index'`` so those positions become the row labels.

  Args:
    path: File path forwarded unchanged to ``parse``.

  Returns:
    A ``pandas.DataFrame`` with one row per parsed record.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

In [90]:
# Read the whole gzip-compressed JSON-lines review file into a DataFrame.
df = getDF('./Amazon/AMAZON_FASHION.json.gz')

In [91]:
# Keep only the review body and its rating, and rename the body column to
# 'text' so the frame has 'text' and 'overall' columns like the SemEval one.
df = df[['reviewText', 'overall']].rename(columns={'reviewText': 'text'})

In [92]:
# Preview the first rows of the reduced two-column frame.
df.head()

Unnamed: 0,text,overall
0,Exactly what I needed.,5.0
1,"I agree with the other review, the opening is ...",2.0
2,Love these... I am going to order another pack...,4.0
3,too tiny an opening,2.0
4,Okay,3.0


In [93]:
# Rating distribution over the full review set.
df['overall'].value_counts()

5.0    465476
4.0    149331
1.0    107080
3.0     97031
2.0     64718
Name: overall, dtype: int64

In [94]:
def _space_token_count(review):
  """Return how many pieces ``str(review)`` yields when split on single spaces."""
  # NOTE: split(' ') keeps the empty strings between consecutive spaces, and
  # non-string values are measured through their str() form.
  return len(str(review).split(' '))

# Approximate review length in space-separated tokens.
df['text_len'] = df['text'].apply(_space_token_count)

In [95]:
# Summary statistics of the token counts, used to inspect the length distribution.
df['text_len'].describe()

count    883636.000000
mean         28.616515
std          38.716551
min           1.000000
25%           7.000000
50%          17.000000
75%          35.000000
max        2088.000000
Name: text_len, dtype: float64

In [96]:
# Keep only reviews of 50 to 100 tokens (both bounds inclusive), then drop
# the helper column, which is not needed after the filter.
within_length_band = df['text_len'].between(50, 100)
df = df[within_length_band].drop(columns=['text_len'])

In [97]:
# Subsample with two seeded splits: the first keeps 30% of the filtered
# reviews, the second keeps half of those (about 15% of the filtered set).
# The discarded parts are bound to `_`.
subsample_30pct, _ = train_test_split(
    df,
    test_size=0.7,
    random_state=29,
)
input_df, _ = train_test_split(
    subsample_30pct,
    test_size=0.5,
    random_state=29,
)

In [98]:
# Number of reviews kept after subsampling.
len(input_df)

15228

In [99]:
# 60/20/20 split: hold out 20% of the rows for test, then take 25% of the
# remaining 80% (i.e. 20% of the total) for validation. The fixed seed keeps
# both splits reproducible.
train_val_df, test_df = train_test_split(
    input_df,
    test_size=0.2,
    random_state=29,
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=29,
)

In [100]:
# Report how many rows ended up in each split.
split_sizes = [
    ("Train Len: ", len(train_df)),
    ("Val Len: ", len(val_df)),
    ("Test Len: ", len(test_df)),
]
for caption, n_rows in split_sizes:
    print(caption, n_rows)

Train Len:  9136
Val Len:  3046
Test Len:  3046


In [101]:
# Rating distribution of the training split.
train_df['overall'].value_counts()

5.0    3986
4.0    1881
3.0    1232
1.0    1181
2.0     856
Name: overall, dtype: int64

In [102]:
# Rating distribution of the validation split.
val_df['overall'].value_counts()

5.0    1376
4.0     611
3.0     400
1.0     382
2.0     277
Name: overall, dtype: int64

In [103]:
# Rating distribution of the test split.
test_df['overall'].value_counts()

5.0    1342
4.0     642
3.0     423
1.0     360
2.0     279
Name: overall, dtype: int64

In [104]:
# Name the index of every split 'key_index'; the index still holds each
# row's label from the pre-split frame.
for split_df in (train_df, val_df, test_df):
    split_df.index.name = 'key_index'

In [105]:
# Persist each split to its own CSV in the working directory; the index is
# written too because to_csv keeps it by default.
amazon_outputs = (
    (train_df, 'AmazonFashion_train.csv'),
    (val_df, 'AmazonFashion_val.csv'),
    (test_df, 'AmazonFashion_test.csv'),
)
for split_df, csv_name in amazon_outputs:
    split_df.to_csv(csv_name)