In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#### Load ASAP Dataset - Choose Prompt 7

In [2]:
# Essay set to extract from the full training file.
PROMPT = 7

# The file is tab-separated and is read with the ISO-8859-1 encoding.
df = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding="ISO-8859-1")

# Keep only the rows of the chosen essay set and the three columns used
# downstream. The original row labels of `df` are retained.
asap = df.loc[df['essay_set'] == PROMPT, ['essay_id', 'essay', 'domain1_score']]

In [3]:
# Preview the first five rows of the filtered frame.
asap.head(5)

Unnamed: 0,essay_id,essay,domain1_score
10684,17834,Patience is when your waiting .I was patience ...,15
10685,17836,"I am not a patience person, like I cant sit i...",13
10686,17837,One day I was at basketball practice and I was...,15
10687,17838,I going to write about a time when I went to t...,17
10688,17839,It can be very hard for somebody to be patient...,13


In [4]:
# Target vector: the 'domain1_score' column of `asap` as a NumPy array,
# in the same row order as `asap`.
y = np.array(asap['domain1_score'])

# Number of scores (displayed as the cell output).
y.shape[0]

1569

In [5]:
# Persist the full score vector (all rows, before the train/test split).
joblib.dump(value=y, filename="files/score_asap7")

['files/score_asap7']

#### Create Train Test Split

In [6]:
# Split positional row indices alongside the data so that the same partition
# can be applied later to other arrays that share the row order of `asap`.
# `train_test_split` is imported in the notebook's top import cell.
indices = np.arange(len(asap))

# 20% of the rows go to the test partition; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test, idx_train, idx_test = train_test_split(
    asap, y, indices, test_size=0.2, random_state=42
)

In [7]:
# Number of rows assigned to the training partition.
idx_train.shape[0]

1255

In [8]:
# Number of rows assigned to the test partition.
idx_test.shape[0]

314

#### Get Training Data

In [9]:
# `idx_train` holds positions (not index labels), hence positional selection.
asap_train = asap.iloc[idx_train]

# Row count of the training subset (displayed as the cell output).
asap_train.shape[0]

1255

In [10]:
# Write the training subset as a tab-separated file, without the index column.
filename = f'ASAP{PROMPT} Train Set.tsv'
asap_train.to_csv(filename, sep='\t', index=False)

#### Get Testing Data

In [10]:
# `idx_test` holds positions (not index labels), hence positional selection.
asap_test = asap.iloc[idx_test]

# Row count of the test subset (displayed as the cell output).
asap_test.shape[0]

314

In [11]:
# Write the test subset as a tab-separated file, without the index column.
filename = f'ASAP{PROMPT} Test Set.tsv'
asap_test.to_csv(filename, sep='\t', index=False)

#### Get Training Features

In [16]:
# Load the precomputed feature array for this prompt.
# NOTE: joblib.load unpickles the file, so only load files you trust.
# NOTE(review): the file is produced outside this notebook; its rows are
# presumably in the same order as `asap` -- confirm against the producer.
feats = joblib.load('files/asap7_features_776')

# `idx_train` / `idx_test` are positions into `asap`. Fail loudly if the
# feature array does not have one row per essay, instead of silently
# selecting the wrong rows or raising an opaque IndexError.
assert len(feats) == len(asap), (
    f"feature rows ({len(feats)}) do not match number of essays ({len(asap)})"
)

# Select the training rows and drop any singleton dimensions.
features_train = feats[idx_train].squeeze()

In [17]:
# Shape of the training feature array (displayed as the cell output).
np.shape(features_train)

(1255, 776)

In [20]:
# Persist the training feature array.
joblib.dump(value=features_train, filename='files/asap7_features_train_776')

['files/asap7_features_train_776']

#### Get Testing Features

In [21]:
# Select the test rows by position and drop any singleton dimensions.
features_test = np.squeeze(feats[idx_test])

In [22]:
# Shape of the test feature array (displayed as the cell output).
np.shape(features_test)

(314, 776)

In [23]:
# Persist the test feature array.
joblib.dump(value=features_test, filename='files/asap7_features_test_776')

['files/asap7_features_test_776']