In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#### Load ASAP Dataset - Choose Prompt 1

In [2]:
# Essay set (prompt) to extract from the full training file.
PROMPT = 1

# The source file is tab-separated and is read with ISO-8859-1 encoding.
df = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding="ISO-8859-1")

# Keep only the rows of the chosen prompt and the three columns used below.
asap = df.loc[df['essay_set'] == PROMPT, ['essay_id', 'essay', 'domain1_score']]

In [3]:
# Preview the first five rows of the selected prompt's essays.
asap.head(n=5)

Unnamed: 0,essay_id,essay,domain1_score
0,1,"Dear local newspaper, I think effects computer...",8
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,"Dear @LOCATION1, I know having computers has a...",8


In [4]:
# Copy the 'domain1_score' column of `asap` into a standalone NumPy array.
y = asap['domain1_score'].to_numpy(copy=True)

# Number of scores collected (one per essay row in `asap`).
y.shape[0]

1783

In [5]:
# Persist the score vector. The file name is derived from PROMPT (as the
# train/test TSV names are) so that selecting a different prompt cannot
# overwrite another prompt's saved scores.
joblib.dump(y, "files/score_asap{}".format(PROMPT))

['files/score_asap1']

#### Create Train Test Split

In [6]:
# Fraction of essays held out for testing, and the seed that makes the
# partition reproducible from run to run.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Positional row numbers are split together with the data so the very same
# partition can later be used to slice other arrays by row position.
indices = np.arange(len(asap))
X_train, X_test, Y_train, Y_test, idx_train, idx_test = train_test_split(
    asap, y, indices, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [7]:
# Number of rows assigned to the training partition.
idx_train.shape[0]

1426

In [8]:
# Number of rows assigned to the test partition.
idx_test.shape[0]

357

#### Get Training Data

In [9]:
# idx_train holds row positions (not index labels), so select positionally.
asap_train = asap.iloc[idx_train, :]

# Row count of the training frame.
asap_train.shape[0]

1426

In [10]:
# Save the training essays as a tab-separated file named after the prompt;
# the DataFrame index is not written.
filename = 'ASAP{} Train Set.tsv'.format(PROMPT)
asap_train.to_csv(filename, sep='\t', index=False)

#### Get Testing Data

In [11]:
# idx_test holds row positions (not index labels), so select positionally.
asap_test = asap.iloc[idx_test, :]

# Row count of the test frame.
asap_test.shape[0]

357

In [12]:
# Save the test essays as a tab-separated file named after the prompt;
# the DataFrame index is not written.
filename = 'ASAP{} Test Set.tsv'.format(PROMPT)
asap_test.to_csv(filename, sep='\t', index=False)

#### Get Training Features

In [13]:
# Load the stored feature array for the selected prompt. The path is built
# from PROMPT instead of hard-coding prompt 1, so the features always belong
# to the same prompt as the essays in `asap`.
feats = joblib.load('files/asap{}_features_776'.format(PROMPT))

# Rows are picked by position below, which is only meaningful when the array
# holds exactly one row per essay; fail fast rather than silently pairing
# essays with the wrong features.
# NOTE(review): this checks the row count only; matching row ORDER is assumed.
assert len(feats) == len(asap), (
    'feature rows ({}) do not match essay rows ({})'.format(len(feats), len(asap))
)

# Take the training rows and drop any length-1 axes.
features_train = feats[idx_train].squeeze()

In [14]:
# Dimensions of the training feature array.
np.shape(features_train)

(1426, 776)

In [15]:
# Persist the training features. The file name is derived from PROMPT so that
# selecting a different prompt cannot overwrite another prompt's saved features.
joblib.dump(features_train, 'files/asap{}_features_train_776'.format(PROMPT))

['files/asap1_features_train_776']

#### Get Testing Features

In [16]:
# Take the test rows of the feature array by position and drop length-1 axes.
features_test = np.squeeze(feats[idx_test])

In [17]:
# Dimensions of the test feature array.
np.shape(features_test)

(357, 776)

In [18]:
# Persist the test features. The file name is derived from PROMPT so that
# selecting a different prompt cannot overwrite another prompt's saved features.
joblib.dump(features_test, 'files/asap{}_features_test_776'.format(PROMPT))

['files/asap1_features_test_776']