# This file builds regression models that score essays using different sets of features: embeddings, complexity measures, similarity measures, and their combinations.
## Import library

In [1]:
%run functions

  return warn(


## Load data

In [None]:
# Load the 'final' dataset through DataLoader (not defined in this section;
# presumably provided by the `%run functions` cell — confirm).
loader = DataLoader()
loader.GetData('final')
# NOTE(review): GetShuffled(1000) presumably returns the data shuffled with a
# fixed seed of 1000 — confirm against the DataLoader implementation.
d = loader.GetShuffled(1000)

In [31]:
# Keep only the rows whose score is strictly positive.
d = d[d['score'] > 0]

## Regression
### Split for train and test index

In [32]:
# Row labels of the filtered frame, in their current order.
ind = d.index.tolist()
# The first 2838 labels are the training set; everything after is the test set.
trainIn = ind[:2838]
testIn = ind[2838:]

### Define regression function

In [5]:
def regression(x,plt=False):
    """Fit an XGBoost regressor on one feature set and report its fit quality.

    Uses the notebook globals ``d`` (frame with a ``score`` column) and
    ``trainIn`` / ``testIn`` (row labels of the train and test split).

    Parameters
    ----------
    x : sequence
        Feature vectors, indexable by the labels in ``trainIn`` / ``testIn``.
    plt : bool
        When true, draw predicted vs. true test scores with a dashed
        identity line. (The name shadows the usual matplotlib alias inside
        this function; it is kept for backward compatibility.)

    Returns
    -------
    tuple
        ``(test RMSE, test explained variance in %,
        train RMSE, train explained variance in %)``.
    """
    def _rmse(y_true, y_pred):
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated and absent from newer scikit-learn.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    TrainX = [x[i] for i in trainIn]
    TestX = [x[i] for i in testIn]
    TrainY = d['score'][trainIn]
    TestY = d['score'][testIn]

    model = xgb.XGBRegressor(learning_rate=0.042)
    model.fit(TrainX,TrainY)
    PredY = model.predict(TestX)
    PredTrainY = model.predict(TrainX)

    if plt:
        g = sns.scatterplot(x=TestY,y=PredY)
        g.set_ylabel('predicted score')
        g.set_xlabel('true score')
        g.set_ylim((0,14))
        g.set_xlim((0,14))
        # Reference diagonal: points on it are perfect predictions.
        g = sns.lineplot(x=[0,14],y=[0,14],color='black')
        g.lines[0].set_linestyle("--")

    return (_rmse(TestY,PredY),
            explained_variance_score(TestY,PredY)*100,
            _rmse(TrainY,PredTrainY),
            explained_variance_score(TrainY,PredTrainY)*100)

### Prepare data
#### Embeddings

In [6]:
# Load the precomputed essay embeddings saved with torch
# (the file name suggests mean-pooled vectors — TODO confirm).
xe = torch.load('features/final-meanp.pt')

In [7]:
# Turn every embedding into a flat NumPy vector of length 1024.
xe = [np.reshape(np.array(emb), 1024) for emb in xe]

#### Fine-grained complexity measures

In [8]:
# Start with an empty feature list and read the CSV lines without the header.
xf = []
with open('features/final-winter-fine.csv') as f:
    raw = list(f)[1:]

In [9]:
# Build one entry of `xf` per CSV row so that xf[i] lines up with row label i.
# The membership set is built once; the original rebuilt list(d.index) and
# scanned it linearly for every row.
kept_rows = set(d.index)
for i, l in enumerate(raw):
    # Drop the first two columns and parse the remaining fields as floats.
    l = [float(v) for v in l.rstrip('\n').split(',')[2:]]
    if i in kept_rows:
        # reshape(143) also enforces that the row has exactly 143 features.
        xf.append(np.array(l,dtype=float).reshape(143))
    else:
        # Placeholder for rows absent from `d`, keeping list positions aligned.
        xf.append([0])

#### Large-grained L2SCA measures

In [10]:
# Read the L2SCA output, skipping the header line.
with open('features/final-winter-l2sca.txt') as f:
    l2 = list(f)[1:]

# Map essay id -> its last 14 comma-separated fields parsed as floats.
# The id is the first field truncated at its first '.'.
large = {}
for record in l2:
    essay_id = int(record.split(',', 1)[0].split('.')[0])
    fields = record.rstrip('\n').split(',')
    large[essay_id] = [float(s) for s in fields[-14:]]

# Order the feature vectors by essay id 0..3829.
xl = [large[i] for i in range(3830)]

#### Syntactic similarity measures

In [11]:
# Load every similarity matrix stored in the archive. The context manager
# closes the file once the arrays are in memory, and each matrix is read only
# once (the original loaded each one twice and never closed the archive).
with np.load('features/final_winter_sent.npz') as f:
    sim_list = [f[a] for a in f.files]

# Strict upper triangle of each matrix, i.e. every unordered pair once, in
# row-major order. Matrices are treated as square: rows and columns both
# range over len(m).
sim_raw_list = [m[np.triu_indices(len(m), k=1)] for m in sim_list]

# Mean / max / min pairwise similarity per matrix.
sim_mean = []
sim_max = []
sim_min = []
for pairs in sim_raw_list:
    # NOTE(review): a matrix with exactly one pair (two rows) also gets zeros
    # here because of the `> 1` test — confirm this is intended.
    if len(pairs) > 1:
        sim_mean.append(np.average(pairs))
        sim_max.append(np.max(pairs))
        sim_min.append(np.min(pairs))
    else:
        sim_mean.append(0)
        sim_max.append(0)
        sim_min.append(0)

# Markov clustering on each matrix after zeroing the weaker similarities.
cluster_num = []
cluster_avg = []
for sim, pairs in zip(sim_list, sim_raw_list):
    if len(sim) > 1:
        # Median of the pairwise values, computed once per matrix instead of
        # once per cell as in the original nested loop.
        threshold = np.percentile(pairs, 50)
        sim_thres = sim.copy()
        # Zero every entry (diagonal included) at or below the median.
        sim_thres[sim_thres <= threshold] = 0
        res = mcl.run_mcl(sim_thres,inflation=2)
        clusters = mcl.get_clusters(res)
        cluster_num.append(len(clusters))
        # Average number of rows per cluster.
        cluster_avg.append(len(sim)/len(clusters))
    else:
        cluster_num.append(0)
        cluster_avg.append(0)

In [12]:
# One 5-element similarity vector per essay:
# [sim_max, sim_mean, sim_min, cluster_avg, cluster_num].
xs = [
    [sim_max[i], sim_mean[i], sim_min[i], cluster_avg[i], cluster_num[i]]
    for i in range(3830)
]

#### Embedding + sim

In [13]:
# Embedding vector followed by the similarity features, per essay.
xes = [np.concatenate((xe[i], xs[i])) for i in range(3830)]

#### Fine-grained + sim

In [14]:
# Fine-grained features followed by the similarity features, per essay.
xfs = [np.concatenate((xf[i], xs[i])) for i in range(3830)]

#### Large-grained + sim

In [15]:
# Large-grained features followed by the similarity features, per essay.
xls = [np.concatenate((xl[i], xs[i])) for i in range(3830)]

#### Embedding + fine

In [16]:
# Embedding vector followed by the fine-grained features, per essay.
xef = [np.concatenate((xe[i], xf[i])) for i in range(3830)]

#### Embedding + large

In [17]:
# Embedding vector followed by the large-grained features, per essay.
xel = [np.concatenate((xe[i], xl[i])) for i in range(3830)]

#### Embedding + fine + sim

In [18]:
# Embedding + fine-grained vector followed by the similarity features, per essay.
xefs = [np.concatenate((xef[i], xs[i])) for i in range(3830)]

#### Embedding + large + sim

In [19]:
# Embedding + large-grained vector followed by the similarity features, per essay.
xels = [np.concatenate((xel[i], xs[i])) for i in range(3830)]

### Get performances

In [33]:
# Feature sets to evaluate; `names` and `x` are parallel lists.
names = ['xl','xf','xs','xls','xfs','xe','xefs']
x = [xl,xf,xs,xls,xfs,xe,xefs]

# Run the regression once per feature set and collect the four metrics.
trainRMSE, trainEV, testRMSE, testEV = [],[],[],[]
for feature_set in x:
    test_rmse, test_ev, train_rmse, train_ev = regression(feature_set)
    testRMSE.append(test_rmse)
    testEV.append(test_ev)
    trainRMSE.append(train_rmse)
    trainEV.append(train_ev)

# One row per feature set, one column per metric.
r = pd.DataFrame(
    {
        'test RMSE': testRMSE,
        'test EV': testEV,
        'train RMSE': trainRMSE,
        'trainEV': trainEV,
    },
    index=names,
)

In [34]:
# Display the per-feature-set results table.
r

Unnamed: 0,test RMSE,test EV,train RMSE,trainEV
xl,1.156707,31.824992,0.787036,66.745279
xf,0.750017,71.609637,0.487064,87.916827
xs,1.110637,36.683227,0.877543,58.438118
xls,0.967426,52.233496,0.641521,78.279841
xfs,0.745341,72.005501,0.483758,88.093282
xe,0.601101,81.981004,0.259017,97.356732
xefs,0.490159,88.229262,0.241503,97.833462


In [44]:
# Print the training RMSE of every feature set, rounded to three decimals.
for name in names:
    print('{:.3f}'.format(r.loc[name, 'train RMSE']))

0.787
0.487
0.878
0.642
0.484
0.259
0.242
