# Integrating data from FACETS

<br>

**Language: Python**

This notebook shows the process for reading in the output from the FACETS program, i.e., the results of the many-facet Rasch measurement (MFRM) models, and integrating it with the existing survey data. See dissertation sections 4.3 and 6.2.1 for a description of the MFRM models and of these data in particular. The final dataframe is then split into the control data and the main experimental data.

**Notebook contents:**
- [Initial setup](#Initial-setup)
- [MFRM integration](#MFRM-integration)
- [Split into main and control dataframes](#Split-into-main-and-control-dataframes)
- [Check main ratings data](#Check-main-ratings-data)

## Initial setup

In [1]:
# Import necessary modules

import pandas as pd
import pprint
from IPython.core.interactiveshell import InteractiveShell
import csv
import joblib
import math
import numpy as np

In [2]:
# Set preferred notebook format
# These settings apply to the whole kernel session, i.e. to every cell below.
# NOTE(review): %pprint is a toggle, so re-running this cell in the same kernel
# presumably switches pretty printing back on - restart the kernel before a full re-run.

%pprint # Turn off pretty printing
# With "all", every bare expression in a cell is rendered, which is why several
# cells below end with more than one expression (e.g. two len(...) calls).
InteractiveShell.ast_node_interactivity = "all" # Show all output, not just last item
pd.set_option('display.max_columns', 999) # Allow viewing of all columns
# Hides the warning raised when columns are assigned on filtered frames
# (as done for base_texts / norm_texts further down).
pd.options.mode.chained_assignment = None # Suppress SettingWithCopyWarning

Pretty printing has been turned OFF


In [3]:
# Read in ratings_long data
# One row per rated response (text_id x rater_id) with the raw band ratings and rater metadata.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only load
# pickles written by this project's own notebooks.

ratings_long = joblib.load('../docs/ratings_long.pkl')
ratings_long.head(3)

Unnamed: 0,response_id,text_id,rater_id,CEFR,TR,CC,LR,GRA,HOL,soph,soph_type,accuracy,comment,duration,text_order,gender,age,education,TESOL_cert,ELT_exp_yrs,IELTS_status,current,IELTS_exp_yrs,rater_L1,rater_English,student_L1,L1_range,prof_range
0,1,Text1,R1,B1,4.3,4.3,4.3,3.7,3.4,low,,low,The candidate uses only basic vocabulary repet...,q3,C1_B1_B2,Female,30-39,BA,diploma,6-10,examiner,former,6-10,Spanish,L2,Arabic,wide,wide
1,2,Text15,R1,B2,5.3,6.3,6.7,6.7,6.3,mid,non_col,low,The candidate takes risks and attempts to use ...,q3,C1_B1_B2,Female,30-39,BA,diploma,6-10,examiner,former,6-10,Spanish,L2,Arabic,wide,wide
2,3,Text23,R1,C1,7.3,6.3,8.0,7.3,7.6,mid,col,low,The candidate uses a wide range of vocabulary ...,q3,C1_B1_B2,Female,30-39,BA,diploma,6-10,examiner,former,6-10,Spanish,L2,Arabic,wide,wide


## MFRM integration

In [4]:
# Read in mfrm data (from Facets)
# One row per observation (text_id x rater_id x band). Only the 'Exp', 'text_id',
# 'rater_id' and 'band' columns are used in the cells that follow.

mfrm = pd.read_csv("../docs/mfrm_output.csv")
mfrm.head()

Unnamed: 0,Obs,Stp,Exp,Res,Var,StRes,Wt,LProb,Measure,Displ,Status,MPCat,E1,E2,E3,M1,M2,M3,text_id,rater_id,band
0,43,43,49.36,-6.36,20.84,-1.39,1,-2.02,-0.63,-0.31,1,53,1,1,1,-0.64,0,-0.01,Text1,R1,TR
1,43,43,47.76,-4.76,22.86,-0.99,1,-1.61,-0.71,-0.21,1,50,1,1,2,-0.64,0,0.07,Text1,R1,CC
2,43,43,52.56,-9.56,18.57,-2.22,1,-3.36,-0.46,-0.51,1,53,1,1,3,-0.64,0,-0.17,Text1,R1,LR
3,37,37,47.84,-10.84,22.77,-2.27,1,-4.5,-0.7,-0.48,1,50,1,1,4,-0.64,0,0.06,Text1,R1,GRA
4,34,34,48.1,-14.1,22.5,-2.97,1,-5.69,-0.69,-0.63,1,50,1,1,5,-0.64,0,0.05,Text1,R1,HOL


In [5]:
# Keep only relevant columns

# Select the four columns needed downstream and carry 'Exp' forward as 'fair_score'
mfrm = (
    mfrm.loc[:, ['Exp', 'text_id', 'rater_id', 'band']]
        .rename(columns={'Exp': 'fair_score'})
)

In [6]:
# Convert score to appropriate scale

# Divide by 10 and keep one decimal place (e.g. 49.36 -> 4.9).
# Not idempotent: running this cell twice divides by 10 twice.
mfrm['fair_score'] = mfrm['fair_score'].div(10).round(1)
mfrm.head()

Unnamed: 0,fair_score,text_id,rater_id,band
0,4.9,Text1,R1,TR
1,4.8,Text1,R1,CC
2,5.3,Text1,R1,LR
3,4.8,Text1,R1,GRA
4,4.8,Text1,R1,HOL


In [7]:
# Change format so all scores in one cell

# Pair each band label with its fair score: ('TR', 4.9), ('CC', 4.8), ...
mfrm['band_score'] = [(band, score) for band, score in zip(mfrm['band'], mfrm['fair_score'])]

# One row per (text_id, rater_id); that pair's tuples are gathered into a list,
# keeping the row order they had in the FACETS output.
mfrm = (
    mfrm.groupby(['text_id', 'rater_id'])['band_score']
        .apply(list)
        .reset_index()
)

mfrm.head(10)

Unnamed: 0,text_id,rater_id,band_score
0,Text1,R1,"[(TR, 4.9), (CC, 4.8), (LR, 5.3), (GRA, 4.8), ..."
1,Text1,R11,"[(TR, 4.9), (CC, 4.8), (LR, 5.3), (GRA, 4.8), ..."
2,Text1,R19,"[(TR, 4.9), (CC, 4.8), (LR, 5.3), (GRA, 4.8), ..."
3,Text1,R40,"[(TR, 4.9), (CC, 4.8), (LR, 5.3), (GRA, 4.8), ..."
4,Text10,R12,"[(TR, 5.1), (CC, 5.0), (LR, 5.4), (GRA, 5.0), ..."
5,Text10,R14,"[(TR, 5.1), (CC, 5.0), (LR, 5.4), (GRA, 5.0), ..."
6,Text10,R20,"[(TR, 5.1), (CC, 5.0), (LR, 5.4), (GRA, 5.0), ..."
7,Text10,R42,"[(TR, 5.1), (CC, 5.0), (LR, 5.4), (GRA, 5.0), ..."
8,Text11,R24,"[(TR, 6.7), (CC, 6.5), (LR, 7.1), (GRA, 6.5), ..."
9,Text11,R28,"[(TR, 6.7), (CC, 6.5), (LR, 7.1), (GRA, 6.5), ..."


In [8]:
# Create temporary columns of text_id and rater_id for easier mapping

# Both frames get the same (text_id, rater_id) tuple key in a 'temp' column
for frame in (mfrm, ratings_long):
    frame['temp'] = list(zip(frame['text_id'], frame['rater_id']))

In [9]:
# Items with missing data (removed as 'unexpected responses' in Facets)

# True for text/rater pairs that have fewer than five band scores
incomplete = mfrm['band_score'].str.len() < 5
mfrm.loc[incomplete, :].head()
int(incomplete.sum())

Unnamed: 0,text_id,rater_id,band_score,temp
17,Text13,R22,"[(TR, 6.5), (LR, 7.0), (GRA, 6.3), (HOL, 6.4)]","(Text13, R22)"
48,Text2,R43,"[(TR, 4.9), (CC, 4.8), (GRA, 4.8), (HOL, 4.8)]","(Text2, R43)"
51,Text20,R25,"[(LR, 6.4), (GRA, 5.7)]","(Text20, R25)"
56,Text21,R23,"[(CC, 8.4), (LR, 8.9), (GRA, 8.4), (HOL, 8.4)]","(Text21, R23)"
75,Text26,R13,"[(TR, 8.7), (CC, 8.5), (LR, 9.0)]","(Text26, R13)"


12

In [10]:
# Add NaN placeholders for any missing band scores
# (NaN is replaced by 'NA', the code for missing data in R, just before export)

# Create function
def add_missing(score_list):
    """Return the five band scores in canonical order, filling gaps with NaN.

    Parameters
    ----------
    score_list : list of (band, score) tuples
        Scores available for one text/rater pair. Any of the bands
        'TR', 'CC', 'LR', 'GRA', 'HOL' may be absent.

    Returns
    -------
    list of (band, score) tuples
        Always exactly five entries ordered TR, CC, LR, GRA, HOL; a band that
        is absent from ``score_list`` gets ``np.nan`` as its score. The input
        list is not modified.
    """
    bands = ['TR','CC','LR','GRA','HOL']
    # Look scores up by band name so the result does not depend on the order
    # (or number) of the incoming tuples.
    scores = dict(score_list)
    return [(band, scores.get(band, np.nan)) for band in bands]

In [11]:
# Apply function to band score column and check one example

# Pad every score list to the full five bands
mfrm['band_score'] = mfrm['band_score'].map(add_missing)

# Spot-check a pair that had missing bands
is_example = (mfrm['text_id'] == 'Text28') & (mfrm['rater_id'] == 'R45')
mfrm.loc[is_example, 'band_score'].tolist()

[[('TR', nan), ('CC', 7.0), ('LR', 7.7), ('GRA', nan), ('HOL', 7.0)]]

In [12]:
# Create dictionary for mapping

# Keys are (text_id, rater_id) tuples, values are the padded band-score lists
mfrm_dict = dict(zip(mfrm['temp'], mfrm['band_score']))

In [13]:
# Map to ratings_long df

# Rows whose (text_id, rater_id) key is absent from mfrm_dict get NaN in 'fair'
ratings_long['fair'] = ratings_long['temp'].map(mfrm_dict)
ratings_long = ratings_long.drop(columns='temp')

In [14]:
# Keep only the ratings (text/rater pairs) that were used in the MFRM model

# Rows without an MFRM entry have NaN in 'fair'. Take an explicit copy so the
# columns added to ratings_long in later cells are set on an independent frame
# rather than on a filtered slice of the original.
ratings_long = ratings_long.loc[ratings_long['fair'].notna()].copy()

In [15]:
# Create new 'fair' columns

# Every 'fair' entry is a five-item list ordered TR, CC, LR, GRA, HOL;
# pull the score out of each slot into its own '<band>_fair' column.
for slot, band in enumerate(['TR', 'CC', 'LR', 'GRA', 'HOL']):
    ratings_long[band + '_fair'] = [pairs[slot][1] for pairs in ratings_long['fair']]

del ratings_long['fair']

In [16]:
# Spot-check one text: the *_fair columns can still contain NaN at this point
# (they are imputed in the next step).
ratings_long.loc[ratings_long.text_id == 'Text31']

Unnamed: 0,response_id,text_id,rater_id,CEFR,TR,CC,LR,GRA,HOL,soph,soph_type,accuracy,comment,duration,text_order,gender,age,education,TESOL_cert,ELT_exp_yrs,IELTS_status,current,IELTS_exp_yrs,rater_L1,rater_English,student_L1,L1_range,prof_range,TR_fair,CC_fair,LR_fair,GRA_fair,HOL_fair
21,22,Text31,R8,B1,6.7,6.0,6.0,5.7,5.5,,,,Band 6: The candidate uses an adequate range o...,q3,B2_C1_B1,Male,50-59,MA,diploma,>20,examiner,current,11-20,Pashtu,L2,Arabic,narrow,narrow,,4.8,5.3,4.8,4.8
87,88,Text31,R30,B1,5.0,5.0,5.0,5.0,5.0,,,,limited range and there seemed to be a tendenc...,q2,B2_C1_B1,Male,30-39,MA,diploma,11-20,examiner,former,1-2,English,L1,Punjabi,narrow,narrow,5.0,4.8,5.3,4.8,4.8
90,91,Text31,R31,B1,4.0,4.7,5.0,5.0,4.5,,,,Most of the more sophisticated lexis is lifted...,q2,B2_C1_B1,Female,60-69,MA,diploma,>20,examiner,former,6-10,English,L1,various,wide,wide,5.0,4.8,5.3,4.8,4.8
99,100,Text31,R34,B1,5.3,5.3,5.3,4.7,5.0,,,,"The candidate is able to express him/herself, ...",q1,B1_C1_B2,Male,40-49,PhD,certificate,>20,examiner,former,1-2,English,L1,Arabic,wide,wide,5.0,4.8,5.3,4.8,4.8


In [17]:
# Impute NA scores - fair scores are the same for all identical text_id regardless of the rater

fair_cols = ['TR_fair','CC_fair','LR_fair','GRA_fair','HOL_fair']

# Order rows by text and renumber the index
ratings_long = ratings_long.sort_values('text_id').reset_index(drop=True)

# For every text_id, take the first non-missing value of each fair column and
# broadcast it back to all rows of that text.
text_level_fair = ratings_long.groupby('text_id')[fair_cols].transform('first')

# Fill only the gaps. Working column by column means a text still gets imputed
# when none of its rows is complete across all five bands, and existing scores
# are never overwritten (a text lacking a complete row previously had all of
# its fair scores replaced by NaN).
ratings_long[fair_cols] = ratings_long[fair_cols].fillna(text_level_fair)

## Split into main and control dataframes

Create a small dataframe comprising the original, normalized, and base texts.

In [18]:
# Separate out base texts

# Ratings of the three base texts, tagged type='base' and cut down to the
# identifier, CEFR and TR/CC/LR/GRA (raw and fair) columns
base_texts = (
    ratings_long[ratings_long['text_id'].isin(['Text1','Text11','Text21'])]
    .assign(type='base')
    [['type','text_id','rater_id','CEFR','TR','CC','LR','GRA','TR_fair','CC_fair','LR_fair','GRA_fair']]
)

In [19]:
# Round all ratings down to integer (to match original texts rating options)

rating_cols = ['TR','CC','LR','GRA','TR_fair','CC_fair','LR_fair','GRA_fair']
# Element-wise floor over the raw and fair score columns (NaN stays NaN)
base_texts[rating_cols] = np.floor(base_texts[rating_cols])

In [20]:
# Create 'overall' score and 'overall_fair' score columns

def round_down(x, a):
    """Round ``x`` down to the nearest multiple of ``a``.

    Parameters
    ----------
    x : number
        Value to round.
    a : number
        Step size, e.g. 0.5 for half-band scores. Must be non-zero.

    Returns
    -------
    number
        The largest multiple of ``a`` that is <= ``x``. NaN and +/-inf are
        returned unchanged so that missing scores propagate instead of
        raising inside ``math.floor``.
    """
    if not math.isfinite(x):
        return x
    return math.floor(x / a) * a

# Row mean of the four criteria, rounded down to the nearest half band,
# once for the raw ratings and once for the fair scores
for target, parts in (('overall', ['TR','CC','LR','GRA']),
                      ('overall_fair', ['TR_fair','CC_fair','LR_fair','GRA_fair'])):
    base_texts[target] = base_texts[parts].mean(axis=1).apply(round_down, args=(0.5,))

In [21]:
# Separate out normalized texts

# Ratings of the three normalized texts, tagged type='norm' and cut down to the
# identifier, CEFR and TR/CC/LR/GRA (raw and fair) columns
norm_texts = (
    ratings_long[ratings_long['text_id'].isin(['Text31','Text32','Text33'])]
    .assign(type='norm')
    [['type','text_id','rater_id','CEFR','TR','CC','LR','GRA','TR_fair','CC_fair','LR_fair','GRA_fair']]
)

In [22]:
# Show the normalized-text ratings before they are floored to whole bands
norm_texts

Unnamed: 0,type,text_id,rater_id,CEFR,TR,CC,LR,GRA,TR_fair,CC_fair,LR_fair,GRA_fair
101,norm,Text31,R31,B1,4.0,4.7,5.0,5.0,5.0,4.8,5.3,4.8
102,norm,Text31,R8,B1,6.7,6.0,6.0,5.7,5.0,4.8,5.3,4.8
103,norm,Text31,R30,B1,5.0,5.0,5.0,5.0,5.0,4.8,5.3,4.8
104,norm,Text31,R34,B1,5.3,5.3,5.3,4.7,5.0,4.8,5.3,4.8
105,norm,Text32,R20,B2,6.3,6.7,7.3,6.7,6.5,6.3,7.0,6.3
106,norm,Text32,R38,B2,8.0,7.3,7.0,6.7,6.5,6.3,7.0,6.3
107,norm,Text32,R9,B2,6.3,6.3,5.3,5.3,6.5,6.3,7.0,6.3
108,norm,Text32,R12,B2,6.7,6.3,6.7,5.7,6.5,6.3,7.0,6.3
109,norm,Text33,R47,C1,7.3,6.3,7.7,7.7,8.0,7.8,8.5,7.8
110,norm,Text33,R3,C1,8.3,8.3,8.3,8.3,8.0,7.8,8.5,7.8


In [23]:
# Round all ratings down to integer (to match original texts rating options)

rating_cols = ['TR','CC','LR','GRA','TR_fair','CC_fair','LR_fair','GRA_fair']
# Element-wise floor over the raw and fair score columns (NaN stays NaN)
norm_texts[rating_cols] = np.floor(norm_texts[rating_cols])

In [24]:
# Create 'overall' score and 'overall_fair' score columns

# Row mean of the four criteria, rounded down to the nearest half band,
# once for the raw ratings and once for the fair scores
for target, parts in (('overall', ['TR','CC','LR','GRA']),
                      ('overall_fair', ['TR_fair','CC_fair','LR_fair','GRA_fair'])):
    norm_texts[target] = norm_texts[parts].mean(axis=1).apply(round_down, args=(0.5,))

In [25]:
# Create original texts series

# One hand-entered row per original IELTS text. The values follow the column
# order of norm_texts: type, text_id, rater_id, CEFR, TR, CC, LR, GRA,
# TR_fair, CC_fair, LR_fair, GRA_fair, overall, overall_fair.
# The *_fair entries simply repeat the raw scores for these rows.
orig_B1 = pd.Series(['orig','IELTS_B1','IELTS','B1',4,4,4,4,4,4,4,4,4,4])
orig_B2 = pd.Series(['orig','IELTS_B2','IELTS','B2',7,6,7,6,7,6,7,6,6.5,6.5])
orig_C1 = pd.Series(['orig','IELTS_C1','IELTS','C1',8,8,8,8,8,8,8,8,8,8])

orig_texts = pd.DataFrame([orig_B1,orig_B2,orig_C1])
# The header is borrowed from norm_texts, so this cell must run after norm_texts
# has received its 'overall' and 'overall_fair' columns (14 columns in total).
orig_texts.columns = norm_texts.columns
orig_texts

Unnamed: 0,type,text_id,rater_id,CEFR,TR,CC,LR,GRA,TR_fair,CC_fair,LR_fair,GRA_fair,overall,overall_fair
0,orig,IELTS_B1,IELTS,B1,4,4,4,4,4,4,4,4,4.0,4.0
1,orig,IELTS_B2,IELTS,B2,7,6,7,6,7,6,7,6,6.5,6.5
2,orig,IELTS_C1,IELTS,C1,8,8,8,8,8,8,8,8,8.0,8.0


In [26]:
# Combine into one dataframe

# Stack the rated base texts, the rated normalized texts and the original IELTS
# rows, order them by CEFR level and renumber the index.
# NOTE(review): base_texts must still be the ratings frame here; a later cell
# reuses the name base_texts for the text-index table, so re-running this cell
# out of order would concatenate the wrong frame.
control = pd.concat([base_texts,norm_texts,orig_texts]).sort_values('CEFR').reset_index(drop=True)
control.head()

Unnamed: 0,type,text_id,rater_id,CEFR,TR,CC,LR,GRA,TR_fair,CC_fair,LR_fair,GRA_fair,overall,overall_fair
0,base,Text1,R1,B1,4.0,4.0,4.0,3.0,4.0,4.0,5.0,4.0,3.5,4.0
1,orig,IELTS_B1,IELTS,B1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,norm,Text31,R34,B1,5.0,5.0,5.0,4.0,5.0,4.0,5.0,4.0,4.5,4.5
3,norm,Text31,R30,B1,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,5.0,4.5
4,norm,Text31,R31,B1,4.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.5,4.5


In [27]:
# Mean raw and fair scores per text. numeric_only=True restricts the mean to the
# score columns; without it pandas >= 2.0 raises on the string columns
# (type, rater_id, CEFR) instead of silently dropping them.
control.groupby('text_id').mean(numeric_only=True)

Unnamed: 0_level_0,TR,CC,LR,GRA,TR_fair,CC_fair,LR_fair,GRA_fair,overall,overall_fair
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
IELTS_B1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
IELTS_B2,7.0,6.0,7.0,6.0,7.0,6.0,7.0,6.0,6.5,6.5
IELTS_C1,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
Text1,4.75,4.5,5.0,4.5,4.0,4.0,5.0,4.0,4.5,4.0
Text11,6.5,6.5,6.75,6.25,6.0,6.0,7.0,6.0,6.375,6.0
Text21,7.25,8.0,8.25,7.75,8.0,8.0,8.0,8.0,7.75,8.0
Text31,5.0,5.0,5.25,4.75,5.0,4.0,5.0,4.0,4.875,4.5
Text32,6.5,6.25,6.25,5.5,6.0,6.0,7.0,6.0,6.0,6.0
Text33,7.75,7.25,7.5,7.5,8.0,7.0,8.0,7.0,7.375,7.5


### Add other text indices from the normalization notebook

In [28]:
# Load original and normalized texts and map to control dataframe

# Each pickle is expected to hold two rows. The ids are assigned by position,
# so they do not depend on the index labels stored in the pickle, and a frame
# with a different number of rows fails loudly instead of yielding NaN ids.
# NOTE(review): assumes row 0 is the original IELTS text and row 1 its
# normalized version - confirm against the normalization notebook.
B1_orig_and_norm = joblib.load('../docs/B1_orig&norm.pkl')
B1_orig_and_norm['text_id'] = ['IELTS_B1','Text31']

B2_orig_and_norm = joblib.load('../docs/B2_orig&norm.pkl')
B2_orig_and_norm['text_id'] = ['IELTS_B2','Text32']

C1_orig_and_norm = joblib.load('../docs/C1_orig&norm.pkl')
C1_orig_and_norm['text_id'] = ['IELTS_C1','Text33']

In [29]:
# Text indices kept for the control comparison, in display order
index_cols = ['text_id','text_len','MLC','CNC','vocD','AG',
              'bigram_range','mean_MI', 'mean_tscore','absent_prop','col_errors_per_100',
              'correct_cols_per_100','K1to2_p','K3to9_p','K10to16_p']

# Stack the B1, B2 and C1 pairs (original + normalized) and keep only those indices
orig_and_norm_texts = pd.concat([B1_orig_and_norm,B2_orig_and_norm,C1_orig_and_norm])[index_cols]
orig_and_norm_texts

Unnamed: 0,text_id,text_len,MLC,CNC,vocD,AG,bigram_range,mean_MI,mean_tscore,absent_prop,col_errors_per_100,correct_cols_per_100,K1to2_p,K3to9_p,K10to16_p
0,IELTS_B1,172,6.615,0.615,47.87,0.381,0.088,2.429,115.839,0.979,8.14,4.651,0.75,0.25,0.0
1,Text31,250,6.41,0.641,48.723,0.379,0.095,2.522,111.672,0.985,8.0,4.8,0.75,0.25,0.0
0,IELTS_B2,349,7.286,0.857,46.781,0.9,0.164,2.94,162.093,0.965,3.152,8.883,0.742,0.258,0.0
1,Text32,250,7.314,0.8,43.457,0.938,0.171,2.878,153.301,0.964,3.2,8.8,0.727,0.273,0.0
0,IELTS_C1,254,11.773,2.045,70.996,1.443,0.122,3.26,93.617,0.926,1.969,12.992,0.545,0.364,0.091
1,Text33,250,11.591,2.045,70.776,1.455,0.117,3.229,92.031,0.927,2.0,13.2,0.545,0.364,0.091


In [30]:
# Load base texts and map to control dataframe

# NOTE(review): from here on the name base_texts refers to the per-text index
# table, no longer to the ratings frame built earlier under the same name.
base_texts = joblib.load('../docs/base_texts_processed.pkl')
# Assign ids by position so the result does not depend on the index labels
# stored in the pickle; a frame without exactly three rows raises immediately.
base_texts['text_id'] = ['Text1','Text11','Text21']

# Express the collocation counts per 100 words of text
base_texts['correct_cols_per_100'] = (base_texts['correct_cols']/base_texts['text_len'])*100
base_texts['col_errors_per_100'] = (base_texts['col_errors']/base_texts['text_len'])*100

base_texts = base_texts[['text_id','text_len','MLC','CNC','vocD','AG','bigram_range',
                         'mean_MI','mean_tscore','absent_prop','col_errors_per_100','correct_cols_per_100',
                         'K1to2_p', 'K3to9_p','K10to16_p']]
base_texts

Unnamed: 0,text_id,text_len,MLC,CNC,vocD,AG,bigram_range,mean_MI,mean_tscore,absent_prop,col_errors_per_100,correct_cols_per_100,K1to2_p,K3to9_p,K10to16_p
0,Text1,250,6.436,0.667,48.559,0.379,0.094,2.526,106.16,0.987,7.2,4.8,0.75,0.25,0.0
1,Text11,250,7.371,0.829,44.72,0.936,0.169,2.868,157.984,0.963,4.8,8.8,0.727,0.273,0.0
2,Text21,250,11.591,2.045,70.302,1.455,0.117,3.233,93.148,0.928,2.4,12.8,0.562,0.344,0.094


In [31]:
# Join all texts

# Stack the base-text indices on top of the original/normalized-text indices.
# The original row labels are kept, so the index contains repeated values;
# rows are identified by the text_id column instead.
all_texts = pd.concat([base_texts,orig_and_norm_texts])
all_texts

Unnamed: 0,text_id,text_len,MLC,CNC,vocD,AG,bigram_range,mean_MI,mean_tscore,absent_prop,col_errors_per_100,correct_cols_per_100,K1to2_p,K3to9_p,K10to16_p
0,Text1,250,6.436,0.667,48.559,0.379,0.094,2.526,106.16,0.987,7.2,4.8,0.75,0.25,0.0
1,Text11,250,7.371,0.829,44.72,0.936,0.169,2.868,157.984,0.963,4.8,8.8,0.727,0.273,0.0
2,Text21,250,11.591,2.045,70.302,1.455,0.117,3.233,93.148,0.928,2.4,12.8,0.562,0.344,0.094
0,IELTS_B1,172,6.615,0.615,47.87,0.381,0.088,2.429,115.839,0.979,8.14,4.651,0.75,0.25,0.0
1,Text31,250,6.41,0.641,48.723,0.379,0.095,2.522,111.672,0.985,8.0,4.8,0.75,0.25,0.0
0,IELTS_B2,349,7.286,0.857,46.781,0.9,0.164,2.94,162.093,0.965,3.152,8.883,0.742,0.258,0.0
1,Text32,250,7.314,0.8,43.457,0.938,0.171,2.878,153.301,0.964,3.2,8.8,0.727,0.273,0.0
0,IELTS_C1,254,11.773,2.045,70.996,1.443,0.122,3.26,93.617,0.926,1.969,12.992,0.545,0.364,0.091
1,Text33,250,11.591,2.045,70.776,1.455,0.117,3.229,92.031,0.927,2.0,13.2,0.545,0.364,0.091


In [32]:
# Combine with ratings

# Attach the per-text indices to every rating row. text_id is moved into the
# index of all_texts and matched by name through on='text_id'.
# how='outer' would also keep a text_id that appears on only one side;
# len(control) below shows the resulting number of rows.
control = control.merge(all_texts.set_index('text_id'), on = 'text_id', how = 'outer')
control.head()
len(control)

Unnamed: 0,type,text_id,rater_id,CEFR,TR,CC,LR,GRA,TR_fair,CC_fair,LR_fair,GRA_fair,overall,overall_fair,text_len,MLC,CNC,vocD,AG,bigram_range,mean_MI,mean_tscore,absent_prop,col_errors_per_100,correct_cols_per_100,K1to2_p,K3to9_p,K10to16_p
0,base,Text1,R1,B1,4.0,4.0,4.0,3.0,4.0,4.0,5.0,4.0,3.5,4.0,250,6.436,0.667,48.559,0.379,0.094,2.526,106.16,0.987,7.2,4.8,0.75,0.25,0.0
1,base,Text1,R19,B1,5.0,5.0,6.0,5.0,4.0,4.0,5.0,4.0,5.0,4.0,250,6.436,0.667,48.559,0.379,0.094,2.526,106.16,0.987,7.2,4.8,0.75,0.25,0.0
2,base,Text1,R11,B1,5.0,4.0,5.0,5.0,4.0,4.0,5.0,4.0,4.5,4.0,250,6.436,0.667,48.559,0.379,0.094,2.526,106.16,0.987,7.2,4.8,0.75,0.25,0.0
3,base,Text1,R40,B1,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,4.0,250,6.436,0.667,48.559,0.379,0.094,2.526,106.16,0.987,7.2,4.8,0.75,0.25,0.0
4,orig,IELTS_B1,IELTS,B1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,172,6.615,0.615,47.87,0.381,0.088,2.429,115.839,0.979,8.14,4.651,0.75,0.25,0.0


27

In [33]:
# Remove control data from ratings_long
# (only the normalized texts Text31-Text33 are dropped)

is_norm_text = ratings_long['text_id'].isin(['Text31','Text32','Text33'])
len(ratings_long)
ratings_long = ratings_long[~is_norm_text]
len(ratings_long)

140

128

In [34]:
# Replace all nan with NA (code for missing data in R)

# Same substitution for both output frames
ratings_long, control = (frame.replace(np.nan, 'NA') for frame in (ratings_long, control))

In [35]:
# Write out control and main ratings to csv for analysis in R
# index=False leaves the pandas row index out of the files. Missing values were
# already turned into the string 'NA' in the previous cell.

ratings_long.to_csv('../docs/ratings_R.csv',index=False)
control.to_csv('../docs/control_R.csv',index=False)

[Back to top](#Integrating-data-from-FACETS)