In [1]:
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix

In [2]:
import progressbar
import pandas as pd
import time 
import json
import numpy as np

In [3]:
from calculate_level_new import get_level_from_raw_text

In [4]:
def lower(text):
    """Return `text` lower-cased with its words separated by single spaces.

    The input is split on runs of whitespace, so leading and trailing
    whitespace disappears and every inner run collapses to one space.
    """
    return ' '.join(token.lower() for token in text.split())

# Load the graded texts. Each record's 'text' and 'name' fields are used.
# The encoding is given explicitly so that non-ASCII characters in the texts
# decode the same way on every platform instead of following the locale
# default. NOTE(review): assumes the file is UTF-8 (the JSON default) - confirm.
with open('./texts/texts_lleo.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# The file is only needed for parsing, so the columns are built after it is closed.
text = [record['text'] for record in data]
set_name = [record['name'] for record in data]

text_level = pd.DataFrame(
    {'text': text,
     'set_name': set_name
    })
# Normalise case and whitespace of the set names with `lower`.
text_level['set_name'] = text_level['set_name'].apply(lower)
text_level.head()

Unnamed: 0,text,set_name
0,"""Tom!"" No answer.\n""TOM!"" No answer.\nThe old ...",the adventures of tom sawyer part 1
1,The summer evenings were long. Tom walked alon...,the adventures of tom sawyer part 1
2,Tom appeared in front of the house with paint ...,the adventures of tom sawyer part 1
3,Tom began to think of the pleasure planned for...,the adventures of tom sawyer part 1
4,Tom gave the brush to Ben and he painted the f...,the adventures of tom sawyer part 1


In [5]:
# Read the set/level table (comma-separated, which is read_csv's default)
# and normalise the set names with `lower`.
set_level_df = pd.read_csv("./texts/set_level.csv")
set_level_df['set'] = set_level_df['set'].map(lower)
set_level_df.head()

Unnamed: 0,set,level
0,everyday life stories 1,Elementary
1,pocahontas,Elementary
2,robin hood,Elementary
3,the call of the wild,Elementary
4,around the world in eighty days,PreIntermediate


In [6]:
# Show the raw text of the first row.
text_level['text'].iloc[0]

'"Tom!" No answer.\n"TOM!" No answer.\nThe old lady looked around the room.\n"When I find you, I—" She did not finish. With her head down, she was looking under the bed. Only the cat came out.\nThere was a little noise behind her. She turned and caught a small boy, stopping him before he could escape.\n"What were you doing in that corner?" "Nothing." "Nothing! What is that on your hands and face?" "You have been eating sweets. I have told you a hundred times not to eat those sweets." Her hand was raised in the air—it started down—it was very near— "Oh! Look behind you, Aunt!" The old lady turned. The boy ran. In a moment he was up on the high board fence. Then he was on the far side of it. Aunt Polly was surprised. Then she laughed a little.'

In [7]:
# Level computed for the first text, as a quick check of the helper's output.
get_level_from_raw_text(text_level['text'].iloc[0])

'Elementary/Pre-intermediate'

In [8]:
# Lookup from set name to its level label, built from set_level_df.
name_dict = dict(zip(set_level_df['set'], set_level_df['level']))
# Integer code for each level label found in the set/level table.
level_interpret = {
    'Elementary': 0,
    'PreIntermediate': 1,
    'Intermediate': 2,
    'UpperIntermediate': 3,
    'Advanced': 4,
}

In [9]:
def assign_level(name):
    """Map a set name to its integer level code.

    `name` is looked up in `name_dict`; the level label found there is then
    translated through `level_interpret`. Returns None when `name` is not a
    key of `name_dict`.
    """
    if name not in name_dict:
        return None
    level_label = name_dict[name]
    return level_interpret[level_label]

# Attach the integer level of each text's set; sets that `assign_level`
# cannot resolve yield a missing value.
text_level['level'] = text_level['set_name'].apply(assign_level)
# Keep only complete rows. The explicit .copy() makes the result an
# independent frame, so adding columns to it later cannot raise
# SettingWithCopyWarning.
text_level_no_nan = text_level.dropna().copy()
text_level_no_nan.head()

Unnamed: 0,text,set_name,level
0,"""Tom!"" No answer.\n""TOM!"" No answer.\nThe old ...",the adventures of tom sawyer part 1,1.0
1,The summer evenings were long. Tom walked alon...,the adventures of tom sawyer part 1,1.0
2,Tom appeared in front of the house with paint ...,the adventures of tom sawyer part 1,1.0
3,Tom began to think of the pleasure planned for...,the adventures of tom sawyer part 1,1.0
4,Tom gave the brush to Ben and he painted the f...,the adventures of tom sawyer part 1,1.0


In [10]:
# Compute a level for every text, with ONE progress bar for the whole run.
# The bar is created and started once, before the loop: building it inside
# the loop started a fresh bar at 0% for every single text.
texts_to_score = text_level_no_nan['text']
text_levels = []
text_ind = 0
bar = progressbar.ProgressBar(maxval=len(texts_to_score),
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
# The loop variable is deliberately not called `text`, so the list of that
# name built while loading the JSON file is left intact.
# No time.sleep() per item: the 0.1 s pause only slowed the loop down.
# NOTE(review): restore it if get_level_from_raw_text needs throttling.
for raw_text in texts_to_score:
    text_levels.append(get_level_from_raw_text(raw_text))
    text_ind += 1
    bar.update(text_ind)
# Close the bar so the final state is rendered.
bar.finish()



In [None]:
# Integer code for each calculated-level label (for example the
# 'Elementary/Pre-intermediate' string returned by get_level_from_raw_text).
level_interpret_cefr = {'Beginner':0, 'Elementary/Pre-intermediate':1, 'Intermediate':2, 'Upper-Intermediate':3, 'Advanced':4}
def assign_level_cefr(level_name):
    """Return the integer code of a calculated level label.

    Parameters
    ----------
    level_name : hashable
        Label to look up in `level_interpret_cefr`.

    Returns
    -------
    int or None
        The code of the label, or None when the label is not a key of
        `level_interpret_cefr`. In that case a message naming the offending
        label is printed so the unmapped value can be tracked down.
    """
    level_code = level_interpret_cefr.get(level_name)
    if level_code is None:
        print("Unknown level label: {!r}".format(level_name))
    return level_code

In [None]:
# Add the calculated label and its integer code, and store the reference
# level as int64. The columns are added with .assign, which returns a new
# frame: setting them directly on the result of dropna() can raise
# SettingWithCopyWarning.
text_level_no_nan = text_level_no_nan.assign(
    cefr_calc_level=text_levels,
    cefr_level_int=lambda frame: frame['cefr_calc_level'].apply(assign_level_cefr),
    level=lambda frame: frame['level'].astype('int64'),
)

text_level_no_nan.head()

In [None]:
# Persist the frame, including the calculated columns, next to the notebook.
# No `index` argument is passed, so pandas also writes the row index as the
# first, unnamed column.
text_level_no_nan.to_csv("text_level_cal.csv")

In [None]:
# Rows whose calculated label is exactly "A1".
# NOTE(review): "A1" is not a key of level_interpret_cefr - confirm whether
# get_level_from_raw_text can still return it.
is_a1 = text_level_no_nan['cefr_calc_level'] == "A1"
text_level_no_nan[is_a1]

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, handed to `confusion_matrix`.
    classes : sequence
        Tick labels for both axes. Used as-is when it has one entry per
        label present in the data. Otherwise (some class never occurs in
        `y_true` or `y_pred`) each tick is labelled `classes[k]` for the
        present integer label `k`, or with the label value itself when the
        labels cannot index `classes`.
    normalize : bool, default False
        If True, every row is divided by its sum. Rows that sum to zero
        stay all-zero instead of turning into NaN.
    title : str, optional
        Figure title; a default that depends on `normalize` is used when
        this is empty.
    cmap : matplotlib colormap, default `plt.cm.Blues`
        Colormap passed to `imshow`.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # The matrix only has rows/columns for labels that occur in the data, so
    # the tick labels must be cut down to those labels when `classes` lists more.
    if len(classes) == cm.shape[0]:
        tick_labels = classes
    else:
        present = np.asarray(unique_labels(y_true, y_pred))
        class_names = np.asarray(classes)
        can_index = (present.size > 0
                     and np.issubdtype(present.dtype, np.integer)
                     and present.min() >= 0
                     and present.max() < class_names.shape[0])
        tick_labels = class_names[present] if can_index else present

    if normalize:
        # A label that only occurs in y_pred gives an all-zero row; skip the
        # division there instead of producing NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=tick_labels, yticklabels=tick_labels,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax
# Reference levels versus the levels calculated from the raw text.
y_test = text_level_no_nan['level']
y_pred = text_level_no_nan['cefr_level_int']
# Type check of the two inputs. It has to come after the assignments above:
# run first, it raises NameError on a fresh kernel.
print(type(y_test), type(y_pred))
classes = [0,1,2,3,4]
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=classes,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=classes, normalize=True,
                      title='Normalized confusion matrix')

plt.show()