## Importing Libraries

In [4]:
import numpy as np
import pandas as pd
import happytransformer
from happytransformer import HappyWordPrediction

## Loading the dataset using Pandas

In [5]:
responses_dataset = pd.read_excel("staub_data_original.xlsx")

## Creating DataFrame

In [6]:
df = pd.DataFrame(responses_dataset)

In [7]:
df

Unnamed: 0,ItemNo,X,ExperimentName,Subject,Session,DataFile.Basename,Block,List1,List1.Sample,Response.RESP,...,Max.Ampl.Mean,cloze,max.cloze,ClozeResp,response.freq,cloze.bin,count.responses,cloze.adj,max.cloze.adj,Sentence
0,1,4,new.cloze.1x,6,1,new.cloze.1x-6-1,28,1,18,6,...,0.211670,0.025,0.625,game,1,0,37,0.027027,0.675676,The ailing team forfeited the
1,1,6,new.cloze.1x,11,1,new.cloze.1x-11-1,129,1,119,6,...,0.060379,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
2,1,7,new.cloze.1x,12,1,new.cloze.1x-12-1,335,1,325,6,...,0.999970,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
3,1,8,new.cloze.1x,14,1,new.cloze.1x-14-1,166,1,156,6,...,0.040527,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
4,1,9,new.cloze.1x,16,1,new.cloze.1x-16-1,272,1,262,6,...,0.084763,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12040,338,13515,new.cloze.1x,19,1,new.cloze.1x-19-1,47,338,37,6,...,0.716260,0.050,0.225,help,2,0,40,0.050000,0.225000,The young student needed the
12041,338,13516,new.cloze.1x,29,1,new.cloze.1x-29-1,295,338,285,6,...,0.024124,0.050,0.225,help,2,0,40,0.050000,0.225000,The young student needed the
12042,338,13518,new.cloze.1x,36,1,new.cloze.1x-36-1,347,338,337,6,...,0.999970,0.050,0.225,help,2,0,40,0.050000,0.225000,The young student needed the
12043,338,13519,new.cloze.1x,25,1,new.cloze.1x-25-1,243,338,233,6,...,0.076462,0.025,0.225,help,1,0,40,0.025000,0.225000,The young student needed the


## Creating the Masked Questions Numpy Array

In [8]:
questions_vector = np.array(df["Sentence"])

In [9]:
masked_questions_vector = np.array([[i+"[MASK]."] for i in df["Sentence"]])

In [10]:
masked_questions_vector.shape

(12045, 1)

In [11]:
masked_questions_vector[0]

array(['The ailing team forfeited the [MASK].'], dtype='<U48')

In [12]:
s = masked_questions_vector[0]

## Using Pretrained BERT to predict last word of Sentence

In [13]:
happy_wp = HappyWordPrediction()

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

01/05/2022 20:01:10 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [14]:
s[0]

'The ailing team forfeited the [MASK].'

## Experimenting by returning top 10 words predicted

In [15]:
result = happy_wp.predict_mask(s[0],top_k = 10)

In [16]:
result

[WordPredictionResult(token='tournament', score=0.140961155295372),
 WordPredictionResult(token='championship', score=0.12079109251499176),
 WordPredictionResult(token='match', score=0.11045834422111511),
 WordPredictionResult(token='trophy', score=0.10268363356590271),
 WordPredictionResult(token='title', score=0.09397570788860321),
 WordPredictionResult(token='cup', score=0.08367752283811569),
 WordPredictionResult(token='game', score=0.0674358531832695),
 WordPredictionResult(token='premiership', score=0.025133822113275528),
 WordPredictionResult(token='shield', score=0.017442280426621437),
 WordPredictionResult(token='competition', score=0.014766214415431023)]

### Printing predicted token

In [17]:
result[0].token

'tournament'

### Printing Predited score

In [18]:
result[0].score

0.140961155295372

## Helper Function to get token for every masked sentence in the numpy array

In [19]:
def get_token(sentence):
    result = happy_wp.predict_mask(sentence)
    return result[0].token

## Running Loop over Numpy Array sentences to get last word predicted via BERT

In [20]:
predictions = []
pred_tokens = []
for i in range(df.shape[0]):
    token = get_token(masked_questions_vector[i][0])
    pred_tokens.append([token])
    predictions.append([questions_vector[i] + token])



## Converting List of predicted tokens and predicted sentences to Pandas DataFrame to store as a CSV file

In [21]:
predictions_df = pd.DataFrame(predictions, columns = ["Predicted Sentence"])

In [22]:
tokens_df = pd.DataFrame(pred_tokens, columns = ["Predicted Tokens"])

In [23]:
predictions_df.shape

(12045, 1)

In [24]:
tokens_df.shape

(12045, 1)

In [25]:
predictions_df.to_csv("predictions_df.csv",index = False)

In [26]:
tokens_df.to_csv("tokens_df.csv",index = False)

## Comparing performance of BERT to Human Responses by taking ratio of total correct predictions

In [27]:
people_responses = df["Resp"]

In [28]:
response_df = pd.DataFrame(people_responses)

In [29]:
pred_responses_df = pd.concat([response_df, tokens_df],axis = 1)

### People Responses DataFrame

In [30]:
pred_responses_df

Unnamed: 0,Resp,Predicted Tokens
0,fight,tournament
1,game,tournament
2,game,tournament
3,game,tournament
4,game,tournament
...,...,...
12040,paper,money
12041,paper,money
12042,pencil,money
12043,proctor,money


### Now we match every word from the predicted frame and people responses frame return True if matched and return False if not matched.

In [31]:
pred_responses_df['isin'] =pred_responses_df.apply(lambda row: row['Resp'] in row['Predicted Tokens'],axis=1)

In [32]:
pred_responses_df

Unnamed: 0,Resp,Predicted Tokens,isin
0,fight,tournament,False
1,game,tournament,False
2,game,tournament,False
3,game,tournament,False
4,game,tournament,False
...,...,...,...
12040,paper,money,False
12041,paper,money,False
12042,pencil,money,False
12043,proctor,money,False


### Next we take sum of True responses to get total correct predictions

In [33]:
pred_responses_df["isin"].sum()

1608

We got 1608 correct predictions out of 12045.

## Score Percent

In [34]:
score = (pred_responses_df["isin"].sum() / df.shape[0])*100
print(f"{score}%")

13.34993773349938%


## Using Pandas Frame Concatenation by to Analyze performance of BERT

In [35]:
pred_responses_df_qs = pd.concat([pred_responses_df,df["Sentence"]],axis = 1)

In [36]:
pred_responses_df_qs[60:120]

Unnamed: 0,Resp,Predicted Tokens,isin,Sentence
60,picture,pictures,True,The amateur photographer snapped some
61,picture,pictures,True,The amateur photographer snapped some
62,picture,pictures,True,The amateur photographer snapped some
63,picture,pictures,True,The amateur photographer snapped some
64,picture,pictures,True,The amateur photographer snapped some
65,picture,pictures,True,The amateur photographer snapped some
66,picture,pictures,True,The amateur photographer snapped some
67,picture,pictures,True,The amateur photographer snapped some
68,shot,pictures,False,The amateur photographer snapped some
69,shot,pictures,False,The amateur photographer snapped some


## Analysis Using GroupBy

In [37]:
iterator = pred_responses_df_qs.groupby(["Sentence"])

In [38]:
for name, group in iterator:
    print(name)
    print(group)
    print("\n")

The TV commercial promoted the 
                Resp Predicted Tokens   isin                         Sentence
11483             ad             song  False  The TV commercial promoted the 
11484             ad             song  False  The TV commercial promoted the 
11485             ad             song  False  The TV commercial promoted the 
11486  advertisement             song  False  The TV commercial promoted the 
11487  advertisement             song  False  The TV commercial promoted the 
11488            bad             song  False  The TV commercial promoted the 
11489           beer             song  False  The TV commercial promoted the 
11490           beer             song  False  The TV commercial promoted the 
11491         cereal             song  False  The TV commercial promoted the 
11492           chip             song  False  The TV commercial promoted the 
11493        cleaner             song  False  The TV commercial promoted the 
11494     discussion            

             Resp Predicted Tokens   isin                            Sentence
699       article        newspaper  False  The careful journalist edited the 
700       article        newspaper  False  The careful journalist edited the 
701       article        newspaper  False  The careful journalist edited the 
702       article        newspaper  False  The careful journalist edited the 
703       article        newspaper  False  The careful journalist edited the 
704          book        newspaper  False  The careful journalist edited the 
705          book        newspaper  False  The careful journalist edited the 
706        column        newspaper  False  The careful journalist edited the 
707          copy        newspaper  False  The careful journalist edited the 
708  news article        newspaper  False  The careful journalist edited the 
709     newspaper        newspaper   True  The careful journalist edited the 
710         paper        newspaper   True  The careful journalis

         Resp Predicted Tokens   isin                      Sentence
7488     book          fingers  False  The hard worker finshed his 
7489     book          fingers  False  The hard worker finshed his 
7490     case          fingers  False  The hard worker finshed his 
7491     exam          fingers  False  The hard worker finshed his 
7492      job          fingers  False  The hard worker finshed his 
7493      job          fingers  False  The hard worker finshed his 
7494      job          fingers  False  The hard worker finshed his 
7495      job          fingers  False  The hard worker finshed his 
7496      job          fingers  False  The hard worker finshed his 
7497      job          fingers  False  The hard worker finshed his 
7498      job          fingers  False  The hard worker finshed his 
7499      job          fingers  False  The hard worker finshed his 
7500      job          fingers  False  The hard worker finshed his 
7501      job          fingers  False  The hard 

The humble winner thanked his 
            Resp Predicted Tokens   isin                        Sentence
7707    audience            peers  False  The humble winner thanked his 
7708    audience            peers  False  The humble winner thanked his 
7709      career            peers  False  The humble winner thanked his 
7710       coach            peers  False  The humble winner thanked his 
7711       coach            peers  False  The humble winner thanked his 
7712  competitor            peers  False  The humble winner thanked his 
7713  competitor            peers  False  The humble winner thanked his 
7714      family            peers  False  The humble winner thanked his 
7715      family            peers  False  The humble winner thanked his 
7716      family            peers  False  The humble winner thanked his 
7717         fan            peers  False  The humble winner thanked his 
7718         fan            peers  False  The humble winner thanked his 
7719         fan    

9001          water             milk  False  The naughty cat spilt the 


The nerdy accountant broke his 
            Resp Predicted Tokens   isin                         Sentence
9002       ankle           stride  False  The nerdy accountant broke his 
9003       ankle           stride  False  The nerdy accountant broke his 
9004         arm           stride  False  The nerdy accountant broke his 
9005        back           stride  False  The nerdy accountant broke his 
9006        back           stride  False  The nerdy accountant broke his 
9007        bank           stride  False  The nerdy accountant broke his 
9008        bank           stride  False  The nerdy accountant broke his 
9009  calculator           stride  False  The nerdy accountant broke his 
9010  calculator           stride  False  The nerdy accountant broke his 
9011  calculator           stride  False  The nerdy accountant broke his 
9012  calculator           stride  False  The nerdy accountant broke his 
9013  

The savings account accrued some 
           Resp Predicted Tokens   isin                           Sentence
4066       debt          savings  False  The savings account accrued some 
4067       debt          savings  False  The savings account accrued some 
4068       debt          savings  False  The savings account accrued some 
4069       debt          savings  False  The savings account accrued some 
4070       debt          savings  False  The savings account accrued some 
4071       debt          savings  False  The savings account accrued some 
4072    failure          savings  False  The savings account accrued some 
4073  financial          savings  False  The savings account accrued some 
4074   interest          savings  False  The savings account accrued some 
4075   interest          savings  False  The savings account accrued some 
4076   interest          savings  False  The savings account accrued some 
4077   interest          savings  False  The savings account accru