## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import happytransformer
from happytransformer import HappyWordPrediction

## Loading the dataset using Pandas

In [2]:
responses_dataset = pd.read_excel("staub_data_original.xlsx")

## Creating DataFrame

In [3]:
df = pd.DataFrame(responses_dataset)

In [4]:
df

Unnamed: 0,ItemNo,X,ExperimentName,Subject,Session,DataFile.Basename,Block,List1,List1.Sample,Response.RESP,...,Max.Ampl.Mean,cloze,max.cloze,ClozeResp,response.freq,cloze.bin,count.responses,cloze.adj,max.cloze.adj,Sentence
0,1,4,new.cloze.1x,6,1,new.cloze.1x-6-1,28,1,18,6,...,0.211670,0.025,0.625,game,1,0,37,0.027027,0.675676,The ailing team forfeited the
1,1,6,new.cloze.1x,11,1,new.cloze.1x-11-1,129,1,119,6,...,0.060379,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
2,1,7,new.cloze.1x,12,1,new.cloze.1x-12-1,335,1,325,6,...,0.999970,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
3,1,8,new.cloze.1x,14,1,new.cloze.1x-14-1,166,1,156,6,...,0.040527,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
4,1,9,new.cloze.1x,16,1,new.cloze.1x-16-1,272,1,262,6,...,0.084763,0.625,0.625,game,25,0,37,0.675676,0.675676,The ailing team forfeited the
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12040,338,13515,new.cloze.1x,19,1,new.cloze.1x-19-1,47,338,37,6,...,0.716260,0.050,0.225,help,2,0,40,0.050000,0.225000,The young student needed the
12041,338,13516,new.cloze.1x,29,1,new.cloze.1x-29-1,295,338,285,6,...,0.024124,0.050,0.225,help,2,0,40,0.050000,0.225000,The young student needed the
12042,338,13518,new.cloze.1x,36,1,new.cloze.1x-36-1,347,338,337,6,...,0.999970,0.050,0.225,help,2,0,40,0.050000,0.225000,The young student needed the
12043,338,13519,new.cloze.1x,25,1,new.cloze.1x-25-1,243,338,233,6,...,0.076462,0.025,0.225,help,1,0,40,0.025000,0.225000,The young student needed the


## Creating the Masked Questions Numpy Array

In [5]:
questions_vector = np.array(df["Sentence"])

In [6]:
masked_questions_vector = np.array([[i+"[MASK]"] for i in df["Sentence"]])

In [7]:
masked_questions_vector.shape

(12045, 1)

In [8]:
masked_questions_vector[0]

array(['The ailing team forfeited the [MASK]'], dtype='<U47')

In [9]:
s = masked_questions_vector[0]

## Using Pretrained BERT to predict last word of Sentence

In [10]:
happy_wp = HappyWordPrediction()

10/25/2021 23:47:15 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [11]:
s[0]

'The ailing team forfeited the [MASK]'

## Experimenting by returning top 10 words predicted

In [12]:
result = happy_wp.predict_mask(s[0],top_k = 10)

In [13]:
result

[WordPredictionResult(token='title', score=0.2217867225408554),
 WordPredictionResult(token='championship', score=0.13578149676322937),
 WordPredictionResult(token='trophy', score=0.11505868285894394),
 WordPredictionResult(token='tournament', score=0.09288104623556137),
 WordPredictionResult(token='match', score=0.06332535296678543),
 WordPredictionResult(token='cup', score=0.038533806800842285),
 WordPredictionResult(token='.', score=0.03294019773602486),
 WordPredictionResult(token='game', score=0.030385807156562805),
 WordPredictionResult(token='playoff', score=0.019028015434741974),
 WordPredictionResult(token='final', score=0.017535381019115448)]

### Printing predicted token

In [14]:
result[0].token

'title'

### Printing Predited score

In [15]:
result[0].score

0.2217867225408554

## Helper Function to get token for every masked sentence in the numpy array

In [16]:
def get_token(sentence):
    result = happy_wp.predict_mask(sentence)
    return result[0].token

## Running Loop over Numpy Array sentences to get last word predicted via BERT

In [17]:
predictions = []
pred_tokens = []
for i in range(df.shape[0]):
    token = get_token(masked_questions_vector[i][0])
    pred_tokens.append([token])
    predictions.append([questions_vector[i] + token])



## Converting List of predicted tokens and predicted sentences to Pandas DataFrame to store as a CSV file

In [18]:
predictions_df = pd.DataFrame(predictions, columns = ["Predicted Sentence"])

In [19]:
tokens_df = pd.DataFrame(pred_tokens, columns = ["Predicted Tokens"])

In [20]:
predictions_df.shape

(12045, 1)

In [21]:
tokens_df.shape

(12045, 1)

In [22]:
predictions_df.to_csv("predictions_df.csv",index = False)

In [23]:
tokens_df.to_csv("tokens_df.csv",index = False)

## Comparing performance of BERT to Human Responses by taking ratio of total correct predictions

In [28]:
people_responses = df["Resp"]

In [35]:
response_df = pd.DataFrame(people_responses)

In [43]:
pred_responses_df = pd.concat([response_df, tokens_df],axis = 1)

### People Responses DataFrame

In [44]:
pred_responses_df

Unnamed: 0,Resp,Predicted Tokens
0,fight,title
1,game,title
2,game,title
3,game,title
4,game,title
...,...,...
12040,paper,.
12041,paper,.
12042,pencil,.
12043,proctor,.


### Now we match every word from the predicted frame and people responses frame return True if matched and return False if not matched.

In [45]:
pred_responses_df['isin'] =pred_responses_df.apply(lambda row: row['Resp'] in row['Predicted Tokens'],axis=1)

In [46]:
pred_responses_df

Unnamed: 0,Resp,Predicted Tokens,isin
0,fight,title,False
1,game,title,False
2,game,title,False
3,game,title,False
4,game,title,False
...,...,...,...
12040,paper,.,False
12041,paper,.,False
12042,pencil,.,False
12043,proctor,.,False


### Next we take sum of True responses to get total correct predictions

In [47]:
pred_responses_df["isin"].sum()

1130

We got 1130 correct predictions out of 12045.

## Score Percent

In [50]:
score = (pred_responses_df["isin"].sum() / df.shape[0])*100
print(f"{score}%")

9.38148609381486%


## Using Pandas Frame Concatenation by to Analyze performance of BERT

In [52]:
pred_responses_df_qs = pd.concat([pred_responses_df,df["Sentence"]],axis = 1)

In [101]:
pred_responses_df_qs[60:120]

Unnamed: 0,Resp,Predicted Tokens,isin,Sentence
60,picture,.,False,The amateur photographer snapped some
61,picture,.,False,The amateur photographer snapped some
62,picture,.,False,The amateur photographer snapped some
63,picture,.,False,The amateur photographer snapped some
64,picture,.,False,The amateur photographer snapped some
65,picture,.,False,The amateur photographer snapped some
66,picture,.,False,The amateur photographer snapped some
67,picture,.,False,The amateur photographer snapped some
68,shot,.,False,The amateur photographer snapped some
69,shot,.,False,The amateur photographer snapped some


## Analysis Using GroupBy

In [121]:
iterator = pred_responses_df_qs.groupby(["Sentence"])

In [123]:
for name, group in iterator:
    print(name)
    print(group)
    print("\n")

The TV commercial promoted the 
                Resp Predicted Tokens   isin                         Sentence
11483             ad             song  False  The TV commercial promoted the 
11484             ad             song  False  The TV commercial promoted the 
11485             ad             song  False  The TV commercial promoted the 
11486  advertisement             song  False  The TV commercial promoted the 
11487  advertisement             song  False  The TV commercial promoted the 
11488            bad             song  False  The TV commercial promoted the 
11489           beer             song  False  The TV commercial promoted the 
11490           beer             song  False  The TV commercial promoted the 
11491         cereal             song  False  The TV commercial promoted the 
11492           chip             song  False  The TV commercial promoted the 
11493        cleaner             song  False  The TV commercial promoted the 
11494     discussion            

       Resp Predicted Tokens   isin                   Sentence
1352   bone         treasure  False  The dirty dog buried the 
1353   bone         treasure  False  The dirty dog buried the 
1354   bone         treasure  False  The dirty dog buried the 
1355   bone         treasure  False  The dirty dog buried the 
1356   bone         treasure  False  The dirty dog buried the 
1357   bone         treasure  False  The dirty dog buried the 
1358   bone         treasure  False  The dirty dog buried the 
1359   bone         treasure  False  The dirty dog buried the 
1360   bone         treasure  False  The dirty dog buried the 
1361   bone         treasure  False  The dirty dog buried the 
1362   bone         treasure  False  The dirty dog buried the 
1363   bone         treasure  False  The dirty dog buried the 
1364   bone         treasure  False  The dirty dog buried the 
1365   bone         treasure  False  The dirty dog buried the 
1366   bone         treasure  False  The dirty dog buri

           Resp Predicted Tokens   isin                           Sentence
2566       bike             race  False  The irresponsible driver ran the 
2567        car             race  False  The irresponsible driver ran the 
2568      light             race  False  The irresponsible driver ran the 
2569      light             race  False  The irresponsible driver ran the 
2570      light             race  False  The irresponsible driver ran the 
2571      light             race  False  The irresponsible driver ran the 
2572      light             race  False  The irresponsible driver ran the 
2573      light             race  False  The irresponsible driver ran the 
2574      light             race  False  The irresponsible driver ran the 
2575      light             race  False  The irresponsible driver ran the 
2576      light             race  False  The irresponsible driver ran the 
2577      light             race  False  The irresponsible driver ran the 
2578      light          

9702  The poignant story described the   


The polite houseguests brought the 
                   Resp Predicted Tokens   isin  \
9703          appetizer         applause  False   
9704               cake         applause  False   
9705               cake         applause  False   
9706               cake         applause  False   
9707          casserole         applause  False   
9708            dessert         applause  False   
9709            dessert         applause  False   
9710            dessert         applause  False   
9711            dessert         applause  False   
9712             dinner         applause  False   
9713              drink         applause  False   
9714             flower         applause  False   
9715               food         applause  False   
9716               gift         applause  False   
9717  housewarming gift         applause  False   
9718               meal         applause  False   
9719              pizza         applause  False   
97

              Resp Predicted Tokens   isin                        Sentence
10853         ball                .  False  The speedy cyclist passed the 
10854        biker                .  False  The speedy cyclist passed the 
10855          car                .  False  The speedy cyclist passed the 
10856          car                .  False  The speedy cyclist passed the 
10857          car                .  False  The speedy cyclist passed the 
10858          car                .  False  The speedy cyclist passed the 
10859          car                .  False  The speedy cyclist passed the 
10860          car                .  False  The speedy cyclist passed the 
10861          car                .  False  The speedy cyclist passed the 
10862          car                .  False  The speedy cyclist passed the 
10863          car                .  False  The speedy cyclist passed the 
10864          car                .  False  The speedy cyclist passed the 
10865          car       