In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import torch
import spacy
import json
from sklearn.naive_bayes import MultinomialNB


nlp = spacy.load('en_core_web_sm')

In [35]:
with open('train_data.json') as f:
    d = json.load(f)

df1 =pd.json_normalize(d, record_path = ['data'])
df2 = pd.json_normalize(d, record_path = ['data','paragraphs'])
df3 = pd.json_normalize(d, record_path = ['data','paragraphs','qas'])
df4 = pd.json_normalize(d, record_path = ['data','paragraphs','qas','answers'])

In [36]:
df1.head()

Unnamed: 0,title,paragraphs
0,Beyoncé,[{'qas': [{'question': 'When did Beyonce start...
1,Frédéric_Chopin,"[{'qas': [{'question': ""What was Frédéric's na..."
2,Sino-Tibetan_relations_during_the_Ming_dynasty,[{'qas': [{'question': 'Who were Wang Jiawei a...
3,IPod,[{'qas': [{'question': 'Which company produces...
4,The_Legend_of_Zelda:_Twilight_Princess,[{'qas': [{'question': 'What category of game ...


In [37]:
df2.head()

Unnamed: 0,qas,context
0,[{'question': 'When did Beyonce start becoming...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,"[{'question': 'After her second solo album, wh...",Following the disbandment of Destiny's Child i...
2,"[{'question': 'In her music, what are some rec...","A self-described ""modern-day feminist"", Beyonc..."
3,[{'question': 'Beyonce's younger sibling also ...,"Beyoncé Giselle Knowles was born in Houston, T..."
4,[{'question': 'What town did Beyonce go to sch...,Beyoncé attended St. Mary's Elementary School ...


In [38]:
df3.head()

Unnamed: 0,question,id,answers,is_impossible,plausible_answers
0,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,"[{'text': 'in the late 1990s', 'answer_start':...",False,
1,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,"[{'text': 'singing and dancing', 'answer_start...",False,
2,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,"[{'text': '2003', 'answer_start': 526}]",False,
3,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,"[{'text': 'Houston, Texas', 'answer_start': 166}]",False,
4,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,"[{'text': 'late 1990s', 'answer_start': 276}]",False,


In [39]:
df3

Unnamed: 0,question,id,answers,is_impossible,plausible_answers
0,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,"[{'text': 'in the late 1990s', 'answer_start':...",False,
1,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,"[{'text': 'singing and dancing', 'answer_start...",False,
2,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,"[{'text': '2003', 'answer_start': 526}]",False,
3,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,"[{'text': 'Houston, Texas', 'answer_start': 166}]",False,
4,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,"[{'text': 'late 1990s', 'answer_start': 276}]",False,
...,...,...,...,...,...
130314,Physics has broadly agreed on the definition o...,5a7e070b70df9f001a875439,[],True,"[{'text': 'matter', 'answer_start': 485}]"
130315,Who coined the term partonic matter?,5a7e070b70df9f001a87543a,[],True,"[{'text': 'Alfvén', 'answer_start': 327}]"
130316,What is another name for anti-matter?,5a7e070b70df9f001a87543b,[],True,"[{'text': 'Gk. common matter', 'answer_start':..."
130317,Matter usually does not need to be used in con...,5a7e070b70df9f001a87543c,[],True,"[{'text': 'a specifying modifier', 'answer_sta..."


In [40]:
df4.head()

Unnamed: 0,text,answer_start
0,in the late 1990s,269
1,singing and dancing,207
2,2003,526
3,"Houston, Texas",166
4,late 1990s,276


In [41]:
df = pd.concat([df1,df2,df3,df4], axis = 1)

In [42]:
df

Unnamed: 0,title,paragraphs,qas,context,question,id,answers,is_impossible,plausible_answers,text,answer_start
0,Beyoncé,[{'qas': [{'question': 'When did Beyonce start...,[{'question': 'When did Beyonce start becoming...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,"[{'text': 'in the late 1990s', 'answer_start':...",False,,in the late 1990s,269.0
1,Frédéric_Chopin,"[{'qas': [{'question': ""What was Frédéric's na...","[{'question': 'After her second solo album, wh...",Following the disbandment of Destiny's Child i...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,"[{'text': 'singing and dancing', 'answer_start...",False,,singing and dancing,207.0
2,Sino-Tibetan_relations_during_the_Ming_dynasty,[{'qas': [{'question': 'Who were Wang Jiawei a...,"[{'question': 'In her music, what are some rec...","A self-described ""modern-day feminist"", Beyonc...",When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,"[{'text': '2003', 'answer_start': 526}]",False,,2003,526.0
3,IPod,[{'qas': [{'question': 'Which company produces...,[{'question': 'Beyonce's younger sibling also ...,"Beyoncé Giselle Knowles was born in Houston, T...",In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,"[{'text': 'Houston, Texas', 'answer_start': 166}]",False,,"Houston, Texas",166.0
4,The_Legend_of_Zelda:_Twilight_Princess,[{'qas': [{'question': 'What category of game ...,[{'question': 'What town did Beyonce go to sch...,Beyoncé attended St. Mary's Elementary School ...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,"[{'text': 'late 1990s', 'answer_start': 276}]",False,,late 1990s,276.0
...,...,...,...,...,...,...,...,...,...,...,...
130314,,,,,Physics has broadly agreed on the definition o...,5a7e070b70df9f001a875439,[],True,"[{'text': 'matter', 'answer_start': 485}]",,
130315,,,,,Who coined the term partonic matter?,5a7e070b70df9f001a87543a,[],True,"[{'text': 'Alfvén', 'answer_start': 327}]",,
130316,,,,,What is another name for anti-matter?,5a7e070b70df9f001a87543b,[],True,"[{'text': 'Gk. common matter', 'answer_start':...",,
130317,,,,,Matter usually does not need to be used in con...,5a7e070b70df9f001a87543c,[],True,"[{'text': 'a specifying modifier', 'answer_sta...",,


In [43]:
df_true = df[df['is_impossible'] == True]
df_true.drop(columns = ['text','answer_start','id','is_impossible','title'],inplace = True)
def replace_nan_with_empty_list(x):
    if pd.isna(x):
        return ''
    return x

df_true['context'] = df_true['context'].apply(replace_nan_with_empty_list)
df_true['context_new'] = df_true['context'] + df_true['question']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_true.drop(columns = ['text','answer_start','id','is_impossible','title'],inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_true['context'] = df_true['context'].apply(replace_nan_with_empty_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_true['context_new'] = df_true['context'] + df_true['question']


In [44]:
df_true

Unnamed: 0,paragraphs,qas,context,question,answers,plausible_answers,context_new
2075,,[{'question': 'More people belong to what reli...,Christianity remains the dominant religion in ...,What category of game is Legend of Zelda: Aust...,[],"[{'text': 'action-adventure', 'answer_start': ...",Christianity remains the dominant religion in ...
2076,,[{'question': 'Which religion will have the mo...,According to 2012 Pew Research Center survey i...,What consoles can be used to play Australia Tw...,[],"[{'text': 'GameCube and Wii', 'answer_start': ...",According to 2012 Pew Research Center survey i...
2077,,[{'question': 'What denomination of Christiani...,According to Scientific Elite: Nobel Laureates...,When was Australia Twilight launched in North ...,[],"[{'text': 'November 2006', 'answer_start': 569}]",According to Scientific Elite: Nobel Laureates...
2078,,"[{'plausible_answers': [{'text': 'sciences, ar...",Christians have made a myriad contributions in...,When could GameCube owners purchase Australian...,[],"[{'text': 'November 2006', 'answer_start': 569}]",Christians have made a myriad contributions in...
2079,,[{'plausible_answers': [{'text': 'made a myria...,Christians have made a myriad contributions in...,What year was the Legend of Zelda: Australian ...,[],"[{'text': '2005', 'answer_start': 364}]",Christians have made a myriad contributions in...
...,...,...,...,...,...,...,...
130314,,,,Physics has broadly agreed on the definition o...,[],"[{'text': 'matter', 'answer_start': 485}]",Physics has broadly agreed on the definition o...
130315,,,,Who coined the term partonic matter?,[],"[{'text': 'Alfvén', 'answer_start': 327}]",Who coined the term partonic matter?
130316,,,,What is another name for anti-matter?,[],"[{'text': 'Gk. common matter', 'answer_start':...",What is another name for anti-matter?
130317,,,,Matter usually does not need to be used in con...,[],"[{'text': 'a specifying modifier', 'answer_sta...",Matter usually does not need to be used in con...


In [45]:
df5 = pd.json_normalize(df['plausible_answers'].explode())

In [46]:
df_true_res = pd.concat([df_true,df5],axis = 1)

In [47]:
df_true_res.drop(columns = ['context','question','plausible_answers','answer_start'],inplace = True)

In [48]:
df_t = df_true_res.dropna()

In [49]:
df_t

Unnamed: 0,paragraphs,qas,answers,context_new,text


In [50]:
df_false = df[df['is_impossible'] == False]

In [51]:
df_false.drop(columns = ['is_impossible','plausible_answers','id','answer_start'],inplace = True)
def replace_nan_with_empty_list(x):
    if pd.isna(x):
        return ''
    return x

df_false['title'] = df_false['title'].apply(replace_nan_with_empty_list)
df_false['context'] = df_false['context'].apply(replace_nan_with_empty_list)
df_false['context_new'] = df_false['title'] + df_false['question'] + df_false['context']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_false.drop(columns = ['is_impossible','plausible_answers','id','answer_start'],inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_false['title'] = df_false['title'].apply(replace_nan_with_empty_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_false['context'] = df_false['context'].apply(replace_nan_with_empty_list)
A value is try

In [52]:
df_false.drop(columns = ['title','context','question'],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_false.drop(columns = ['title','context','question'],inplace = True)


In [53]:
df_f = df_false.dropna()

In [54]:
df_f

Unnamed: 0,paragraphs,qas,answers,text,context_new
0,[{'qas': [{'question': 'When did Beyonce start...,[{'question': 'When did Beyonce start becoming...,"[{'text': 'in the late 1990s', 'answer_start':...",in the late 1990s,BeyoncéWhen did Beyonce start becoming popular...
1,"[{'qas': [{'question': ""What was Frédéric's na...","[{'question': 'After her second solo album, wh...","[{'text': 'singing and dancing', 'answer_start...",singing and dancing,Frédéric_ChopinWhat areas did Beyonce compete ...
2,[{'qas': [{'question': 'Who were Wang Jiawei a...,"[{'question': 'In her music, what are some rec...","[{'text': '2003', 'answer_start': 526}]",2003,Sino-Tibetan_relations_during_the_Ming_dynasty...
3,[{'qas': [{'question': 'Which company produces...,[{'question': 'Beyonce's younger sibling also ...,"[{'text': 'Houston, Texas', 'answer_start': 166}]","Houston, Texas",IPodIn what city and state did Beyonce grow u...
4,[{'qas': [{'question': 'What category of game ...,[{'question': 'What town did Beyonce go to sch...,"[{'text': 'late 1990s', 'answer_start': 276}]",late 1990s,The_Legend_of_Zelda:_Twilight_PrincessIn which...
...,...,...,...,...,...
437,[{'qas': [{'question': 'Of the huge amount of ...,[{'question': 'What was the name of the Lenape...,"[{'text': 'co-producing credits', 'answer_star...",co-producing credits,InfectionWhat does she get credits for in her ...
438,[{'qas': [{'question': 'What is the practice o...,[{'question': 'In what year did the first Euro...,"[{'text': 'melodies', 'answer_start': 564}]",melodies,HuntingWhat part of production does she do?The...
439,[{'qas': [{'question': 'What country is Kathma...,[{'question': 'In what year did a Spanish expe...,"[{'text': 'Women', 'answer_start': 210}]",Women,KathmanduBeyoncé's early recordings empowered ...
440,[{'qas': [{'plausible_answers': [{'text': 'Myo...,[{'question': 'What was the name of the explor...,"[{'text': 'co-producing', 'answer_start': 376}]",co-producing,Myocardial_infarctionIn addition to co-writing...


In [55]:
df.drop(columns = ['paragraphs','qas','answers'],inplace = True)

In [56]:
df.head()

Unnamed: 0,title,context,question,id,is_impossible,plausible_answers,text,answer_start
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,False,,in the late 1990s,269.0
1,Frédéric_Chopin,Following the disbandment of Destiny's Child i...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,False,,singing and dancing,207.0
2,Sino-Tibetan_relations_during_the_Ming_dynasty,"A self-described ""modern-day feminist"", Beyonc...",When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,False,,2003,526.0
3,IPod,"Beyoncé Giselle Knowles was born in Houston, T...",In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,False,,"Houston, Texas",166.0
4,The_Legend_of_Zelda:_Twilight_Princess,Beyoncé attended St. Mary's Elementary School ...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,False,,late 1990s,276.0


In [57]:
df

Unnamed: 0,title,context,question,id,is_impossible,plausible_answers,text,answer_start
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,False,,in the late 1990s,269.0
1,Frédéric_Chopin,Following the disbandment of Destiny's Child i...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,False,,singing and dancing,207.0
2,Sino-Tibetan_relations_during_the_Ming_dynasty,"A self-described ""modern-day feminist"", Beyonc...",When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,False,,2003,526.0
3,IPod,"Beyoncé Giselle Knowles was born in Houston, T...",In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,False,,"Houston, Texas",166.0
4,The_Legend_of_Zelda:_Twilight_Princess,Beyoncé attended St. Mary's Elementary School ...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,False,,late 1990s,276.0
...,...,...,...,...,...,...,...,...
130314,,,Physics has broadly agreed on the definition o...,5a7e070b70df9f001a875439,True,"[{'text': 'matter', 'answer_start': 485}]",,
130315,,,Who coined the term partonic matter?,5a7e070b70df9f001a87543a,True,"[{'text': 'Alfvén', 'answer_start': 327}]",,
130316,,,What is another name for anti-matter?,5a7e070b70df9f001a87543b,True,"[{'text': 'Gk. common matter', 'answer_start':...",,
130317,,,Matter usually does not need to be used in con...,5a7e070b70df9f001a87543c,True,"[{'text': 'a specifying modifier', 'answer_sta...",,


In [58]:
df_total = pd.concat([df_t,df_f],axis = 0)

In [59]:
df_total

Unnamed: 0,paragraphs,qas,answers,context_new,text
0,[{'qas': [{'question': 'When did Beyonce start...,[{'question': 'When did Beyonce start becoming...,"[{'text': 'in the late 1990s', 'answer_start':...",BeyoncéWhen did Beyonce start becoming popular...,in the late 1990s
1,"[{'qas': [{'question': ""What was Frédéric's na...","[{'question': 'After her second solo album, wh...","[{'text': 'singing and dancing', 'answer_start...",Frédéric_ChopinWhat areas did Beyonce compete ...,singing and dancing
2,[{'qas': [{'question': 'Who were Wang Jiawei a...,"[{'question': 'In her music, what are some rec...","[{'text': '2003', 'answer_start': 526}]",Sino-Tibetan_relations_during_the_Ming_dynasty...,2003
3,[{'qas': [{'question': 'Which company produces...,[{'question': 'Beyonce's younger sibling also ...,"[{'text': 'Houston, Texas', 'answer_start': 166}]",IPodIn what city and state did Beyonce grow u...,"Houston, Texas"
4,[{'qas': [{'question': 'What category of game ...,[{'question': 'What town did Beyonce go to sch...,"[{'text': 'late 1990s', 'answer_start': 276}]",The_Legend_of_Zelda:_Twilight_PrincessIn which...,late 1990s
...,...,...,...,...,...
437,[{'qas': [{'question': 'Of the huge amount of ...,[{'question': 'What was the name of the Lenape...,"[{'text': 'co-producing credits', 'answer_star...",InfectionWhat does she get credits for in her ...,co-producing credits
438,[{'qas': [{'question': 'What is the practice o...,[{'question': 'In what year did the first Euro...,"[{'text': 'melodies', 'answer_start': 564}]",HuntingWhat part of production does she do?The...,melodies
439,[{'qas': [{'question': 'What country is Kathma...,[{'question': 'In what year did a Spanish expe...,"[{'text': 'Women', 'answer_start': 210}]",KathmanduBeyoncé's early recordings empowered ...,Women
440,[{'qas': [{'plausible_answers': [{'text': 'Myo...,[{'question': 'What was the name of the explor...,"[{'text': 'co-producing', 'answer_start': 376}]",Myocardial_infarctionIn addition to co-writing...,co-producing


In [60]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if not token.is_punct and not token.is_stop:
            filtered_tokens.append(token.lemma_)
    return ' '.join(filtered_tokens)

In [61]:
df_total['context_new1'] = df_total['context_new'].apply(preprocess)

In [62]:
df_total.head()

Unnamed: 0,paragraphs,qas,answers,context_new,text,context_new1
0,[{'qas': [{'question': 'When did Beyonce start...,[{'question': 'When did Beyonce start becoming...,"[{'text': 'in the late 1990s', 'answer_start':...",BeyoncéWhen did Beyonce start becoming popular...,in the late 1990s,BeyoncéWhen Beyonce start popular?beyoncé Gise...
1,"[{'qas': [{'question': ""What was Frédéric's na...","[{'question': 'After her second solo album, wh...","[{'text': 'singing and dancing', 'answer_start...",Frédéric_ChopinWhat areas did Beyonce compete ...,singing and dancing,frédéric_chopinwhat area Beyonce compete grow ...
2,[{'qas': [{'question': 'Who were Wang Jiawei a...,"[{'question': 'In her music, what are some rec...","[{'text': '2003', 'answer_start': 526}]",Sino-Tibetan_relations_during_the_Ming_dynasty...,2003,sino tibetan_relations_during_the_ming_dynasty...
3,[{'qas': [{'question': 'Which company produces...,[{'question': 'Beyonce's younger sibling also ...,"[{'text': 'Houston, Texas', 'answer_start': 166}]",IPodIn what city and state did Beyonce grow u...,"Houston, Texas",ipodin city state Beyonce grow Beyoncé Gisel...
4,[{'qas': [{'question': 'What category of game ...,[{'question': 'What town did Beyonce go to sch...,"[{'text': 'late 1990s', 'answer_start': 276}]",The_Legend_of_Zelda:_Twilight_PrincessIn which...,late 1990s,The_Legend_of_Zelda:_Twilight_PrincessIn decad...


In [63]:
X_train,X_test,y_train,y_test = train_test_split(
    df_total['context_new1'],
    df_total['text'],
    random_state = 1,
    test_size = 0.2
)

In [71]:
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2))), 
    ('classifier', LogisticRegression())  
])

In [72]:
clf.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('classifier', LogisticRegression())])

In [73]:
y_pred = clf.predict(X_test)

In [74]:
print(classification_report(y_test,y_pred))

                                    precision    recall  f1-score   support

                '03 Bonnie & Clyde       0.00      0.00      0.00         1
                               132       0.00      0.00      0.00         1
                     158.8 million       0.00      0.00      0.00         1
                              1995       0.00      0.00      0.00         1
                              2010       0.00      0.00      0.00         1
                              2015       0.00      0.00      0.00         1
                        24 million       0.00      0.00      0.00         1
                                 4       0.00      0.00      0.00         1
                    663,000 copies       0.00      0.00      0.00         1
                        April 2013       0.00      0.00      0.00         1
                            B.I.C.       0.00      0.00      0.00         1
                     Back to Black       0.00      0.00      0.00         1
           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
