# SeMantic AnsweR Type prediction task

In [57]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score

In [9]:
# Load the train and test splits from JSON files (paths are relative to the working directory)
train_data = pd.read_json("./datasets/DBpedia/smarttask_dbpedia_train.json")
test_data = pd.read_json("./datasets/DBpedia/smarttask_dbpedia_test.json")

In [10]:
# Preview the first rows of the training data
train_data.head()

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."


In [11]:
# Preview the first rows of the test data
test_data.head()

Unnamed: 0,id,question,category,type
0,dbpedia_16015,How many ingredients are in the grain} ?,literal,[number]
1,dbpedia_3885,Is the case fatality rate of Fournier gangrene...,boolean,[boolean]
2,dbpedia_12907,Does the shelf life of spinach equal 8?,boolean,[boolean]
3,dbpedia_7955,What sound does a pig make in the French langu...,literal,[string]
4,dbpedia_2376,When was Fergie completed his record label in ...,literal,[date]


## Preprocessing

In [12]:
# Count how many training questions fall into each category
train_data.category.value_counts()

resource    9584
literal     5188
boolean     2799
Name: category, dtype: int64

### Remove null values

In [13]:
# Drop every training row that contains a null value, then confirm that no nulls remain
train_data = train_data.dropna()
train_data.isnull().sum()

id          0
question    0
category    0
type        0
dtype: int64

In [14]:
# Count null values per column in the test data
test_data.isnull().sum()

id          0
question    0
category    0
type        0
dtype: int64

## Category Prediction

In [60]:
def ExtractCategoryData(df, test=False):
    """Select the columns used for category prediction.

    Parameters
    ----------
    df : DataFrame
        Frame holding at least the ``id`` and ``question`` columns, plus
        ``category`` when ``test`` is false.
    test : bool, default False
        When true, the ``category`` column is left out of the selection.

    Returns
    -------
    DataFrame
        ``df`` restricted to ``id``, ``question`` and, unless ``test`` is
        true, ``category``.
    """
    wanted_columns = ["id", "question"]
    if not test:
        wanted_columns.append("category")
    return df[wanted_columns]

In [61]:
# Keep only the columns needed for category prediction and preview the result
train_category_data = ExtractCategoryData(train_data)
train_category_data.head()

Unnamed: 0,id,question,category
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource


In [32]:
# Fit the TF-IDF vectoriser on the training questions and vectorise them in one pass;
# the category column is the prediction target.
feature_extraction = TfidfVectorizer()
X_train = feature_extraction.fit_transform(train_category_data["question"])
Y_train = train_category_data["category"]

In [35]:
# Train an RBF-kernel SVM on the TF-IDF features to predict the question category.
# NOTE(review): probability=True is requested, but only clf.predict is called in the
# cells shown here — confirm the probability estimates are actually needed.
clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, Y_train)

SVC(probability=True)

### Evaluation

In [63]:
# Vectorise the test questions with the vectoriser fitted on the training questions, then predict
X_test = feature_extraction.transform(test_data.question)
predictions = clf.predict(X_test)

In [64]:
# Ground-truth categories of the test questions
Y_test = test_data.category

In [65]:
# Inspect the predicted categories
predictions

array(['literal', 'boolean', 'boolean', ..., 'resource', 'resource',
       'literal'], dtype=object)

In [66]:
# accuracy_score takes (y_true, y_pred); the returned fraction is scaled to a percentage
print("SVM Accuracy Score -> ", accuracy_score(Y_test, predictions)*100)

SVM Accuracy Score ->  93.35768089477288


## Type Prediction

In [15]:
# Preview two training rows, including the list-valued type column
train_data.head(2)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"


In [16]:
# Preview two test rows, including the list-valued type column
test_data.head(2)

Unnamed: 0,id,question,category,type
0,dbpedia_16015,How many ingredients are in the grain} ?,literal,[number]
1,dbpedia_3885,Is the case fatality rate of Fournier gangrene...,boolean,[boolean]


Build a dataframe of the answer types and their frequencies in the training data.

In [30]:
# Count how often each answer type occurs across all training questions.
# explode() yields one row per list element, so no wide intermediate frame
# (one column per list position) has to be built before counting.
answer_type_frequencies = (
    train_data["type"]
    .explode()
    .value_counts()          # sorted by descending count; NaN entries are dropped
    .rename_axis("type")     # index holds the answer type
    .to_frame("freq")        # single column holding the count
)
answer_type_frequencies.head()

Unnamed: 0_level_0,freq
type,Unnamed: 1_level_1
dbo:Agent,4179
boolean,2799
dbo:Person,2713
dbo:Place,2244
dbo:Location,2244


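We want to train our model to predict the answer type of a question. Because a question can carry several types, we label each question with the most frequent of its types and use that single label as the training target.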

In [31]:
# Mapping {answer type: frequency}, ordered from most to least frequent
answer_type_frequencies_sorted = (
    answer_type_frequencies
    .sort_values(by="freq", ascending=False)["freq"]
    .to_dict()
)

In [33]:
def Extract_Most_Frequent_Answer_Type(types, answer_type_frequencies_sorted):
    """Return the answer type from ``types`` that has the highest frequency.

    Parameters
    ----------
    types : container of str
        Answer types attached to one question (anything supporting ``in``).
    answer_type_frequencies_sorted : dict
        Mapping ``{answer type: frequency}``. It does not have to be
        pre-sorted; the highest-frequency match is selected here.

    Returns
    -------
    str or None
        The member of ``types`` with the largest frequency, or ``None`` when
        none of the known answer types occurs in ``types`` (for example when
        ``types`` is empty).
    """
    # Iterate the mapping (not ``types``) so that, on equal frequencies, the
    # type listed first in the mapping wins — max() keeps the first maximum.
    candidates = [
        answer_type
        for answer_type in answer_type_frequencies_sorted
        if answer_type in types
    ]
    return max(candidates, key=answer_type_frequencies_sorted.get, default=None)

In [34]:
# Label every training question with the most frequent of its answer types
train_data["freq_type"] = train_data["type"].apply(
    Extract_Most_Frequent_Answer_Type,
    args=(answer_type_frequencies_sorted,),
)
train_data.head()

Unnamed: 0,id,question,category,type,freq_type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean],boolean
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]",dbo:Work
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date],date
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean],boolean
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,...",dbo:Agent


In [35]:
# Label every test question the same way, reusing the same frequency mapping
test_data["freq_type"] = test_data["type"].apply(
    Extract_Most_Frequent_Answer_Type,
    args=(answer_type_frequencies_sorted,),
)
test_data.head()

Unnamed: 0,id,question,category,type,freq_type
0,dbpedia_16015,How many ingredients are in the grain} ?,literal,[number],number
1,dbpedia_3885,Is the case fatality rate of Fournier gangrene...,boolean,[boolean],boolean
2,dbpedia_12907,Does the shelf life of spinach equal 8?,boolean,[boolean],boolean
3,dbpedia_7955,What sound does a pig make in the French langu...,literal,[string],string
4,dbpedia_2376,When was Fergie completed his record label in ...,literal,[date],date


Check for null values and remove them

In [37]:
# Count the questions for which no frequent type could be assigned (freq_type is null)
print("Training Frequency Type nullvalues: ", train_data.freq_type.isnull().sum())
print("Testing Frequency Type nullvalues: ", test_data.freq_type.isnull().sum())

Training Frequency Type nullvalues:  16
Testing Frequency Type nullvalues:  0


In [38]:
# Keep only the rows that received a freq_type. The explicit copy makes the
# result an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
train_data = train_data[train_data.freq_type.notnull()].copy()
print("Training Frequency Type nullvalues: ", train_data.freq_type.isnull().sum())

Training Frequency Type nullvalues:  0


Next, we extend the category column: for questions in the literal category, the category is replaced by the literal type (`freq_type`); all other categories are kept unchanged.

In [40]:
def Extend_Categories(row):
    """Return the extended category of one question row.

    Parameters
    ----------
    row : object with ``category`` and ``freq_type`` attributes
        One row of the question frame.

    Returns
    -------
    The row's ``freq_type`` when its category is ``"literal"``; otherwise the
    row's ``category`` unchanged.
    """
    is_literal = row.category == "literal"
    return row.freq_type if is_literal else row.category

In [45]:
# Compute the extended category row by row, then show five random literal rows as a spot check.
# sample(5) is called without a random_state, so the displayed rows differ between runs.
train_data["ext_category"] = train_data.apply(Extend_Categories, axis=1)
train_data.loc[train_data.category == "literal", ["category", "freq_type", "ext_category"]].sample(5)

Unnamed: 0,category,freq_type,ext_category
493,literal,number,number
13925,literal,date,date
8685,literal,date,date
11866,literal,date,date
1854,literal,string,string


In [46]:
# Same extension for the test data, followed by a spot check on five random literal rows.
# sample(5) is called without a random_state, so the displayed rows differ between runs.
test_data["ext_category"] = test_data.apply(Extend_Categories, axis=1)
test_data.loc[test_data.category == "literal", ["category", "freq_type", "ext_category"]].sample(5)

Unnamed: 0,category,freq_type,ext_category
425,literal,number,number
549,literal,date,date
3463,literal,number,number
3215,literal,string,string
2738,literal,number,number


### Frequent types prediction

In [55]:
# Column used as the prediction target in this section
target = "freq_type"
# Map every label occurring in either split to an integer id, numbered in order of
# first appearance. pd.concat replaces DataFrame.append, which is no longer
# available in pandas 2.x.
types_dict = {t: i for i, t in enumerate(pd.concat([train_data, test_data])[target].unique())}

In [56]:
# Inputs are the raw question strings; targets are the integer ids from types_dict.
# Looking labels up via __getitem__ raises KeyError for a label missing from the mapping.
X_train = train_data["question"]
Y_train = train_data[target].map(types_dict.__getitem__)

X_test = test_data["question"]
Y_test = test_data[target].map(types_dict.__getitem__)

In [58]:
# TF-IDF vectorisation followed by an RBF-kernel SVM, bundled so that the
# vectoriser is fitted together with the classifier on the training questions.
pipeline_type = Pipeline(steps=[
    ("vect", TfidfVectorizer()),
    ("clf", SVC(kernel="rbf", probability=True)),
])
pipeline_type.fit(X_train, Y_train)

Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', SVC(probability=True))])

In [59]:
# Score the fitted pipeline on the held-out test questions
pipeline_type.score(X_test, Y_test)

0.7945674503538005