# SeMantic AnsweR Type prediction task

In [67]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score

In [110]:
# Read the DBpedia training split (JSON) into a DataFrame and preview its first rows.
train_data = pd.read_json(
    "./datasets/DBpedia/smarttask_dbpedia_train.json"
)
train_data.head(5)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."


In [111]:
# Read the DBpedia test split (JSON) into a DataFrame and preview its first rows.
test_data = pd.read_json(
    "./datasets/DBpedia/smarttask_dbpedia_test.json"
)
test_data.head(5)

Unnamed: 0,id,question,category,type
0,dbpedia_16015,How many ingredients are in the grain} ?,literal,[number]
1,dbpedia_3885,Is the case fatality rate of Fournier gangrene...,boolean,[boolean]
2,dbpedia_12907,Does the shelf life of spinach equal 8?,boolean,[boolean]
3,dbpedia_7955,What sound does a pig make in the French langu...,literal,[string]
4,dbpedia_2376,When was Fergie completed his record label in ...,literal,[date]


## Preprocessing

In [112]:
# Number of training questions per answer category.
train_data["category"].value_counts()

resource    9584
literal     5188
boolean     2799
Name: category, dtype: int64

### Remove null values

In [113]:
# Discard every training row that has a missing value in any column,
# then count the remaining nulls per column to confirm none are left.
train_data = train_data.dropna(axis=0, how="any")
train_data.isna().sum()

id          0
question    0
category    0
type        0
dtype: int64

In [114]:
# Per-column count of missing values in the test split (nothing is dropped here).
test_data.isna().sum()

id          0
question    0
category    0
type        0
dtype: int64

## Category Prediction

In [60]:
def ExtractCategoryData(df, test=False):
    """Select the columns used for the category-prediction task.

    Parameters
    ----------
    df : DataFrame
        Must contain the ``id`` and ``question`` columns, and also
        ``category`` unless ``test`` is truthy.
    test : bool, default False
        When truthy, the ``category`` label column is left out.

    Returns
    -------
    DataFrame
        ``df`` restricted to ``id``, ``question`` and, when ``test`` is
        falsy, ``category`` (in that order).
    """
    wanted_columns = ["id", "question"]
    if not test:
        wanted_columns.append("category")
    return df[wanted_columns]

In [61]:
# Keep only the id, question and category columns of the training frame.
train_category_data = ExtractCategoryData(train_data, test=False)
train_category_data.head(5)

Unnamed: 0,id,question,category
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource


In [32]:
# Learn the TF-IDF vocabulary from the training questions and vectorise them;
# the category column is the classification target.
feature_extraction = TfidfVectorizer()
X_train = feature_extraction.fit_transform(train_category_data["question"])
Y_train = train_category_data["category"]

In [35]:
# Fit an RBF-kernel SVM (with probability estimates enabled) on the
# TF-IDF question features to predict the answer category.
clf = SVC(kernel='rbf', probability=True)
clf.fit(X=X_train, y=Y_train)

SVC(probability=True)

### Evaluation

In [63]:
# Vectorise the test questions with the vocabulary fitted on the training
# questions, then predict a category for each of them.
X_test = feature_extraction.transform(test_data["question"])
predictions = clf.predict(X_test)

In [64]:
# Ground-truth categories of the test questions.
Y_test = test_data["category"]

In [65]:
# Display the labels returned by clf.predict for the test questions.
predictions

array(['literal', 'boolean', 'boolean', ..., 'resource', 'resource',
       'literal'], dtype=object)

In [66]:
# Percentage of test questions whose predicted category matches the label.
# accuracy_score expects (y_true, y_pred); the value is the same either way
# for accuracy, but the documented order keeps the call correct if it is
# ever replaced by an asymmetric metric.
print("SVM Accuracy Score -> ", accuracy_score(Y_test, predictions)*100)

SVM Accuracy Score ->  93.35768089477288


## Type Prediction

### Preprocessing

In [95]:
from sklearn.preprocessing import MultiLabelBinarizer

In [115]:
# Preview the first two training rows, including the type column.
train_data.head(n=2)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"


In [117]:
# Preview the first two test rows, including the type column.
test_data.head(n=2)

Unnamed: 0,id,question,category,type
0,dbpedia_16015,How many ingredients are in the grain} ?,literal,[number]
1,dbpedia_3885,Is the case fatality rate of Fournier gangrene...,boolean,[boolean]


In [119]:
# Count how often each individual type label occurs across all training
# questions. explode() emits one row per list element, which avoids
# constructing a separate pandas Series for every row the way
# apply(pd.Series).stack() does.
unique_types = train_data["type"].explode().value_counts()
unique_types

dbo:Agent               4179
boolean                 2799
dbo:Person              2713
dbo:Place               2244
dbo:Location            2244
                        ... 
dbo:Bone                   1
dbo:Motorcycle             1
dbo:HistoricBuilding       1
dbo:Zoo                    1
dbo:RaceTrack              1
Length: 310, dtype: int64

In [124]:
# Type labels that occur fewer than 100 times in the training data.
is_rare = unique_types.lt(100)
unique_types.loc[is_rare]

dbo:Band                98
dbo:PersonFunction      98
dbo:Building            97
dbo:Profession          97
dbo:Artist              94
                        ..
dbo:Bone                 1
dbo:Motorcycle           1
dbo:HistoricBuilding     1
dbo:Zoo                  1
dbo:RaceTrack            1
Length: 270, dtype: int64

In [109]:
# Convert the lists in the type column to a comma-separated string.
def _join_type_labels(labels):
    """Join a list of type labels with commas; return any non-list value unchanged."""
    if isinstance(labels, list):
        return ','.join(map(str, labels))
    return labels

# The conversion is applied element-wise to each frame independently, so:
#  - it does not rely on a row labelled 0 still existing in train_data
#    (rows may have been removed by dropna(), leaving gaps in the index);
#  - re-running the cell is harmless, because already-joined strings
#    pass through untouched;
#  - test_data is converted regardless of the current state of train_data.
train_data["type"] = train_data["type"].apply(_join_type_labels)
test_data["type"] = test_data["type"].apply(_join_type_labels)

train_data.head(2)

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,boolean
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"dbo:Opera,dbo:MusicalWork,dbo:Work"


### Training

In [125]:
# Classifier intended for type prediction: an RBF-kernel SVM with
# probability estimates enabled. It is only instantiated here; the fit
# call below is left disabled.
# NOTE(review): X_train / Y_train as built earlier hold the TF-IDF features
# and the *category* labels, so enabling this line as-is would not train on
# the type column. Confirm the intended targets before enabling it.
clf_type = SVC(kernel='rbf', probability=True)
# clf_type.fit(X_train, Y_train)

### Evaluation

In [None]:
# Vectorise the test questions and predict labels for them.
# NOTE(review): this cell sits under "Type Prediction" but calls `clf`, the
# category classifier fitted earlier, rather than `clf_type`, so the
# result is again the category prediction. Confirm whether that is intended.
X_test = feature_extraction.transform(test_data["question"])
predictions = clf.predict(X_test)

In [None]:
# Reference labels for evaluation: the category column of the test split.
Y_test = test_data["category"]

In [None]:
# Display the labels returned by clf.predict for the test questions.
predictions

array(['literal', 'boolean', 'boolean', ..., 'resource', 'resource',
       'literal'], dtype=object)

In [None]:
# Percentage of test rows where the prediction equals the reference label.
# accuracy_score expects (y_true, y_pred); the value is the same either way
# for accuracy, but the documented order keeps the call correct if it is
# ever replaced by an asymmetric metric.
# NOTE(review): Y_test is the category column and predictions come from
# `clf`, so this figure measures category accuracy, not type accuracy.
print("SVM Accuracy Score -> ", accuracy_score(Y_test, predictions)*100)

SVM Accuracy Score ->  93.35768089477288
