# Approach 1

In [50]:
import nltk
import csv
import pandas as pd
import numpy as np

In [51]:
# Importing the dataset
data = pd.read_csv("training_data.tsv", delimiter = '\t', encoding = 'utf-8')

In [52]:
# Checking out any random sentence from the training data-set
sentence = data['sent'][60]
sentence

'Remind me to carry a bottle when i go out at 2.00 pm'

### Textblob:
**Textblob is a library which has ready-made function to extract noun-phrases**

In [53]:
from textblob import TextBlob

print ("TEXTBLOB")
blob = TextBlob(sentence)

for np in blob.noun_phrases:
    print (np)

TEXTBLOB
remind


**As we can see that the results are very unsatisfactory, so we cannot use this approach. We will now use Approach 2 in which we will build a Support Vector Machine Model**

# Approach 2

## Model Building

#### Part 1: Training and testing the dataset

In [79]:
#importing the required libraries and packages
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
import re
import pickle
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
#importing the dataset
df=pd.read_csv("training_data.tsv",delimiter='\t')
df=df.dropna()
y=df['label']
X=df['sent']

In [56]:
#Processing Data of X
x=list()
for _ in X:
    t=re.findall('[a-z]+',_.lower())
    t=[word for word in t if not word in set(stopwords.words('english'))]
    x.append(" ".join(t))
x=pd.Series(x)

In [61]:
#Processing data of y
Y=list()
for _ in y:
    if _ =="Not Found":
        Y.append("Not Found")
    else:
        Y.append("Found")
y=pd.Series(Y)

In [62]:
#splitting data for train(75%) and test(25%)
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=123)

In [63]:
#Making list of x
x_train_list=list()
for i in X_train:
    x_train_list.append(i)
x_test_list=list()
for j in X_test:
    x_test_list.append(j)

In [64]:
#Vectorizing x
vectorizer = CountVectorizer(max_features=1000) # n-grams Bag of word
train_data = vectorizer.fit_transform(x_train_list) # expects a list of strings
np.asarray(train_data)
test_data = vectorizer.fit_transform(x_test_list) # expects a list of strings
np.asarray(test_data)

array(<2455x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 9256 stored elements in Compressed Sparse Row format>, dtype=object)

In [65]:
#Model Building (Support Vector Machine)
classifier=svm.LinearSVC()
classifier=classifier.fit(train_data,y_train)

In [66]:
#Saving Model in pickle
f=open("svm_model.pickle",'wb')
pickle.dump(classifier, f)
f.close()

In [77]:
#Predicting results
result=classifier.predict(test_data)
result_list=list(result)

In [80]:
#evaluating the model
cfm=confusion_matrix(y_test,result)
print(cfm)

print("Classification report: ")


print(classification_report(y_test,result))

acc=accuracy_score(y_test,result)
print("Accuracy of the model: ",acc)


[[1183  295]
 [ 603  374]]
Classification report: 
              precision    recall  f1-score   support

       Found       0.66      0.80      0.72      1478
   Not Found       0.56      0.38      0.45       977

   micro avg       0.63      0.63      0.63      2455
   macro avg       0.61      0.59      0.59      2455
weighted avg       0.62      0.63      0.62      2455

Accuracy of the model:  0.6342158859470468


#### Part 2: Testing the 'svm_model' on 'eval_data.text'

In [81]:
import pickle
import re
from sklearn.feature_extraction.text import CountVectorizer

In [82]:
#Loading trained model
f=open("svm_model.pickle",'rb')
clf=pickle.load(f)

In [83]:
#Loading eval file
eval_data=open("eval_data.txt",'r')
X=list()
all_words=list()
for lines in eval_data:
    all_words+=(lines.strip('\n').split(' '))
    X.append(lines.strip('\n'))
eval_data.close()
all_words=list(set(all_words))

In [84]:
#Vectorizing and converting to array: X_test
vectorizer=CountVectorizer(max_features=1000)
X_test=vectorizer.fit_transform(X).toarray()

In [85]:
#Predicting Result (found/not found)
result=list(clf.predict(X_test))

In [86]:
common_words_in_reminder=['i','me','at','to','date','time','for','tommorow','tonight','today',\
                           'sunday','monday','tuesday','wednesday','thursday','friday','saturday',\
                           'morning','evening']

In [87]:
#For found, extract result text
def extract(index):
    #Create the extractor function
    temp=X[index].lower()
    t=re.findall('remi[a-z]+ me? to? (.+?) at',temp)
    if len(t)==0:
        t=re.findall('rem[a-z]+ me? to? (.+?) on',temp)
    if len(t)==0:
        t=temp
        t+=' .y'
        t=re.findall('remi[\w.]+ (.+?) {}'.format(t.split(" ")[len(t.split(" "))-1]),t)
    if len(t)>0:
        t=str(t[0]).split()    
        t=[word for word in t if not word in common_words_in_reminder]
    if len(t)>0:
        result[index]=" ".join(t)
    else:
        result[index]="Not Found"

for i in range(len(X)):
    if(result[i]=="Found"):
        extract(i)

In [88]:
#Saving the final result
file=open("final_result_eval.tsv",'w')
file.write("sent\tlabel\n")
for i in range(len(result)):
    file.write(str(X[i])+'\t'+str(result[i])+'\n')
file.close()