In [1]:
import pandas as pd
import os

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [8]:
train1 = pd.read_csv('/home/andrei/Work/data/notes.csv') #2
train2 = pd.read_csv('/home/andrei/Work/data/location.csv') #0
train3 = pd.read_csv('/home/andrei/Work/data/plans.csv') #1
df = train1.append(train2).append(train3)

In [4]:
train1

Unnamed: 0,text,class
0,The train is delayed.,2
1,The train is on time.,2
2,The trains are all running late.,2
3,The flights are all delayed.,2
4,The parcel was lost in transit.,2
...,...,...
146,The website has crashed.,2
147,The robots are working perfectly.,2
148,The studies have been a success.,2
149,The exposed parts are vulnerable to damage.,2


In [9]:
df["text_lower"]  = [text.lower() for text in df["text"]]

In [10]:
df

Unnamed: 0,text,class,text_lower
0,The train is delayed.,2,the train is delayed.
1,The train is on time.,2,the train is on time.
2,The trains are all running late.,2,the trains are all running late.
3,The flights are all delayed.,2,the flights are all delayed.
4,The parcel was lost in transit.,2,the parcel was lost in transit.
...,...,...,...
395,How can I get rid of the wasps?,1,how can i get rid of the wasps?
396,What time should I make a cocktail?,1,what time should i make a cocktail?
397,What are my plans for July?,1,what are my plans for july?
398,When is the family reunion?,1,when is the family reunion?


In [11]:
len(df)


951

In [12]:
#vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
vectorizer = CountVectorizer(min_df= 1, ngram_range=(1, 1))
final_features = vectorizer.fit_transform(df['text_lower']).toarray()
final_features.shape
len(sorted(list(vectorizer.vocabulary_)))

1294

In [13]:
X = df['text_lower']
Y = df['class']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

In [15]:
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k='all')), #1200
                     ('clf', LogisticRegression(random_state=0))])

In [16]:
model = pipeline.fit(X_train, y_train) #build model
#print(os.getcwd())


In [17]:
saved_model = os.getcwd() + "/" + "model_get_type.pkl"
with open(saved_model, 'wb') as file:
    pickle.dump(model, file)
#with open(saved_model, 'rb') as file:
#    model = pickle.load(file)
ytest = np.array(y_test)

In [18]:
print(classification_report(ytest, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       103
           1       0.98      0.98      0.98        99
           2       0.94      0.86      0.90        36

    accuracy                           0.97       238
   macro avg       0.96      0.94      0.95       238
weighted avg       0.97      0.97      0.97       238



In [19]:
print(confusion_matrix(ytest, model.predict(X_test)))

[[102   0   1]
 [  1  97   1]
 [  3   2  31]]


In [20]:
len(df)

951

In [45]:
test = ['My parking spot is F34'.lower()]
#test = ['Did I do anything last September?']
model.predict(test)
#model.predict_proba(test) #time 1 loc 0

array([2])

In [217]:
test = [line.strip("\n") for line in open(os.getcwd()+"/"+"plans4_test") if line!='\n']
test

['Make raspberry jelly today.',
 'Show favourite film at film night.',
 'Dig over unused area of vegetable patch next.',
 'Tidy all parts of shed.',
 'Wear tallest elaborate wig today.',
 'Make carbonara sauce for pasta.',
 'Put a tall, evergreen shrub near fence.',
 'Pair a strong blue cheese with the wine.',
 'Buy a practical and well-made bicycle.',
 'Audition for all the best parts.',
 'Turn up radiator before getting out of bed.',
 'Prune plum tree after threat of frost has passed.',
 'Start planning party straight away.',
 'Visit new art exhibition Tuesday afternoon.',
 'Switch off radio when leaving room.',
 'Restoration scheduled to begin end of March.',
 'Next available appointment 23rd March.',
 'Show due to start filming 2nd May.',
 'Deadline for applications in two weeks.',
 'House viewing booked for Monday 5th April.',
 'Report due for publication in September.',
 "My sister's flight is at 8 pm.",
 'Grandmother due on stage 9.30 pm.',
 'New table due to be delivered this a

In [19]:
#test = ["when is the meeting" , "when is smth"]

#model.predict(test)
model.predict_proba(test) #time 1 loc 0

array([[0.8471124, 0.1528876]])

In [19]:
final_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
len(final_features)

400

In [21]:
final_features[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
final_features[2]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,