In [38]:
import pandas as pd

In [39]:
df=pd.read_csv('/content/drive/MyDrive/blogtext.csv',nrows=200000)

In [40]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [41]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Initial Pre-processing**

In [42]:
df['text'] = df['text'].str.replace('[^A-Za-z]',' ')
df['text'] = df['text'].str.lower()
df["text"] = df["text"].str.strip()
df["text"] = df["text"].str.split()  

In [43]:
df["text"]

0         [info, has, been, found, pages, and, mb, of, p...
1         [these, are, the, team, members, drewes, van, ...
2         [in, het, kader, van, kernfusie, op, aarde, ma...
3                                        [testing, testing]
4         [thanks, to, yahoo, s, toolbar, i, can, now, c...
                                ...                        
199995    [needing, longing, yearning, wishing, waiting,...
199996    [i, m, not, out, to, hurt, him, i, tore, his, ...
199997    [without, a, strong, commitment, to, diversity...
199998    [urllink, educating, for, a, diverse, america,...
199999    [the, whole, point, of, the, liberal, revoluti...
Name: text, Length: 200000, dtype: object

**Stopword Removal**

In [44]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
df.text = df.text.apply(lambda x: ' '.join([word for word in x if word not in stopwords]))

In [45]:
df["text"]

0         info found pages mb pdf files wait untill team...
1         team members drewes van der laag urllink mail ...
2         het kader van kernfusie op aarde maak je eigen...
3                                           testing testing
4         thanks yahoo toolbar capture urls popups means...
                                ...                        
199995     needing longing yearning wishing waiting wanting
199996    hurt tore world apart walks around broken hear...
199997    without strong commitment diversity world lead...
199998    urllink educating diverse america summit sympo...
199999    whole point liberal revolution gave rise free ...
Name: text, Length: 200000, dtype: object

**Creating Label**

In [46]:

df['age'] = df['age'].astype(str)


df['labels'] = df.apply(lambda col : [col['gender'],col['age'],col['topic'],col['sign']], axis=1)

merged_data=df.drop(labels =['date','gender', 'age','topic','sign','id'], axis = 1)
merged_data.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


In [47]:
merged_data['labels'].astype('str').value_counts()

['male', '35', 'Technology', 'Aries']         2298
['female', '17', 'Student', 'Capricorn']      2039
['female', '17', 'indUnk', 'Aquarius']        1930
['male', '36', 'Fashion', 'Aries']            1616
['male', '17', 'Technology', 'Taurus']        1552
                                              ... 
['female', '47', 'Consulting', 'Scorpio']        1
['female', '25', 'Technology', 'Aquarius']       1
['male', '43', 'Technology', 'Cancer']           1
['female', '33', 'Arts', 'Taurus']               1
['male', '38', 'Internet', 'Taurus']             1
Name: labels, Length: 2375, dtype: int64

**Splitting Train Test**

In [48]:
from sklearn.model_selection import train_test_split
X = merged_data['text']
y=merged_data['labels']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 123)

In [49]:
X_test.shape

(40000,)

In [50]:
y_test.shape

(40000,)

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range = (1,2), 
                             min_df = 0.15, max_df = 0.8, max_features = 100)

In [52]:

X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)
X_train_dtm

<160000x34 sparse matrix of type '<class 'numpy.int64'>'
	with 1165366 stored elements in Compressed Sparse Row format>

In [53]:
vectorizer.get_feature_names()[:10]

['back', 'could', 'day', 'even', 'first', 'get', 'go', 'going', 'good', 'got']

In [54]:
pd.DataFrame(X_train_dtm.toarray(), columns = vectorizer.get_feature_names())

Unnamed: 0,back,could,day,even,first,get,go,going,good,got,know,last,like,make,much,new,one,people,really,right,say,see,something,still,things,think,time,today,urllink,want,way,well,work,would
0,5,3,8,1,4,14,3,7,1,17,8,0,9,0,4,0,1,0,6,2,2,3,4,2,1,4,3,1,0,0,3,2,2,4
1,0,0,0,1,0,1,2,0,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,2,3,1,1,0,1,1,1,0,1,3,0,3,1,0,0,1,0,1,1,0,0,2,0,1,0,0,0,0,1,1,1,0,2
159996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
159997,0,0,0,0,0,10,2,0,1,0,0,0,0,1,0,3,0,0,0,0,0,2,1,0,0,0,1,0,0,1,0,0,0,0
159998,0,0,1,0,1,0,5,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,0,1,0,1


In [55]:
dfT = df[['gender', 'age', 'topic', 'sign']]
dfT['age'] = dfT['age'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [74]:
dfT

Unnamed: 0,gender,age,topic,sign
0,male,15,Student,Leo
1,male,15,Student,Leo
2,male,15,Student,Leo
3,male,15,Student,Leo
4,male,33,InvestmentBanking,Aquarius
...,...,...,...,...
199995,female,27,indUnk,Libra
199996,female,27,indUnk,Libra
199997,female,40,Accounting,Scorpio
199998,female,40,Accounting,Scorpio


In [56]:
keys=[] 
values=[] 

for i in range(dfT.shape[1]): # iterate through all the colummns        
    for j in range(dfT.iloc[:,i].value_counts().shape[0]): # iterate through all the rows of value_counts of that column
        keys.append(dfT.iloc[:,i].value_counts().index[j])         
        values.append(dfT.iloc[:,i].value_counts().iloc[j])

In [57]:
dictionary = dict(zip(keys,values))

In [58]:
print(dictionary)

{'male': 103992, 'female': 96008, '24': 25763, '17': 23768, '16': 20146, '23': 19494, '25': 17699, '26': 16942, '27': 15669, '15': 11431, '14': 7624, '35': 6593, '34': 5344, '36': 4957, '13': 4935, '33': 4520, '37': 3058, '40': 1666, '43': 1426, '39': 1396, '42': 1355, '38': 1230, '41': 1190, '45': 1126, '48': 1012, '46': 935, '47': 503, '44': 218, 'indUnk': 72599, 'Student': 40273, 'Technology': 16845, 'Education': 11574, 'Arts': 8687, 'Communications-Media': 6216, 'Internet': 5099, 'Engineering': 4644, 'Non-Profit': 3497, 'Government': 2894, 'Science': 2834, 'Marketing': 2195, 'Fashion': 2022, 'Consulting': 1997, 'Publishing': 1634, 'BusinessServices': 1526, 'Law': 1406, 'Religion': 1365, 'Museums-Libraries': 1168, 'Advertising': 1067, 'Military': 1044, 'Manufacturing': 915, 'Transportation': 877, 'Accounting': 787, 'Telecommunications': 739, 'Agriculture': 702, 'Banking': 701, 'HumanResources': 580, 'Sports-Recreation': 540, 'Tourism': 514, 'Biotech': 473, 'Chemicals': 472, 'Investm

In [59]:
from sklearn.preprocessing import MultiLabelBinarizer 
mlb = MultiLabelBinarizer(classes=sorted(dictionary.keys()))
y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.transform(y_test)

In [70]:
mlb.inverse_transform(y_train_mlb)[1]

('23', 'Accounting', 'Sagittarius', 'female')

In [61]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', multi_class='ovr')
ovr = OneVsRestClassifier(lr)

ovr.fit(X_train_dtm, y_train_mlb)
y_pred_ovr_test = ovr.predict(X_test_dtm)
#y_proba_ovr = ovr.predict_proba(X_test_dtm)
y_pred_ovr_test

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [62]:
y_pred_ovr_train = ovr.predict(X_train_dtm)
y_pred_ovr_train

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [63]:
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score
def print_scores(actual, predicted, averaging_type):
    print('\nAVERAGING TYPE==> ',averaging_type)
    print('F1 score: ',f1_score(actual,predicted, average=averaging_type))
    print('Average Precision Score: ',average_precision_score(actual,predicted, average=averaging_type))
    print('Average Recall Score: ',recall_score(actual,predicted, average=averaging_type))

In [64]:
print('--------------------------TRAIN SCORES--------------------------------')
print('Accuracy score: ',accuracy_score(y_train_mlb, y_pred_ovr_train))
print_scores(y_train_mlb, y_pred_ovr_train, 'micro')
print_scores(y_train_mlb, y_pred_ovr_train, 'macro')
print_scores(y_train_mlb, y_pred_ovr_train, 'weighted')

--------------------------TRAIN SCORES--------------------------------
Accuracy score:  0.0

AVERAGING TYPE==>  micro
F1 score:  0.225695401918789
Average Precision Score:  0.12209898736367093
Average Recall Score:  0.1413140625

AVERAGING TYPE==>  macro
F1 score:  0.013792299722609655
Average Precision Score:  0.05083496949637669
Average Recall Score:  0.014118399297249886

AVERAGING TYPE==>  weighted
F1 score:  0.13436021788415017
Average Precision Score:  0.2209429972802017
Average Recall Score:  0.1413140625


In [65]:
print('--------------------------TEST SCORES--------------------------------')
print('Accuracy score: ',accuracy_score(y_test_mlb, y_pred_ovr_test))
print_scores(y_test_mlb, y_pred_ovr_test, 'micro')
print_scores(y_test_mlb, y_pred_ovr_test, 'macro')
print_scores(y_test_mlb, y_pred_ovr_test, 'weighted')

--------------------------TEST SCORES--------------------------------
Accuracy score:  0.0

AVERAGING TYPE==>  micro
F1 score:  0.2251092618092558
Average Precision Score:  0.12161653477099614
Average Recall Score:  0.141

AVERAGING TYPE==>  macro
F1 score:  0.013706438993501291
Average Precision Score:  0.05080557061366544
Average Recall Score:  0.014090260902638424

AVERAGING TYPE==>  weighted
F1 score:  0.13403594436879326
Average Precision Score:  0.22098280750859975
Average Recall Score:  0.141


In [72]:
five_pred = y_pred_ovr_test[:5]
five_actual = y_test_mlb[:5]

In [67]:
five_actual = mlb.inverse_transform(five_actual)
five_actual

[('26', 'Sagittarius', 'Telecommunications', 'female'),
 ('26', 'Aquarius', 'Science', 'male'),
 ('24', 'BusinessServices', 'Leo', 'female'),
 ('25', 'Technology', 'Virgo', 'male'),
 ('16', 'Aries', 'Education', 'female')]

**PART 2**

In [152]:
df2=pd.read_json('GL Bot.json')

In [78]:
df2

Unnamed: 0,tag,patterns,responses,context_set
0,Intro,"[hi, how are you, is anyone there, hello, what...",[Hello! how can i help you ?],
1,Exit,"[thank you, thanks, cya, see you, later, see y...","[I hope I was able to assist you, Good Bye]",
2,Olympus,"[olympus, explain me how olympus works, I am n...",[Link: Olympus wiki],
3,SL,"[i am not able to understand svm, explain me h...",[Link: Machine Learning wiki ],
4,NN,"[what is deep learning, unable to understand d...",[Link: Neural Nets wiki],
5,Bot,"[what is your name, who are you, name please, ...",[I am your virtual learning assistant],
6,Profane,"[what the hell, bloody stupid bot, do you thin...",[Please use respectful words],
7,Ticket,"[my problem is not solved, you did not help me...",[Tarnsferring the request to your PM],


In [81]:
df_filter=df2[['patterns','responses']]

Preprocessing data

In [85]:
patterns=[]
response=[]
for i,j in zip(df2['patterns'],df2['responses']):
  patterns.extend(i)
  response.extend([j[0]]*len(i))

df_processed=pd.DataFrame()
df_processed['patterns']=patterns
df_processed['response']=response

In [86]:
df_processed

Unnamed: 0,patterns,response
0,hi,Hello! how can i help you ?
1,how are you,Hello! how can i help you ?
2,is anyone there,Hello! how can i help you ?
3,hello,Hello! how can i help you ?
4,whats up,Hello! how can i help you ?
...,...,...
123,not good solution,Tarnsferring the request to your PM
124,no help,Tarnsferring the request to your PM
125,wasted my time,Tarnsferring the request to your PM
126,useless bot,Tarnsferring the request to your PM


In [87]:
df_processed['response'].value_counts()

Link: Machine Learning wiki                  29
Link: Neural Nets wiki                       24
Hello! how can i help you ?                  20
I hope I was able to assist you, Good Bye    16
Link: Olympus wiki                           13
Please use respectful words                   9
Tarnsferring the request to your PM           9
I am your virtual learning assistant          8
Name: response, dtype: int64

In [88]:
df_filter=df_processed.copy()

df_filter['patterns'] = df_filter['patterns'].str.replace('[^A-Za-z]',' ')
df_filter['patterns'] = df_filter['patterns'].str.lower()
df_filter["patterns"] = df_filter["patterns"].str.strip()
 

In [90]:
def pre_process(text):
  text=text.replace('[^A-Za-z]',' ')
  text=text.lower()
  return text

In [89]:
df_filter

Unnamed: 0,patterns,response
0,hi,Hello! how can i help you ?
1,how are you,Hello! how can i help you ?
2,is anyone there,Hello! how can i help you ?
3,hello,Hello! how can i help you ?
4,whats up,Hello! how can i help you ?
...,...,...
123,not good solution,Tarnsferring the request to your PM
124,no help,Tarnsferring the request to your PM
125,wasted my time,Tarnsferring the request to your PM
126,useless bot,Tarnsferring the request to your PM


In [92]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [94]:

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df_filter['patterns'],df_filter['response'],test_size=0.3)

In [95]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df_filter['patterns'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

Created Classifier and testing accuracy 

In [101]:
# fit the training dataset on the NB classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Train_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Train_Y)*100)

SVM Accuracy Score ->  100.0


In [109]:
def call_model(input):
  inputp=pre_process(input)
  test=Tfidf_vect.transform([inputp])
  return SVM.predict(test)[0]

Manual Testing

In [110]:
call_model('not good solution')

'Tarnsferring the request to your PM'

In [111]:
call_model('wasted my time')

'Tarnsferring the request to your PM'

In [112]:
call_model('my time was fully wasted')

'Tarnsferring the request to your PM'

In [114]:
call_model('is anyone there, i am waiting')

'Hello! how can i help you ?'

In [184]:
call_model('thank you')

'I hope I was able to assist you, Good Bye'

In [176]:
df2['responses_p']=[str(x.strip("['']"))  for x in df2['responses']]

In [189]:

while 1:
    input= %sx read -p ''
    output=call_model(input[0])
    tag=list(df2[df2['responses_p']==output]['tag'])[0]
    if tag == 'Exit': 
        print(output)
        break
    else:
        print(output)
        continue



Hello! how can i help you ?
I am your virtual learning assistant
Tarnsferring the request to your PM
I hope I was able to assist you, Good Bye
