In [1]:
import nltk
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.neural_network import MLPClassifier
import statistics

# English stop-word list, held as a set so the membership test in remove_stop_words is a hash lookup.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))
# Single shared lemmatizer instance, reused by remove_stop_words for every token.
lemmatizer = WordNetLemmatizer()
import warnings
# Silences every warning for the rest of the notebook, which also hides library deprecation notices.
warnings.filterwarnings('ignore')

In [2]:
# Maps the integer class predicted by the models to a branch name.
# NOTE(review): presumably mirrors the encoding of the CSV's `label` column — confirm.
pred_to_label = dict(enumerate(['csai', 'cse', 'ece']))

In [3]:
# Read the labelled sentences, then discard the first CSV column.
# NOTE(review): the dropped column is presumably a saved row index — confirm.
raw_df = pd.read_csv('ai_assignment5_data.csv')
df = raw_df.drop(columns=raw_df.columns[0])

In [4]:
def remove_stop_words(s):
    """Drop stop words from `s` and lemmatize the remaining tokens.

    The sentence is split with `word_tokenize`; tokens present in the
    module-level `stop_words` set are discarded and every other token is
    passed through `lemmatizer.lemmatize`. The membership test is
    case-sensitive, so callers lowercase the text before calling.

    Returns the surviving lemmas joined by single spaces.
    """
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(s)
        if token not in stop_words
    ]
    return ' '.join(kept_lemmas)

# Using Bag of Words Implementation with Naive Bayes and MLP Classifier

In [5]:
# Lowercase, stop-word-filter and lemmatize every training sentence, in row order.
corpus = [remove_stop_words(text.lower()) for text in df['text']]

In [6]:
# Bag-of-words representation: vocabulary learned from `corpus`, one row of term counts per sentence.
count_vectorizer = CountVectorizer()
count_corpus = count_vectorizer.fit_transform(corpus)

# TF-IDF representation fitted on the same corpus; kept separate so each model family can be trained on both.
tfidf_vectorizer = TfidfVectorizer()
tfidf_corpus = tfidf_vectorizer.fit_transform(corpus)

In [7]:
# Show the learned vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so use its
# replacement when the installed version provides it and fall back to the old name otherwise.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()
feature_names

['advanced',
 'ai',
 'algorithm',
 'artificial',
 'career',
 'computer',
 'cv',
 'data',
 'database',
 'dbms',
 'deep',
 'dsa',
 'eld',
 'fascinated',
 'intelligence',
 'interest',
 'language',
 'learning',
 'lie',
 'like',
 'machine',
 'management',
 'ml',
 'natural',
 'nlp',
 'processing',
 'programming',
 'pursue',
 'really',
 'signal',
 'sn',
 'structure',
 'system',
 'vision',
 'vlsi',
 'want',
 'wireless']

In [8]:
# (n_sentences, vocabulary_size), read directly from the sparse matrix without building a dense copy.
count_corpus.shape

(100, 37)

In [9]:
# (n_sentences, vocabulary_size), read directly from the sparse matrix without building a dense copy.
tfidf_corpus.shape

(100, 37)

In [10]:
# Naive Bayes on the bag-of-words counts.
mnb1 = MultinomialNB()
mnb1.fit(count_corpus, df['label'])

# Naive Bayes on the TF-IDF weights.
mnb2 = MultinomialNB()
mnb2.fit(tfidf_corpus, df['label'])

# MLPClassifier starts from random weights, so without a fixed seed a re-run can
# produce different networks and therefore a different final vote.
MLP_SEED = 42

# One network per representation: mlp1 <-> counts, mlp2 <-> TF-IDF.
mlp1 = MLPClassifier(max_iter=300, random_state=MLP_SEED).fit(count_corpus, df['label'])
mlp2 = MLPClassifier(max_iter=300, random_state=MLP_SEED).fit(tfidf_corpus, df['label'])


## Using POS Tagging 

In [11]:
def get_cgpa(s, default='7'):
    """Extract the CGPA mentioned in sentence `s`.

    The sentence is tokenized and POS-tagged with NLTK; the first token that is
    tagged 'CD' (cardinal number) AND parses as a number is taken to be the CGPA.
    Requiring a numeric token keeps spelled-out counts such as "two" from being
    returned as the CGPA.

    Parameters:
        s: the (lowercased) user sentence.
        default: value returned when no numeric token is found.

    Returns:
        The matching token as a string (e.g. '9.9'), or `default`.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(s))

    for token, tag in tagged_tokens:
        if tag != 'CD':   # only cardinal numbers are CGPA candidates
            continue
        try:
            float(token)
        except ValueError:
            # Tagged as a number but not written as one; keep looking.
            continue
        return token

    return default

In [12]:
raw_text = input("Tell about your academic subjects and your cgpa: ").lower()

# The CGPA is read from the full lowercased sentence; the classifiers see the cleaned version.
cgpa = get_cgpa(raw_text)
inp = [remove_stop_words(raw_text)]   # the vectorizers expect an iterable of documents

# Project the single sentence into each fitted feature space.
inp1 = count_vectorizer.transform(inp).toarray()
inp2 = tfidf_vectorizer.transform(inp).toarray()

Tell about your academic subjects and your cgpa: i like studying data structures and algorithms and my grade point is 9.9


In [13]:
# Each Naive Bayes model scores the representation it was trained on.
pred1 = mnb1.predict(inp1)[0]
pred2 = mnb2.predict(inp2)[0]

count_label = str(pred_to_label[pred1])
tfidf_label = str(pred_to_label[pred2])
print("Prediction using count vectorizer (Naive Bayes): " + count_label)
print("Prediction using tfidf vectorizer (Naive Bayes): " + tfidf_label)

Prediction using count vectorizer (Naive Bayes): cse
Prediction using tfidf vectorizer (Naive Bayes): cse


In [14]:
# Each network must score the representation it was trained on:
# mlp1 was fitted on count vectors, mlp2 on TF-IDF vectors.
pred3 = mlp1.predict(inp1)[0]
pred4 = mlp2.predict(inp2)[0]   # was mlp1: the count-trained network was being fed TF-IDF input

print("Prediction using count vectorizer (ANN): "+str(pred_to_label[pred3]))
print("Prediction using tfidf vectorizer (ANN): "+str(pred_to_label[pred4]))

Prediction using count vectorizer (ANN): cse
Prediction using tfidf vectorizer (ANN): cse


## Final Prediction

In [15]:
# Majority vote over the four classifier outputs.
final_ans = [pred1, pred2, pred3, pred4]

# NOTE(review): with four voters a 2-2 tie is possible; how statistics.mode resolves a tie
# presumably depends on the Python version — confirm before relying on it.
maxi = statistics.mode(final_ans)

print("Final prediction: "+pred_to_label[maxi])

Final prediction: cse


In [16]:
# `os` is needed by the file-handling cells below.
import os
# Displays the kernel's working directory; the cells below use absolute paths, so this is informational only.
os.getcwd()

'C:\\Users\\Bhavya'

In [17]:
# Remove the output file left by a previous run, if there is one.
output_path = "C:/Users/Bhavya/ai_assigment5.txt"

if not os.path.exists(output_path):
    print("The file does not exist")
else:
    os.remove(output_path)
    print("Deleted")

Deleted


In [18]:
# Open (create/truncate) the output file for the predicted fact.
# The handle stays open across the following cells and is only released by the
# explicit file1.close() at the end of the notebook.
file1 = open("C:/Users/Bhavya/ai_assigment5.txt","w+")

In [19]:
# Build the fact `interest(<branch>,<cgpa>).` from the voted label and the extracted CGPA.
# NOTE(review): the format looks like a Prolog fact for a downstream consumer — confirm.
fact_args = ','.join([pred_to_label[maxi], cgpa])
to_write = 'interest(' + fact_args + ').'
file1.write(to_write)

18

In [20]:
# Echo the exact text that was written to the output file.
print(to_write)

interest(cse,9.9).


In [21]:
# Close the handle opened earlier; if an earlier cell fails, this cell never runs and the handle is left open.
file1.close()