Importing Libraries

In [None]:
import nltk
nltk.download('all')

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [None]:
np.random.seed(500)

Loading DATA

In [None]:
data=pd.read_csv('/content/train.csv',encoding='latin-1')

In [None]:
data

Processing Data

In [None]:
data['text'].dropna(inplace=True)
data['text'] = [entry.lower() for entry in data['text']]
data['text']= [word_tokenize(entry) for entry in data['text']]

In [None]:
data

In [None]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


In [None]:
print(tag_map)
word_Lemmatized = WordNetLemmatizer()
data['text']

In [None]:
for index, entry in enumerate(data['text']):
    # print(index, entry)
    # print("--------------------------------")
    Final_words = []
    for word, tag in pos_tag(entry):
        # print(word, tag)
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,pos = tag_map[tag[0]])
            Final_words.append(word_Final)
    data.loc[index,'text_final'] = str(Final_words)

In [None]:
data

In [None]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['text_final'],data['category'],test_size=0.3)

Converting string into numbers

In [None]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [None]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data['text_final'])

In [None]:
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [None]:
Train_Y

array([0, 0, 2, ..., 2, 1, 2])

**Training With Decision Trees**

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(Train_X_Tfidf, Train_Y)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(Test_X_Tfidf)
cm = confusion_matrix(Test_Y, y_pred)
print(cm)
accuracy_score(Test_Y, y_pred)

[[123   7  18   8   5]
 [  8  96   3   8   5]
 [  8   9  94   4  10]
 [  2   2   3 129   0]
 [ 13   6   1   5 101]]


0.812874251497006

Training with **random forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(Train_X_Tfidf, Train_Y)

In [None]:
y_pred = classifier.predict(Test_X_Tfidf)
cm = confusion_matrix(Test_Y, y_pred)
print(cm)
accuracy_score(Test_Y, y_pred)

[[149   4   3   1   4]
 [  6 109   0   3   2]
 [  7   0 115   3   0]
 [  0   0   0 136   0]
 [  9   1   1   6 109]]


0.9251497005988024

Training with **SVM**

In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(Train_X_Tfidf, Train_Y)

In [None]:
y_pred = classifier.predict(Test_X_Tfidf)
cm = confusion_matrix(Test_Y, y_pred)
print(cm)
accuracy_score(Test_Y, y_pred)

[[155   2   2   1   1]
 [  0 120   0   0   0]
 [  3   0 122   0   0]
 [  1   0   0 135   0]
 [  0   0   0   1 125]]


0.9835329341317365

Training with **Kernel SVM**

In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(Train_X_Tfidf, Train_Y)

In [None]:
y_pred = classifier.predict(Test_X_Tfidf)
cm = confusion_matrix(Test_Y, y_pred)
print(cm)
accuracy_score(Test_Y, y_pred)

[[154   2   2   1   2]
 [  0 120   0   0   0]
 [  4   0 121   0   0]
 [  1   0   0 135   0]
 [  0   0   0   1 125]]


0.9805389221556886

**The Best Model was SVM which predicted with 98.35% accuracy **

In [None]:
# New text entries for prediction
new_texts = [
    'This movie is fantastic!',
    'I like to play cricket'
]

# Vectorize the new text data
X_new = Tfidf_vect.transform(new_texts)

# Predict labels for new text entries using the SVM classifier
y_pred_new = classifier.predict(X_new)

print("Predictions:")
for text, label in zip(new_texts, y_pred_new):
    print(f'Text: {text} --> Predicted Label: {label}')


Predictions:
Text: This movie is fantastic! --> Predicted Label: 1
Text: I like to play cricket --> Predicted Label: 3
