In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
# Load the comment/label table; the path is relative to the notebook's working directory.
csv_path = 'Reddit_Data.csv'
data = pd.read_csv(csv_path)

In [3]:
# (rows, columns) of the freshly loaded frame.
data.shape

(37249, 2)

In [4]:
# Preview the first rows: one text column (clean_comment) and one label column (category).
data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [5]:
# Make sure every comment is a Python str so the tokenizer below can process it.
# Any missing comment (read by pandas as NaN) is replaced with an empty string
# first; casting NaN directly with str() would produce the literal text "nan",
# which would then be tokenized and counted as a real word.
data['clean_comment'] = data['clean_comment'].fillna('').apply(str)

In [6]:
# Number of distinct labels in the target column.
data['category'].nunique()

3

In [7]:
# Rows per label, to see how balanced the classes are.
data['category'].value_counts()

 1    15830
 0    13142
-1     8277
Name: category, dtype: int64

In [8]:
# Column dtypes and non-null counts. This runs after clean_comment was cast to
# str, so the non-null count here cannot reveal comments that were missing in
# the CSV.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37249 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


# Data Preprocessing

In [9]:
# Lower-case every comment so that tokens differing only by case are treated alike.
lowercased_comments = data['clean_comment'].str.lower()
data['clean_comment'] = lowercased_comments

In [10]:
# Split each comment into a list of word tokens and keep it in a new 'tokens' column.
# NOTE(review): word_tokenize relies on NLTK tokenizer data being installed; no
# download step is visible in this notebook -- confirm the environment provides it.
tokenized_comments = data['clean_comment'].apply(word_tokenize)
data['tokens'] = tokenized_comments

In [11]:
# Build the English stop-word lookup once; a set keeps the membership test
# in the filtering step cheap.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

In [12]:
def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, preserving their order."""
    return [word for word in tokens if word not in stop_words]


# Filter stop words out of every row's token list.
# NOTE(review): presumably the NLTK English list contains negations such as
# "not"/"no"; removing them may blur sentiment -- verify and consider keeping them.
data['tokens'] = data['tokens'].apply(_drop_stop_words)

In [13]:
#stemmer = PorterStemmer()

In [14]:
#data['tokens'] = data['tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

In [16]:
# Single WordNet lemmatizer instance, reused for every token in the next cell.
lemmatizer = WordNetLemmatizer()

In [17]:
def _lemmatize_all(tokens):
    """Return the lemmatized form of every token, in the original order.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default part of speech applies to every token.
    """
    return list(map(lemmatizer.lemmatize, tokens))


# Replace each row's tokens with their lemmas.
data['tokens'] = data['tokens'].apply(_lemmatize_all)

In [18]:
#vectorizer = TfidfVectorizer(max_features=3000)  

In [19]:
# Bag-of-words features; the vocabulary size is capped at MAX_FEATURES terms.
MAX_FEATURES = 5000
vectorizer = CountVectorizer(max_features=MAX_FEATURES)

In [20]:
X = vectorizer.fit_transform(data['tokens'].apply(lambda tokens: ' '.join(tokens))).toarray()

# Modelling

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, data['category'], test_size=0.2, random_state=42)

In [22]:
# Multinomial Naive Bayes with its default settings.
classifier = MultinomialNB()

In [23]:
# Fit the classifier on the training split only.
classifier.fit(X_train, y_train)

In [24]:
# Predicted labels for the held-out split.
y_pred = classifier.predict(X_test)

In [25]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)

In [26]:
# Report the accuracy rounded to two decimals.
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.68


In [27]:
# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)


Classification Report:
              precision    recall  f1-score   support

          -1       0.65      0.61      0.63      1667
           0       0.81      0.55      0.66      2615
           1       0.63      0.82      0.72      3168

    accuracy                           0.68      7450
   macro avg       0.70      0.66      0.67      7450
weighted avg       0.70      0.68      0.68      7450



In [28]:
# Counts of true label (first argument) versus predicted label (second argument)
# on the held-out split.
confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(confusion)



Confusion Matrix:
[[1013  137  517]
 [ 183 1443  989]
 [ 364  195 2609]]
