In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import pandas as pd
import ast
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
df_reddit = pd.read_csv("reddit_data.csv", engine='python', on_bad_lines='skip')
df_kaggle = pd.read_csv("kaggle_data.csv", engine='python', on_bad_lines='skip')

# LR Models

### TF-IDF

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy}")



Accuracy: 0.3223850766155896


In [8]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy}")



Accuracy: 0.23939160413709187


### CountVec

In [3]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=100)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy}")



Accuracy: 0.3046272182181576


In [3]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy}")



Accuracy: 0.23200973433380653


# NB Models

### TF-IDF

In [3]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Reddit Accuracy: {accuracy}")

Reddit Accuracy: 0.28872266973532795


In [4]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Kaggle Accuracy: {accuracy}")

Kaggle Accuracy: 0.22865138917055364


### CountVec

In [5]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.312815698625159


In [6]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.24041370918677754
