In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import joblib


In [3]:
# Load dataset
data = pd.read_csv('article_level_data.csv')

# Display the first few rows
print(data.head())

# Ensure you have the columns 'article' and 'class'


   Unnamed: 0                                            article  class
0           0  NLP is a multidisciplinary field that draws fr...      0
1           1  There are a variety of emerging applications f...      0
2           2  As each new means of communication and social ...      0
3           3  These suggestions include:, Learn about the pu...      0
4           4  In recent years there has been growing concern...      0


In [4]:
# Download stopwords if not already done
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:

def preprocess_text(text):
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\d', ' ', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

data['article'] = data['article'].apply(preprocess_text)


In [6]:
data['article'] = data['article'].apply(preprocess_text)

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,article,class
0,0,nlp multidisciplinary field draws linguistics ...,0
1,1,variety emerging applications nlp including fo...,0
2,2,new means communication social interaction int...,0
3,3,suggestions include learn purpose newsgroup po...,0
4,4,recent years growing concern internet users ma...,0


In [16]:
# Split data into features and target
X = data['article']
y = data['class']  # Assuming 'class' is the column containing the labels


In [17]:
# Convert text to numerical features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(X).toarray()

In [18]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [20]:
# Evaluate model
train_accuracy = rf_model.score(X_train, y_train)
test_accuracy = rf_model.score(X_test, y_test)

In [21]:
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.9987714987714987
Test Accuracy: 0.8872549019607843


In [22]:
# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8872549019607843
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.88        96
           1       0.90      0.88      0.89       108

    accuracy                           0.89       204
   macro avg       0.89      0.89      0.89       204
weighted avg       0.89      0.89      0.89       204



In [23]:
# Save the model and vectorizer
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']