In [13]:
import pandas as pd
import re
import string
import nltk
from nltk.stem import PorterStemmer
import joblib
import json

# Text preprocessing
def wordopt(text):
    """Normalize raw article text before tokenization.

    The cleaning steps run in a fixed order, and the order determines the
    output, so it should stay in sync with the cleaning used when the
    vectorizer was fitted (TODO confirm against the training code):

    1. lowercase
    2. drop URLs (``http://`` / ``https://`` / ``www.`` up to the next space)
    3. drop ``[...]`` bracketed spans
    4. replace every non-word character with a single space
    5. drop ``<...>`` tags
    6. drop ASCII punctuation
    7. drop newlines
    8. drop every word that contains a digit

    Step 4 already turns ``<``, ``>`` and newlines into spaces, so steps 5
    and 7 never match at that point, and the only punctuation character left
    for step 6 is the underscore. They are kept so the pipeline stays
    recognisably the same. Runs of spaces are not collapsed.

    Args:
        text: The raw text to clean. Must be a ``str``.

    Returns:
        The cleaned, lower-cased text.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\S, \W, \w, \d ...) from being read
    # as invalid string escapes, which newer Python versions warn about.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Stemmer instance shared by every preprocess() call.
ps = PorterStemmer()

# Stopword collection consulted by preprocess(). The encoding is passed
# explicitly so the JSON file is decoded as UTF-8 on every platform instead
# of with the locale-dependent default of open().
with open('static/detector/models/stopwords-tl.json', 'r', encoding='utf-8') as f:
    stopwords = json.load(f)

def preprocess(text):
    """Turn raw text into a list of stemmed tokens with stopwords removed.

    The text is cleaned with ``wordopt``, split with ``nltk.word_tokenize``,
    and each token is stemmed with the module-level ``ps``. A stem is dropped
    when it appears in ``stopwords``; the membership check is made on the
    stem, not on the original token.

    Args:
        text: The raw text to process.

    Returns:
        A list of the remaining stems, in their original order.
    """
    cleaned = wordopt(text)
    stemmed = map(ps.stem, nltk.word_tokenize(cleaned))
    return [word for word in stemmed if word not in stopwords]

# Restore the fitted TF-IDF vectorizer and the trained MLP model, in that order.
# NOTE(review): joblib artifacts are pickle-based, so only load files that
# come from a trusted source.
vectorizer, model = (
    joblib.load(artifact_path)
    for artifact_path in (
        'static/detector/models/vectorizer_mlp.joblib',
        'static/detector/models/stack_mlp.joblib',
    )
)

# Function to predict the label for a given text
def predict(text):
    """Classify a single piece of text and return its label as 1 or 0.

    The text is run through ``preprocess``, the resulting tokens are joined
    with single spaces, and the joined string is vectorized with the
    module-level ``vectorizer``. ``model.predict_proba`` then scores it.

    Args:
        text: The raw text to classify.

    Returns:
        1 when the second probability column is strictly greater than the
        first, otherwise 0 (a tie therefore yields 0).
    """
    document = ' '.join(preprocess(text))
    # transform() takes a collection of documents, hence the one-item list.
    features = vectorizer.transform([document])
    scores = model.predict_proba(features)[0]
    if scores[1] > scores[0]:
        return 1
    return 0



In [15]:
# Read the training articles, score every row with predict(), and keep only
# the columns of interest, ordered as: label, prediction, article.
df = (
    pd.read_csv('static/detector/datasets/train_data.csv')
    .assign(prediction=lambda frame: frame['article'].apply(predict))
    .loc[:, ['label', 'prediction', 'article']]
)

# Write the scored frame next to the input data, without the index column.
df.to_csv('static/detector/datasets/train_data_prediksyon.csv', index=False)

print("Predictions added and saved to new CSV file.")

Predictions added and saved to new CSV file.
