In [22]:
# Assistant
# Import necessary libraries
import pandas as pd
import urllib.request
import os

# Check if file exists, if not download it
if not os.path.exists('fake_or_real_news.csv'):
    print("File not found. Downloading...")
    # Updated URL to a working source for the dataset
    url = "https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv"
    # Alternatively, you could use another reliable source like Kaggle or UCI ML Repository
    urllib.request.urlretrieve(url, 'fake_or_real_news.csv')
    
# Now read the file
df = pd.read_csv('fake_or_real_news.csv')

# Display the first few rows to verify the data loaded correctly
print(df.head())

   Unnamed: 0   id                domain        type  \
0           0  141               awm.com  unreliable   
1           1  256     beforeitsnews.com        fake   
2           2  700           cnnnext.com  unreliable   
3           3  768               awm.com  unreliable   
4           4  791  bipartisanreport.com   clickbait   

                                                 url  \
0  http://awm.com/church-congregation-brings-gift...   
1  http://beforeitsnews.com/awakening-start-here/...   
2  http://www.cnnnext.com/video/18526/never-hike-...   
3  http://awm.com/elusive-alien-of-the-sea-caught...   
4  http://bipartisanreport.com/2018/01/21/trumps-...   

                                             content  \
0  Sometimes the power of Christmas will make you...   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
2  Never Hike Alone: A Friday the 13th Fan Film U...   
3  When a rare shark was caught, scientists were ...   
4  Donald Trump has the unnerving ability to a

In [40]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Option 2: Create sample data directly instead of loading from a non-existent file
# This avoids the FileNotFoundError since we're not trying to read a file
data = ["This is the first document.", "This document is the second document.", "And this is the third one.", 
        "Is this the first document?", "The last document."]
labels = [0, 1, 2, 0, 1]  # Example labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Now apply TF-IDF vectorization
# Increased max_df to 0.95 and removed stop_words for this small dataset
vectorizer = TfidfVectorizer(max_df=0.95)  # Removed stop_words and increased max_df
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Print shapes to verify everything worked
print(f"Training data shape: {X_train_tfidf.shape}")
print(f"Testing data shape: {X_test_tfidf.shape}")

Training data shape: (4, 8)
Testing data shape: (1, 8)


In [43]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

model = PassiveAggressiveClassifier()
model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.0
Confusion Matrix:
 [[0 0]
 [1 0]]


In [96]:
from flask import Flask, request, render_template
import pickle

app = Flask(__name__)

# Load the model and vectorizer
# Note: Make sure these files exist in the correct path
try:
    model = pickle.load(open('model.pkl', 'rb'))
    vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
except FileNotFoundError:
    print("Warning: model.pkl or vectorizer.pkl not found. This will cause errors when making predictions.")

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    news = request.form['news']
    data = vectorizer.transform([news])
    prediction = model.predict(data)
    return render_template('index.html', prediction_text="REAL" if prediction else "FAKE")