In [None]:
# %% [markdown]
# # Fake News Classifier
# ## Model Training Notebook

# %%
# Install required packages


# %%
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# Download NLTK resources
# Corpora needed by the stop-word list and the WordNet lemmatizer used below.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# %%
# Load dataset

# NOTE(review): absolute, machine-specific path — point this at your own copy
# of Fake.csv (ideally a path relative to the notebook) before running.
DATA_PATH = "C:/Users/Dell/Desktop/streamlit for the fake news/Fake.csv"
SAMPLE_SIZE = 10000  # Smaller sample for demo
SAMPLE_SEED = 42     # fixed so every run draws the same rows

df = pd.read_csv(DATA_PATH)
# Cap the sample at len(df): DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# %%
# Preprocessing
# Module-level helpers read by preprocess_text(); the stop words are kept in a
# set so the per-token membership test is a hash lookup.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text, *, lemmatize=None, stopword_set=None):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, split on whitespace, drop stop words, lemmatize what is left.

    Args:
        text: Raw document. Non-string values are converted with ``str()``;
            ``None`` and NaN (a missing cell in a pandas column) are treated
            as an empty document.
        lemmatize: Optional callable mapping a token to its lemma. Defaults to
            the module-level ``lemmatizer.lemmatize``.
        stopword_set: Optional container of tokens to discard. Defaults to the
            module-level ``stop_words``.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).
    """
    # A missing cell would otherwise be stringified to the literal token
    # "nan" / "none" and end up in the vocabulary. NaN is the only float that
    # is not equal to itself, so no extra import is needed to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    # Characters are deleted, not replaced by a space: "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return ' '.join(
        lemmatize(word) for word in text.split() if word not in stopword_set
    )

# Column names the pipeline reads from `df`.
# NOTE(review): the recorded run of this notebook stopped at this step with
# KeyError: 'content', so the loaded CSV presumably uses different column
# names — confirm against the file and adjust these two constants.
TEXT_COLUMN = 'content'
LABEL_COLUMN = 'type'

missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in df.columns]
if missing_columns:
    # Fail before the slow text cleaning, and report what is actually there.
    raise KeyError(
        f"Required column(s) {missing_columns} not found; "
        f"available columns: {list(df.columns)}"
    )

df['cleaned_text'] = df[TEXT_COLUMN].apply(preprocess_text)

# %%
# Train-test split
# Binary target: 1 for rows labelled 'reliable', 0 for everything else.
labels = (df[LABEL_COLUMN] == 'reliable').astype(int)
if labels.nunique() < 2:
    # A classifier cannot be fitted on a single class; say so here instead of
    # failing later inside model.fit().
    raise ValueError(
        f"Column {LABEL_COLUMN!r} yields only one class after encoding; "
        "the data must contain both 'reliable' and non-'reliable' rows."
    )

X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,  # keep the class ratio the same in both splits
)

# %%
# Vectorization
# The vectorizer is fitted on the training split only and then applied to the
# test split, so the test documents do not influence the vocabulary or weights.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# %%
# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# %%
# Evaluation
preds = model.predict(X_test_tfidf)
print(classification_report(y_test, preds))

# %%
# Save artifacts
# Both the classifier and the fitted vectorizer are written to the current
# working directory; new text must go through the same vectorizer before it
# is passed to the model.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# %%
# Visualizations
# The report frame has one row per metric; .iloc[:-1] drops the last row
# (presumably 'support', so raw counts do not dominate the colour scale) and
# .T puts one class/average per heatmap row.
report_frame = pd.DataFrame(classification_report(y_test, preds, output_dict=True))
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(report_frame.iloc[:-1, :].T, annot=True, ax=ax)
ax.set_title('Classification Report Heatmap')
plt.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


KeyError: 'content'

In [None]:
# %% [markdown]
# # Fake News Classifier
# ## Model Training Notebook

# %%
# Install required packages


# %%
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# Download NLTK resources
# Corpora needed by the stop-word list and the WordNet lemmatizer used below.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# %%
# Load dataset

# NOTE(review): absolute, machine-specific path — point this at your own copy
# of Fake.csv (ideally a path relative to the notebook) before running.
DATA_PATH = "C:/Users/Dell/Desktop/streamlit for the fake news/Fake.csv"
SAMPLE_SIZE = 10000  # Smaller sample for demo
SAMPLE_SEED = 42     # fixed so every run draws the same rows

df = pd.read_csv(DATA_PATH)
# Cap the sample at len(df): DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# %%
# Preprocessing
# Module-level helpers read by preprocess_text(); the stop words are kept in a
# set so the per-token membership test is a hash lookup.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text, *, lemmatize=None, stopword_set=None):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, split on whitespace, drop stop words, lemmatize what is left.

    Args:
        text: Raw document. Non-string values are converted with ``str()``;
            ``None`` and NaN (a missing cell in a pandas column) are treated
            as an empty document.
        lemmatize: Optional callable mapping a token to its lemma. Defaults to
            the module-level ``lemmatizer.lemmatize``.
        stopword_set: Optional container of tokens to discard. Defaults to the
            module-level ``stop_words``.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).
    """
    # A missing cell would otherwise be stringified to the literal token
    # "nan" / "none" and end up in the vocabulary. NaN is the only float that
    # is not equal to itself, so no extra import is needed to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    # Characters are deleted, not replaced by a space: "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return ' '.join(
        lemmatize(word) for word in text.split() if word not in stopword_set
    )

# Column names the pipeline reads from `df`.
# NOTE(review): the recorded run of this notebook stopped at this step with
# KeyError: 'content', so the loaded CSV presumably uses different column
# names — confirm against the file and adjust these two constants.
TEXT_COLUMN = 'content'
LABEL_COLUMN = 'type'

missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in df.columns]
if missing_columns:
    # Fail before the slow text cleaning, and report what is actually there.
    raise KeyError(
        f"Required column(s) {missing_columns} not found; "
        f"available columns: {list(df.columns)}"
    )

df['cleaned_text'] = df[TEXT_COLUMN].apply(preprocess_text)

# %%
# Train-test split
# Binary target: 1 for rows labelled 'reliable', 0 for everything else.
labels = (df[LABEL_COLUMN] == 'reliable').astype(int)
if labels.nunique() < 2:
    # A classifier cannot be fitted on a single class; say so here instead of
    # failing later inside model.fit().
    raise ValueError(
        f"Column {LABEL_COLUMN!r} yields only one class after encoding; "
        "the data must contain both 'reliable' and non-'reliable' rows."
    )

X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,  # keep the class ratio the same in both splits
)

# %%
# Vectorization
# The vectorizer is fitted on the training split only and then applied to the
# test split, so the test documents do not influence the vocabulary or weights.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# %%
# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# %%
# Evaluation
preds = model.predict(X_test_tfidf)
print(classification_report(y_test, preds))

# %%
# Save artifacts
# Both the classifier and the fitted vectorizer are written to the current
# working directory; new text must go through the same vectorizer before it
# is passed to the model.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# %%
# Visualizations
# The report frame has one row per metric; .iloc[:-1] drops the last row
# (presumably 'support', so raw counts do not dominate the colour scale) and
# .T puts one class/average per heatmap row.
report_frame = pd.DataFrame(classification_report(y_test, preds, output_dict=True))
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(report_frame.iloc[:-1, :].T, annot=True, ax=ax)
ax.set_title('Classification Report Heatmap')
plt.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


KeyError: 'content'

In [None]:
# %% [markdown]
# # Fake News Classifier
# ## Model Training Notebook

# %%
# Install required packages


# %%
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# Download NLTK resources
# Corpora needed by the stop-word list and the WordNet lemmatizer used below.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# %%
# Load dataset

# NOTE(review): absolute, machine-specific path — point this at your own copy
# of Fake.csv (ideally a path relative to the notebook) before running.
DATA_PATH = "C:/Users/Dell/Desktop/streamlit for the fake news/Fake.csv"
SAMPLE_SIZE = 10000  # Smaller sample for demo
SAMPLE_SEED = 42     # fixed so every run draws the same rows

df = pd.read_csv(DATA_PATH)
# Cap the sample at len(df): DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# %%
# Preprocessing
# Module-level helpers read by preprocess_text(); the stop words are kept in a
# set so the per-token membership test is a hash lookup.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text, *, lemmatize=None, stopword_set=None):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, split on whitespace, drop stop words, lemmatize what is left.

    Args:
        text: Raw document. Non-string values are converted with ``str()``;
            ``None`` and NaN (a missing cell in a pandas column) are treated
            as an empty document.
        lemmatize: Optional callable mapping a token to its lemma. Defaults to
            the module-level ``lemmatizer.lemmatize``.
        stopword_set: Optional container of tokens to discard. Defaults to the
            module-level ``stop_words``.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).
    """
    # A missing cell would otherwise be stringified to the literal token
    # "nan" / "none" and end up in the vocabulary. NaN is the only float that
    # is not equal to itself, so no extra import is needed to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    # Characters are deleted, not replaced by a space: "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return ' '.join(
        lemmatize(word) for word in text.split() if word not in stopword_set
    )

# Column names the pipeline reads from `df`.
# NOTE(review): the recorded run of this notebook stopped at this step with
# KeyError: 'content', so the loaded CSV presumably uses different column
# names — confirm against the file and adjust these two constants.
TEXT_COLUMN = 'content'
LABEL_COLUMN = 'type'

missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in df.columns]
if missing_columns:
    # Fail before the slow text cleaning, and report what is actually there.
    raise KeyError(
        f"Required column(s) {missing_columns} not found; "
        f"available columns: {list(df.columns)}"
    )

df['cleaned_text'] = df[TEXT_COLUMN].apply(preprocess_text)

# %%
# Train-test split
# Binary target: 1 for rows labelled 'reliable', 0 for everything else.
labels = (df[LABEL_COLUMN] == 'reliable').astype(int)
if labels.nunique() < 2:
    # A classifier cannot be fitted on a single class; say so here instead of
    # failing later inside model.fit().
    raise ValueError(
        f"Column {LABEL_COLUMN!r} yields only one class after encoding; "
        "the data must contain both 'reliable' and non-'reliable' rows."
    )

X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,  # keep the class ratio the same in both splits
)

# %%
# Vectorization
# The vectorizer is fitted on the training split only and then applied to the
# test split, so the test documents do not influence the vocabulary or weights.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# %%
# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# %%
# Evaluation
preds = model.predict(X_test_tfidf)
print(classification_report(y_test, preds))

# %%
# Save artifacts
# Both the classifier and the fitted vectorizer are written to the current
# working directory; new text must go through the same vectorizer before it
# is passed to the model.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# %%
# Visualizations
# The report frame has one row per metric; .iloc[:-1] drops the last row
# (presumably 'support', so raw counts do not dominate the colour scale) and
# .T puts one class/average per heatmap row.
report_frame = pd.DataFrame(classification_report(y_test, preds, output_dict=True))
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(report_frame.iloc[:-1, :].T, annot=True, ax=ax)
ax.set_title('Classification Report Heatmap')
plt.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


KeyError: 'content'

In [None]:
# %% [markdown]
# # Fake News Classifier
# ## Model Training Notebook

# %%
# Install required packages


# %%
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# Download NLTK resources
# Corpora needed by the stop-word list and the WordNet lemmatizer used below.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# %%
# Load dataset

# NOTE(review): absolute, machine-specific path — point this at your own copy
# of Fake.csv (ideally a path relative to the notebook) before running.
DATA_PATH = "C:/Users/Dell/Desktop/streamlit for the fake news/Fake.csv"
SAMPLE_SIZE = 10000  # Smaller sample for demo
SAMPLE_SEED = 42     # fixed so every run draws the same rows

df = pd.read_csv(DATA_PATH)
# Cap the sample at len(df): DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# %%
# Preprocessing
# Module-level helpers read by preprocess_text(); the stop words are kept in a
# set so the per-token membership test is a hash lookup.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text, *, lemmatize=None, stopword_set=None):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, split on whitespace, drop stop words, lemmatize what is left.

    Args:
        text: Raw document. Non-string values are converted with ``str()``;
            ``None`` and NaN (a missing cell in a pandas column) are treated
            as an empty document.
        lemmatize: Optional callable mapping a token to its lemma. Defaults to
            the module-level ``lemmatizer.lemmatize``.
        stopword_set: Optional container of tokens to discard. Defaults to the
            module-level ``stop_words``.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).
    """
    # A missing cell would otherwise be stringified to the literal token
    # "nan" / "none" and end up in the vocabulary. NaN is the only float that
    # is not equal to itself, so no extra import is needed to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    # Characters are deleted, not replaced by a space: "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return ' '.join(
        lemmatize(word) for word in text.split() if word not in stopword_set
    )

# Column names the pipeline reads from `df`.
# NOTE(review): the recorded run of this notebook stopped at this step with
# KeyError: 'content', so the loaded CSV presumably uses different column
# names — confirm against the file and adjust these two constants.
TEXT_COLUMN = 'content'
LABEL_COLUMN = 'type'

missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in df.columns]
if missing_columns:
    # Fail before the slow text cleaning, and report what is actually there.
    raise KeyError(
        f"Required column(s) {missing_columns} not found; "
        f"available columns: {list(df.columns)}"
    )

df['cleaned_text'] = df[TEXT_COLUMN].apply(preprocess_text)

# %%
# Train-test split
# Binary target: 1 for rows labelled 'reliable', 0 for everything else.
labels = (df[LABEL_COLUMN] == 'reliable').astype(int)
if labels.nunique() < 2:
    # A classifier cannot be fitted on a single class; say so here instead of
    # failing later inside model.fit().
    raise ValueError(
        f"Column {LABEL_COLUMN!r} yields only one class after encoding; "
        "the data must contain both 'reliable' and non-'reliable' rows."
    )

X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,  # keep the class ratio the same in both splits
)

# %%
# Vectorization
# The vectorizer is fitted on the training split only and then applied to the
# test split, so the test documents do not influence the vocabulary or weights.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# %%
# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# %%
# Evaluation
preds = model.predict(X_test_tfidf)
print(classification_report(y_test, preds))

# %%
# Save artifacts
# Both the classifier and the fitted vectorizer are written to the current
# working directory; new text must go through the same vectorizer before it
# is passed to the model.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# %%
# Visualizations
# The report frame has one row per metric; .iloc[:-1] drops the last row
# (presumably 'support', so raw counts do not dominate the colour scale) and
# .T puts one class/average per heatmap row.
report_frame = pd.DataFrame(classification_report(y_test, preds, output_dict=True))
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(report_frame.iloc[:-1, :].T, annot=True, ax=ax)
ax.set_title('Classification Report Heatmap')
plt.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


KeyError: 'content'